mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-17 21:20:37 +00:00
Compare commits
388 Commits
problame/h
...
release-pr
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
03666a1f37 | ||
|
|
9c92242ca0 | ||
|
|
a354071dd0 | ||
|
|
758680d4f8 | ||
|
|
1738fd0a96 | ||
|
|
87b7edfc72 | ||
|
|
def05700d5 | ||
|
|
b547681e08 | ||
|
|
0fd211537b | ||
|
|
a83bd4e81c | ||
|
|
ecdad5e6d5 | ||
|
|
d028929945 | ||
|
|
7b0e3db868 | ||
|
|
088eb72dd7 | ||
|
|
d550e3f626 | ||
|
|
8c6b41daf5 | ||
|
|
bbb050459b | ||
|
|
cab498c787 | ||
|
|
6359342ffb | ||
|
|
13285c2a5e | ||
|
|
33790d14a3 | ||
|
|
709b8cd371 | ||
|
|
1c9bbf1a92 | ||
|
|
16163fb850 | ||
|
|
73ccc2b08c | ||
|
|
c719be6474 | ||
|
|
718645e56c | ||
|
|
fbc8c36983 | ||
|
|
5519e42612 | ||
|
|
4157eaf4c5 | ||
|
|
60241127e2 | ||
|
|
f7d5322e8b | ||
|
|
41bb9c5280 | ||
|
|
69c0d61c5c | ||
|
|
63cb8ce975 | ||
|
|
907e4aa3c4 | ||
|
|
0a2a84b766 | ||
|
|
85b12ddd52 | ||
|
|
dd76f1eeee | ||
|
|
8963ac85f9 | ||
|
|
4a488b3e24 | ||
|
|
c4987b0b13 | ||
|
|
84b4821118 | ||
|
|
32ba9811f9 | ||
|
|
a0cd64c4d3 | ||
|
|
84687b743d | ||
|
|
b6f93dcec9 | ||
|
|
4f6c594973 | ||
|
|
a750c14735 | ||
|
|
9ce0dd4e55 | ||
|
|
0e1a336607 | ||
|
|
7fc2912d06 | ||
|
|
fdf231c237 | ||
|
|
1e08b5dccc | ||
|
|
030810ed3e | ||
|
|
62b74bdc2c | ||
|
|
8b7e9ed820 | ||
|
|
5dad89acd4 | ||
|
|
547b2d2827 | ||
|
|
93f29a0065 | ||
|
|
4f36494615 | ||
|
|
0a550f3e7d | ||
|
|
4bb9554e4a | ||
|
|
008616cfe6 | ||
|
|
e61ec94fbc | ||
|
|
e5152551ad | ||
|
|
b0822a5499 | ||
|
|
1fb6ab59e8 | ||
|
|
e16439400d | ||
|
|
e401f66698 | ||
|
|
2fa461b668 | ||
|
|
03d90bc0b3 | ||
|
|
268bc890ea | ||
|
|
8a6ee79f6f | ||
|
|
9052c32b46 | ||
|
|
995e729ebe | ||
|
|
76077e1ddf | ||
|
|
0467d88f06 | ||
|
|
f5eec194e7 | ||
|
|
7e00be391d | ||
|
|
d56599df2a | ||
|
|
9d9aab3680 | ||
|
|
a202b1b5cc | ||
|
|
90f731f3b1 | ||
|
|
7736b748d3 | ||
|
|
9c23333cb3 | ||
|
|
66a99009ba | ||
|
|
5d4c57491f | ||
|
|
73935ea3a2 | ||
|
|
32e595d4dd | ||
|
|
b0d69acb07 | ||
|
|
98355a419a | ||
|
|
cfb03d6cf0 | ||
|
|
d81ef3f962 | ||
|
|
5d62c67e75 | ||
|
|
53d53d5b1e | ||
|
|
29fe6ea47a | ||
|
|
640327ccb3 | ||
|
|
7cf0f6b37e | ||
|
|
03c2c569be | ||
|
|
eff6d4538a | ||
|
|
5ef7782e9c | ||
|
|
73101db8c4 | ||
|
|
bccdfc6d39 | ||
|
|
99595813bb | ||
|
|
fe07b54758 | ||
|
|
a42d173e7b | ||
|
|
e07f689238 | ||
|
|
7831eddc88 | ||
|
|
943b1bc80c | ||
|
|
95a184e9b7 | ||
|
|
3fa17e9d17 | ||
|
|
55e0fd9789 | ||
|
|
2a88889f44 | ||
|
|
5bad8126dc | ||
|
|
27bc242085 | ||
|
|
192b49cc6d | ||
|
|
e1b60f3693 | ||
|
|
2804f5323b | ||
|
|
676adc6b32 | ||
|
|
96a4e8de66 | ||
|
|
01180666b0 | ||
|
|
6c94269c32 | ||
|
|
edc691647d | ||
|
|
855d7b4781 | ||
|
|
c49c9707ce | ||
|
|
2227540a0d | ||
|
|
f1347f2417 | ||
|
|
30b295b017 | ||
|
|
1cef395266 | ||
|
|
78d160f76d | ||
|
|
b9238059d6 | ||
|
|
d0cb4b88c8 | ||
|
|
1ec3e39d4e | ||
|
|
a1a74eef2c | ||
|
|
90e689adda | ||
|
|
f0b2d4b053 | ||
|
|
299d9474c9 | ||
|
|
7234208b36 | ||
|
|
93450f11f5 | ||
|
|
2f0f9edf33 | ||
|
|
d424f2b7c8 | ||
|
|
21315e80bc | ||
|
|
483b66d383 | ||
|
|
aa72a22661 | ||
|
|
5c0264b591 | ||
|
|
9f13277729 | ||
|
|
54aa319805 | ||
|
|
4a227484bf | ||
|
|
2f83f85291 | ||
|
|
d6cfcb0d93 | ||
|
|
392843ad2a | ||
|
|
bd4dae8f4a | ||
|
|
b05fe53cfd | ||
|
|
c13a2f0df1 | ||
|
|
39be366fc5 | ||
|
|
6eda0a3158 | ||
|
|
306c7a1813 | ||
|
|
80be423a58 | ||
|
|
5dcfef82f2 | ||
|
|
e67b8f69c0 | ||
|
|
e546872ab4 | ||
|
|
322ea1cf7c | ||
|
|
3633742de9 | ||
|
|
079d3a37ba | ||
|
|
a46e77b476 | ||
|
|
a92702b01e | ||
|
|
8ff3253f20 | ||
|
|
04b82c92a7 | ||
|
|
e5bf423e68 | ||
|
|
60af392e45 | ||
|
|
661fc41e71 | ||
|
|
702c488f32 | ||
|
|
45c5122754 | ||
|
|
558394f710 | ||
|
|
73b0898608 | ||
|
|
e65be4c2dc | ||
|
|
40087b8164 | ||
|
|
c762b59483 | ||
|
|
5d71601ca9 | ||
|
|
a113c3e433 | ||
|
|
e81fc598f4 | ||
|
|
48b845fa76 | ||
|
|
27096858dc | ||
|
|
4430d0ae7d | ||
|
|
6e183aa0de | ||
|
|
fd6d0b7635 | ||
|
|
3710c32aae | ||
|
|
be83bee49d | ||
|
|
cf28e5922a | ||
|
|
7d384d6953 | ||
|
|
4b3b37b912 | ||
|
|
1d8d200f4d | ||
|
|
0d80d6ce18 | ||
|
|
f653ee039f | ||
|
|
e614a95853 | ||
|
|
850db4cc13 | ||
|
|
8a316b1277 | ||
|
|
4d13bae449 | ||
|
|
49377abd98 | ||
|
|
a6b2f4e54e | ||
|
|
face60d50b | ||
|
|
9768aa27f2 | ||
|
|
96b2e575e1 | ||
|
|
7222777784 | ||
|
|
5469fdede0 | ||
|
|
72aa6b9fdd | ||
|
|
ae0634b7be | ||
|
|
70711f32fa | ||
|
|
52a88af0aa | ||
|
|
b7a43bf817 | ||
|
|
dce91b33a4 | ||
|
|
23ee4f3050 | ||
|
|
46857e8282 | ||
|
|
368ab0ce54 | ||
|
|
a5987eebfd | ||
|
|
6686ede30f | ||
|
|
373c7057cc | ||
|
|
7d6ec16166 | ||
|
|
0e6fdc8a58 | ||
|
|
521438a5c6 | ||
|
|
07d7874bc8 | ||
|
|
1804111a02 | ||
|
|
cd0178efed | ||
|
|
333574be57 | ||
|
|
79a799a143 | ||
|
|
9da06af6c9 | ||
|
|
ce1753d036 | ||
|
|
67db8432b4 | ||
|
|
4e2e44e524 | ||
|
|
ed786104f3 | ||
|
|
84b74f2bd1 | ||
|
|
fec2ad6283 | ||
|
|
98eebd4682 | ||
|
|
2f74287c9b | ||
|
|
aee1bf95e3 | ||
|
|
b9de9d75ff | ||
|
|
7943b709e6 | ||
|
|
d7d066d493 | ||
|
|
e78ac22107 | ||
|
|
76a8f2bb44 | ||
|
|
8d59a8581f | ||
|
|
b1ddd01289 | ||
|
|
6eae4fc9aa | ||
|
|
765455bca2 | ||
|
|
4204960942 | ||
|
|
67345d66ea | ||
|
|
2266ee5971 | ||
|
|
b58445d855 | ||
|
|
36050e7f3d | ||
|
|
33360ed96d | ||
|
|
39a28d1108 | ||
|
|
efa6aa134f | ||
|
|
2c724e56e2 | ||
|
|
feff887c6f | ||
|
|
353d915fcf | ||
|
|
2e38098cbc | ||
|
|
a6fe5ea1ac | ||
|
|
05b0aed0c1 | ||
|
|
cd1705357d | ||
|
|
6bc7561290 | ||
|
|
fbd3ac14b5 | ||
|
|
e437787c8f | ||
|
|
3460dbf90b | ||
|
|
6b89d99677 | ||
|
|
6cc8ea86e4 | ||
|
|
e62a492d6f | ||
|
|
a475cdf642 | ||
|
|
7002c79a47 | ||
|
|
ee6cf357b4 | ||
|
|
e5c2086b5f | ||
|
|
5f1208296a | ||
|
|
88e8e473cd | ||
|
|
b0a77844f6 | ||
|
|
1baf464307 | ||
|
|
e9b8e81cea | ||
|
|
85d6194aa4 | ||
|
|
333a7a68ef | ||
|
|
6aa4e41bee | ||
|
|
840183e51f | ||
|
|
cbccc94b03 | ||
|
|
fce227df22 | ||
|
|
bd787e800f | ||
|
|
4a7704b4a3 | ||
|
|
ff1119da66 | ||
|
|
4c3ba1627b | ||
|
|
1407174fb2 | ||
|
|
ec9dcb1889 | ||
|
|
d11d781afc | ||
|
|
4e44565b71 | ||
|
|
4ed51ad33b | ||
|
|
1c1ebe5537 | ||
|
|
c19cb7f386 | ||
|
|
4b97d31b16 | ||
|
|
923ade3dd7 | ||
|
|
b04e711975 | ||
|
|
afd0a6b39a | ||
|
|
99752286d8 | ||
|
|
15df93363c | ||
|
|
bc0ab741af | ||
|
|
51d9dfeaa3 | ||
|
|
f63cb18155 | ||
|
|
0de603d88e | ||
|
|
240913912a | ||
|
|
91a4ea0de2 | ||
|
|
8608704f49 | ||
|
|
efef68ce99 | ||
|
|
8daefd24da | ||
|
|
46cc8b7982 | ||
|
|
38cd90dd0c | ||
|
|
a51b269f15 | ||
|
|
43bf6d0a0f | ||
|
|
15273a9b66 | ||
|
|
78aca668d0 | ||
|
|
acbf4148ea | ||
|
|
6508540561 | ||
|
|
a41b5244a8 | ||
|
|
2b3189be95 | ||
|
|
248563c595 | ||
|
|
14cd6ca933 | ||
|
|
eb36403e71 | ||
|
|
3c6f779698 | ||
|
|
f67f0c1c11 | ||
|
|
edb02d3299 | ||
|
|
664a69e65b | ||
|
|
478322ebf9 | ||
|
|
802f174072 | ||
|
|
47f9890bae | ||
|
|
262265daad | ||
|
|
300da5b872 | ||
|
|
7b22b5c433 | ||
|
|
ffca97bc1e | ||
|
|
cb356f3259 | ||
|
|
c85374295f | ||
|
|
4992160677 | ||
|
|
bd535b3371 | ||
|
|
d90c5a03af | ||
|
|
2d02cc9079 | ||
|
|
49ad94b99f | ||
|
|
948a217398 | ||
|
|
125381eae7 | ||
|
|
cd01bbc715 | ||
|
|
d8b5e3b88d | ||
|
|
06d25f2186 | ||
|
|
f759b561f3 | ||
|
|
ece0555600 | ||
|
|
73ea0a0b01 | ||
|
|
d8f6d6fd6f | ||
|
|
d24de169a7 | ||
|
|
0816168296 | ||
|
|
277b44d57a | ||
|
|
68c2c3880e | ||
|
|
49da498f65 | ||
|
|
2c76ba3dd7 | ||
|
|
dbe3dc69ad | ||
|
|
8e5bb3ed49 | ||
|
|
ab0be7b8da | ||
|
|
b4c55f5d24 | ||
|
|
ede70d833c | ||
|
|
70c3d18bb0 | ||
|
|
7a491f52c4 | ||
|
|
323c4ecb4f | ||
|
|
3d2466607e | ||
|
|
ed478b39f4 | ||
|
|
91585a558d | ||
|
|
93467eae1f | ||
|
|
f3aac81d19 | ||
|
|
979ad60c19 | ||
|
|
9316cb1b1f | ||
|
|
e7939a527a | ||
|
|
36d26665e1 | ||
|
|
873347f977 | ||
|
|
e814ac16f9 | ||
|
|
ad3055d386 | ||
|
|
94e03eb452 | ||
|
|
380f26ef79 | ||
|
|
3c5b7f59d7 | ||
|
|
fee89f80b5 | ||
|
|
41cce8eaf1 | ||
|
|
f88fe0218d | ||
|
|
cc856eca85 | ||
|
|
cf350c6002 | ||
|
|
0ce6b6a0a3 | ||
|
|
73f247d537 | ||
|
|
960be82183 | ||
|
|
806e5a6c19 | ||
|
|
8d5df07cce | ||
|
|
df7a9d1407 |
47
.github/workflows/build_and_test.yml
vendored
47
.github/workflows/build_and_test.yml
vendored
@@ -728,6 +728,30 @@ jobs:
|
||||
tags: |
|
||||
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }}
|
||||
|
||||
- name: Build compute-tools image
|
||||
# compute-tools are Postgres independent, so build it only once
|
||||
# We pick 16, because that builds on debian 11 with older glibc (and is
|
||||
# thus compatible with newer glibc), rather than 17 on Debian 12, as
|
||||
# that isn't guaranteed to be compatible with Debian 11
|
||||
if: matrix.version.pg == 'v16'
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
target: compute-tools-image
|
||||
context: .
|
||||
build-args: |
|
||||
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
|
||||
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }}
|
||||
DEBIAN_VERSION=${{ matrix.version.debian }}
|
||||
provenance: false
|
||||
push: true
|
||||
pull: true
|
||||
file: compute/compute-node.Dockerfile
|
||||
cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
|
||||
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-tools-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }}
|
||||
tags: |
|
||||
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }}
|
||||
|
||||
compute-node-image:
|
||||
needs: [ compute-node-image-arch, tag ]
|
||||
permissions:
|
||||
@@ -770,6 +794,14 @@ jobs:
|
||||
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
|
||||
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64
|
||||
|
||||
- name: Create multi-arch compute-tools image
|
||||
if: matrix.version.pg == 'v16'
|
||||
run: |
|
||||
docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \
|
||||
-t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \
|
||||
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
|
||||
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64
|
||||
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
@@ -785,6 +817,12 @@ jobs:
|
||||
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \
|
||||
neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}
|
||||
|
||||
- name: Push multi-arch compute-tools image to ECR
|
||||
if: matrix.version.pg == 'v16'
|
||||
run: |
|
||||
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \
|
||||
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
|
||||
|
||||
vm-compute-node-image:
|
||||
needs: [ check-permissions, tag, compute-node-image ]
|
||||
runs-on: [ self-hosted, large ]
|
||||
@@ -963,6 +1001,9 @@ jobs:
|
||||
docker buildx imagetools create -t $repo/neon:latest \
|
||||
$repo/neon:${{ needs.tag.outputs.build-tag }}
|
||||
|
||||
docker buildx imagetools create -t $repo/compute-tools:latest \
|
||||
$repo/compute-tools:${{ needs.tag.outputs.build-tag }}
|
||||
|
||||
for version in ${VERSIONS}; do
|
||||
docker buildx imagetools create -t $repo/compute-node-${version}:latest \
|
||||
$repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }}
|
||||
@@ -991,7 +1032,7 @@ jobs:
|
||||
- name: Copy all images to prod ECR
|
||||
if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
|
||||
run: |
|
||||
for image in neon {vm-,}compute-node-{v14,v15,v16,v17}; do
|
||||
for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16,v17}; do
|
||||
docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \
|
||||
369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }}
|
||||
done
|
||||
@@ -1003,7 +1044,7 @@ jobs:
|
||||
with:
|
||||
client_id: ${{ vars.AZURE_DEV_CLIENT_ID }}
|
||||
image_tag: ${{ needs.tag.outputs.build-tag }}
|
||||
images: neon vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
|
||||
images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
|
||||
registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
|
||||
subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
|
||||
tenant_id: ${{ vars.AZURE_TENANT_ID }}
|
||||
@@ -1015,7 +1056,7 @@ jobs:
|
||||
with:
|
||||
client_id: ${{ vars.AZURE_PROD_CLIENT_ID }}
|
||||
image_tag: ${{ needs.tag.outputs.build-tag }}
|
||||
images: neon vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
|
||||
images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
|
||||
registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
|
||||
subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
|
||||
tenant_id: ${{ vars.AZURE_TENANT_ID }}
|
||||
|
||||
272
Cargo.lock
generated
272
Cargo.lock
generated
@@ -718,13 +718,13 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "axum"
|
||||
version = "0.7.9"
|
||||
version = "0.7.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f"
|
||||
checksum = "3a6c9af12842a67734c9a2e355436e5d03b22383ed60cf13cd0c18fbfe3dcbcf"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"axum-core",
|
||||
"base64 0.22.1",
|
||||
"base64 0.21.1",
|
||||
"bytes",
|
||||
"futures-util",
|
||||
"http 1.1.0",
|
||||
@@ -746,8 +746,8 @@ dependencies = [
|
||||
"sha1",
|
||||
"sync_wrapper 1.0.1",
|
||||
"tokio",
|
||||
"tokio-tungstenite 0.24.0",
|
||||
"tower 0.5.2",
|
||||
"tokio-tungstenite",
|
||||
"tower",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
"tracing",
|
||||
@@ -1267,7 +1267,6 @@ dependencies = [
|
||||
"aws-config",
|
||||
"aws-sdk-kms",
|
||||
"aws-sdk-s3",
|
||||
"axum",
|
||||
"base64 0.13.1",
|
||||
"bytes",
|
||||
"camino",
|
||||
@@ -1278,7 +1277,7 @@ dependencies = [
|
||||
"fail",
|
||||
"flate2",
|
||||
"futures",
|
||||
"http 1.1.0",
|
||||
"hyper 0.14.30",
|
||||
"metrics",
|
||||
"nix 0.27.1",
|
||||
"notify",
|
||||
@@ -1304,8 +1303,6 @@ dependencies = [
|
||||
"tokio-postgres",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"tower 0.5.2",
|
||||
"tower-http",
|
||||
"tracing",
|
||||
"tracing-opentelemetry",
|
||||
"tracing-subscriber",
|
||||
@@ -1605,32 +1602,6 @@ dependencies = [
|
||||
"typenum",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "curve25519-dalek"
|
||||
version = "4.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "97fb8b7c4503de7d6ae7b42ab72a5a59857b4c937ec27a3d4539dba95b5ab2be"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"cpufeatures",
|
||||
"curve25519-dalek-derive",
|
||||
"digest",
|
||||
"fiat-crypto",
|
||||
"rustc_version",
|
||||
"subtle",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "curve25519-dalek-derive"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "darling"
|
||||
version = "0.20.1"
|
||||
@@ -1679,20 +1650,6 @@ dependencies = [
|
||||
"parking_lot_core 0.9.8",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dashmap"
|
||||
version = "6.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"crossbeam-utils",
|
||||
"hashbrown 0.14.5",
|
||||
"lock_api",
|
||||
"once_cell",
|
||||
"parking_lot_core 0.9.8",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "data-encoding"
|
||||
version = "2.4.0"
|
||||
@@ -1901,28 +1858,6 @@ dependencies = [
|
||||
"spki 0.7.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ed25519"
|
||||
version = "2.2.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "115531babc129696a58c64a4fef0a8bf9e9698629fb97e9e40767d235cfbcd53"
|
||||
dependencies = [
|
||||
"signature 2.2.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ed25519-dalek"
|
||||
version = "2.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4a3daa8e81a3963a60642bcc1f90a670680bd4a77535faa384e9d1c79d620871"
|
||||
dependencies = [
|
||||
"curve25519-dalek",
|
||||
"ed25519",
|
||||
"rand_core 0.6.4",
|
||||
"sha2",
|
||||
"subtle",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "either"
|
||||
version = "1.8.1"
|
||||
@@ -2014,15 +1949,6 @@ dependencies = [
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "env_filter"
|
||||
version = "0.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "186e05a59d4c50738528153b83b0b0194d3a29507dfec16eccd4b342903397d0"
|
||||
dependencies = [
|
||||
"log",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "env_logger"
|
||||
version = "0.10.2"
|
||||
@@ -2036,16 +1962,6 @@ dependencies = [
|
||||
"termcolor",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "env_logger"
|
||||
version = "0.11.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6c012a26a7f605efc424dd53697843a72be7dc86ad2d01f7814337794a12231d"
|
||||
dependencies = [
|
||||
"env_filter",
|
||||
"log",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "equator"
|
||||
version = "0.2.2"
|
||||
@@ -2161,12 +2077,6 @@ dependencies = [
|
||||
"subtle",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fiat-crypto"
|
||||
version = "0.2.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d"
|
||||
|
||||
[[package]]
|
||||
name = "filetime"
|
||||
version = "0.2.22"
|
||||
@@ -2810,7 +2720,7 @@ dependencies = [
|
||||
"pin-project-lite",
|
||||
"socket2",
|
||||
"tokio",
|
||||
"tower 0.4.13",
|
||||
"tower",
|
||||
"tower-service",
|
||||
"tracing",
|
||||
]
|
||||
@@ -3035,28 +2945,6 @@ dependencies = [
|
||||
"str_stack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "inferno"
|
||||
version = "0.12.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "75a5d75fee4d36809e6b021e4b96b686e763d365ffdb03af2bd00786353f84fe"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"clap",
|
||||
"crossbeam-channel",
|
||||
"crossbeam-utils",
|
||||
"dashmap 6.1.0",
|
||||
"env_logger 0.11.2",
|
||||
"indexmap 2.0.1",
|
||||
"itoa",
|
||||
"log",
|
||||
"num-format",
|
||||
"once_cell",
|
||||
"quick-xml 0.37.1",
|
||||
"rgb",
|
||||
"str_stack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "inotify"
|
||||
version = "0.9.6"
|
||||
@@ -3264,7 +3152,7 @@ version = "0.7.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4644821e1c3d7a560fe13d842d13f587c07348a1a05d3a797152d41c90c56df2"
|
||||
dependencies = [
|
||||
"dashmap 5.5.0",
|
||||
"dashmap",
|
||||
"hashbrown 0.13.2",
|
||||
]
|
||||
|
||||
@@ -3372,9 +3260,9 @@ checksum = "b87248edafb776e59e6ee64a79086f65890d3510f2c656c000bf2a7e8a0aea40"
|
||||
|
||||
[[package]]
|
||||
name = "matchit"
|
||||
version = "0.8.4"
|
||||
version = "0.8.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3"
|
||||
checksum = "540f1c43aed89909c0cc0cc604e3bb2f7e7a341a3728a9e6cfe760e733cd11ed"
|
||||
|
||||
[[package]]
|
||||
name = "md-5"
|
||||
@@ -3802,23 +3690,23 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry"
|
||||
version = "0.27.1"
|
||||
version = "0.26.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ab70038c28ed37b97d8ed414b6429d343a8bbf44c9f79ec854f3a643029ba6d7"
|
||||
checksum = "570074cc999d1a58184080966e5bd3bf3a9a4af650c3b05047c2621e7405cd17"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-sink",
|
||||
"js-sys",
|
||||
"once_cell",
|
||||
"pin-project-lite",
|
||||
"thiserror",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry-http"
|
||||
version = "0.27.0"
|
||||
version = "0.26.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "10a8a7f5f6ba7c1b286c2fbca0454eaba116f63bbe69ed250b642d36fbb04d80"
|
||||
checksum = "6351496aeaa49d7c267fb480678d85d1cd30c5edb20b497c48c56f62a8c14b99"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"bytes",
|
||||
@@ -3829,9 +3717,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry-otlp"
|
||||
version = "0.27.0"
|
||||
version = "0.26.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "91cf61a1868dacc576bf2b2a1c3e9ab150af7272909e80085c3173384fe11f76"
|
||||
checksum = "29e1f9c8b032d4f635c730c0efcf731d5e2530ea13fa8bef7939ddc8420696bd"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"futures-core",
|
||||
@@ -3847,9 +3735,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry-proto"
|
||||
version = "0.27.0"
|
||||
version = "0.26.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a6e05acbfada5ec79023c85368af14abd0b307c015e9064d249b2a950ef459a6"
|
||||
checksum = "c9d3968ce3aefdcca5c27e3c4ea4391b37547726a70893aab52d3de95d5f8b34"
|
||||
dependencies = [
|
||||
"opentelemetry",
|
||||
"opentelemetry_sdk",
|
||||
@@ -3859,21 +3747,22 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry-semantic-conventions"
|
||||
version = "0.27.0"
|
||||
version = "0.26.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bc1b6902ff63b32ef6c489e8048c5e253e2e4a803ea3ea7e783914536eb15c52"
|
||||
checksum = "db945c1eaea8ac6a9677185357480d215bb6999faa9f691d0c4d4d641eab7a09"
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry_sdk"
|
||||
version = "0.27.1"
|
||||
version = "0.26.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "231e9d6ceef9b0b2546ddf52335785ce41252bc7474ee8ba05bfad277be13ab8"
|
||||
checksum = "d2c627d9f4c9cdc1f21a29ee4bfbd6028fcb8bcf2a857b43f3abdf72c9c862f3"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"futures-channel",
|
||||
"futures-executor",
|
||||
"futures-util",
|
||||
"glob",
|
||||
"once_cell",
|
||||
"opentelemetry",
|
||||
"percent-encoding",
|
||||
"rand 0.8.5",
|
||||
@@ -3881,7 +3770,6 @@ dependencies = [
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4044,7 +3932,6 @@ dependencies = [
|
||||
"postgres_connection",
|
||||
"postgres_ffi",
|
||||
"postgres_initdb",
|
||||
"pprof",
|
||||
"pq_proto",
|
||||
"procfs",
|
||||
"rand 0.8.5",
|
||||
@@ -4531,7 +4418,7 @@ dependencies = [
|
||||
"bytes",
|
||||
"crc32c",
|
||||
"criterion",
|
||||
"env_logger 0.10.2",
|
||||
"env_logger",
|
||||
"log",
|
||||
"memoffset 0.9.0",
|
||||
"once_cell",
|
||||
@@ -4572,7 +4459,7 @@ dependencies = [
|
||||
"cfg-if",
|
||||
"criterion",
|
||||
"findshlibs",
|
||||
"inferno 0.11.21",
|
||||
"inferno",
|
||||
"libc",
|
||||
"log",
|
||||
"nix 0.26.4",
|
||||
@@ -4798,10 +4685,9 @@ dependencies = [
|
||||
"clap",
|
||||
"compute_api",
|
||||
"consumption_metrics",
|
||||
"dashmap 5.5.0",
|
||||
"dashmap",
|
||||
"ecdsa 0.16.9",
|
||||
"ed25519-dalek",
|
||||
"env_logger 0.10.2",
|
||||
"env_logger",
|
||||
"fallible-iterator",
|
||||
"flate2",
|
||||
"framed-websockets",
|
||||
@@ -4872,7 +4758,7 @@ dependencies = [
|
||||
"tokio-postgres",
|
||||
"tokio-postgres2",
|
||||
"tokio-rustls 0.26.0",
|
||||
"tokio-tungstenite 0.21.0",
|
||||
"tokio-tungstenite",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
@@ -4908,15 +4794,6 @@ dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quick-xml"
|
||||
version = "0.37.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f22f29bdff3987b4d8632ef95fd6424ec7e4e0a57e2f4fc63e489e75357f6a03"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.37"
|
||||
@@ -5301,15 +5178,15 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "reqwest-tracing"
|
||||
version = "0.5.5"
|
||||
version = "0.5.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "73e6153390585f6961341b50e5a1931d6be6dee4292283635903c26ef9d980d2"
|
||||
checksum = "ff82cf5730a1311fb9413b0bc2b8e743e0157cd73f010ab4ec374a923873b6a2"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
"getrandom 0.2.11",
|
||||
"http 1.1.0",
|
||||
"matchit 0.8.4",
|
||||
"matchit 0.8.2",
|
||||
"opentelemetry",
|
||||
"reqwest",
|
||||
"reqwest-middleware",
|
||||
@@ -6923,19 +6800,7 @@ dependencies = [
|
||||
"futures-util",
|
||||
"log",
|
||||
"tokio",
|
||||
"tungstenite 0.21.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-tungstenite"
|
||||
version = "0.24.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "edc5f74e248dc973e0dbb7b74c7e0d6fcc301c694ff50049504004ef4d0cdcd9"
|
||||
dependencies = [
|
||||
"futures-util",
|
||||
"log",
|
||||
"tokio",
|
||||
"tungstenite 0.24.0",
|
||||
"tungstenite",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -7016,7 +6881,7 @@ dependencies = [
|
||||
"tokio",
|
||||
"tokio-rustls 0.26.0",
|
||||
"tokio-stream",
|
||||
"tower 0.4.13",
|
||||
"tower",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
"tracing",
|
||||
@@ -7056,50 +6921,17 @@ dependencies = [
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tower"
|
||||
version = "0.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-util",
|
||||
"pin-project-lite",
|
||||
"sync_wrapper 1.0.1",
|
||||
"tokio",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tower-http"
|
||||
version = "0.6.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "403fa3b783d4b626a8ad51d766ab03cb6d2dbfc46b1c5d4448395e6628dc9697"
|
||||
dependencies = [
|
||||
"bitflags 2.4.1",
|
||||
"bytes",
|
||||
"http 1.1.0",
|
||||
"http-body 1.0.0",
|
||||
"pin-project-lite",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
"tracing",
|
||||
"uuid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tower-layer"
|
||||
version = "0.3.3"
|
||||
version = "0.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e"
|
||||
checksum = "c20c8dbed6283a09604c3e69b4b7eeb54e298b8a600d4d5ecb5ad39de609f1d0"
|
||||
|
||||
[[package]]
|
||||
name = "tower-service"
|
||||
version = "0.3.3"
|
||||
version = "0.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3"
|
||||
checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52"
|
||||
|
||||
[[package]]
|
||||
name = "tracing"
|
||||
@@ -7168,9 +7000,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tracing-opentelemetry"
|
||||
version = "0.28.0"
|
||||
version = "0.27.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "97a971f6058498b5c0f1affa23e7ea202057a7301dbff68e968b2d578bcbd053"
|
||||
checksum = "dc58af5d3f6c5811462cabb3289aec0093f7338e367e5a33d28c0433b3c7360b"
|
||||
dependencies = [
|
||||
"js-sys",
|
||||
"once_cell",
|
||||
@@ -7254,24 +7086,6 @@ dependencies = [
|
||||
"utf-8",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tungstenite"
|
||||
version = "0.24.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "18e5b8366ee7a95b16d32197d0b2604b43a0be89dc5fac9f8e96ccafbaedda8a"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"data-encoding",
|
||||
"http 1.1.0",
|
||||
"httparse",
|
||||
"log",
|
||||
"rand 0.8.5",
|
||||
"sha1",
|
||||
"thiserror",
|
||||
"utf-8",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "twox-hash"
|
||||
version = "1.6.3"
|
||||
@@ -7439,7 +7253,6 @@ dependencies = [
|
||||
"hex-literal",
|
||||
"humantime",
|
||||
"hyper 0.14.30",
|
||||
"inferno 0.12.0",
|
||||
"itertools 0.10.5",
|
||||
"jemalloc_pprof",
|
||||
"jsonwebtoken",
|
||||
@@ -7543,7 +7356,7 @@ dependencies = [
|
||||
"anyhow",
|
||||
"camino-tempfile",
|
||||
"clap",
|
||||
"env_logger 0.10.2",
|
||||
"env_logger",
|
||||
"log",
|
||||
"postgres",
|
||||
"postgres_ffi",
|
||||
@@ -8054,8 +7867,7 @@ dependencies = [
|
||||
"tokio-util",
|
||||
"toml_edit",
|
||||
"tonic",
|
||||
"tower 0.4.13",
|
||||
"tower 0.5.2",
|
||||
"tower",
|
||||
"tracing",
|
||||
"tracing-core",
|
||||
"url",
|
||||
|
||||
19
Cargo.toml
19
Cargo.toml
@@ -65,7 +65,7 @@ aws-smithy-types = "1.2"
|
||||
aws-credential-types = "1.2.0"
|
||||
aws-sigv4 = { version = "1.2", features = ["sign-http"] }
|
||||
aws-types = "1.3"
|
||||
axum = { version = "0.7.9", features = ["ws"] }
|
||||
axum = { version = "0.7.5", features = ["ws"] }
|
||||
base64 = "0.13.0"
|
||||
bincode = "1.3"
|
||||
bindgen = "0.70"
|
||||
@@ -110,7 +110,6 @@ hyper-util = "0.1"
|
||||
tokio-tungstenite = "0.21.0"
|
||||
indexmap = "2"
|
||||
indoc = "2"
|
||||
inferno = "0.12.0"
|
||||
ipnet = "2.10.0"
|
||||
itertools = "0.10"
|
||||
itoa = "1.0.11"
|
||||
@@ -127,10 +126,10 @@ notify = "6.0.0"
|
||||
num_cpus = "1.15"
|
||||
num-traits = "0.2.15"
|
||||
once_cell = "1.13"
|
||||
opentelemetry = "0.27"
|
||||
opentelemetry_sdk = "0.27"
|
||||
opentelemetry-otlp = { version = "0.27", default-features = false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
||||
opentelemetry-semantic-conventions = "0.27"
|
||||
opentelemetry = "0.26"
|
||||
opentelemetry_sdk = "0.26"
|
||||
opentelemetry-otlp = { version = "0.26", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
||||
opentelemetry-semantic-conventions = "0.26"
|
||||
parking_lot = "0.12"
|
||||
parquet = { version = "53", default-features = false, features = ["zstd"] }
|
||||
parquet_derive = "53"
|
||||
@@ -144,7 +143,7 @@ rand = "0.8"
|
||||
redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
|
||||
regex = "1.10.2"
|
||||
reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] }
|
||||
reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_27"] }
|
||||
reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_26"] }
|
||||
reqwest-middleware = "0.4"
|
||||
reqwest-retry = "0.7"
|
||||
routerify = "3"
|
||||
@@ -188,12 +187,10 @@ tokio-util = { version = "0.7.10", features = ["io", "rt"] }
|
||||
toml = "0.8"
|
||||
toml_edit = "0.22"
|
||||
tonic = {version = "0.12.3", features = ["tls", "tls-roots"]}
|
||||
tower = { version = "0.5.2", default-features = false }
|
||||
tower-http = { version = "0.6.2", features = ["request-id", "trace"] }
|
||||
tower-service = "0.3.3"
|
||||
tower-service = "0.3.2"
|
||||
tracing = "0.1"
|
||||
tracing-error = "0.2"
|
||||
tracing-opentelemetry = "0.28"
|
||||
tracing-opentelemetry = "0.27"
|
||||
tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
|
||||
try-lock = "0.2.5"
|
||||
twox-hash = { version = "1.6.3", default-features = false }
|
||||
|
||||
@@ -71,7 +71,6 @@ RUN set -e \
|
||||
ca-certificates \
|
||||
# System postgres for use with client libraries (e.g. in storage controller)
|
||||
postgresql-15 \
|
||||
openssl \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
|
||||
&& useradd -d /data neon \
|
||||
&& chown -R neon:neon /data
|
||||
@@ -104,6 +103,11 @@ RUN mkdir -p /data/.neon/ && \
|
||||
> /data/.neon/pageserver.toml && \
|
||||
chown -R neon:neon /data/.neon
|
||||
|
||||
# When running a binary that links with libpq, default to using our most recent postgres version. Binaries
|
||||
# that want a particular postgres version will select it explicitly: this is just a default.
|
||||
ENV LD_LIBRARY_PATH=/usr/local/v${DEFAULT_PG_VERSION}/lib
|
||||
|
||||
|
||||
VOLUME ["/data"]
|
||||
USER neon
|
||||
EXPOSE 6400
|
||||
|
||||
3
Makefile
3
Makefile
@@ -3,6 +3,7 @@ ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
|
||||
# Where to install Postgres, default is ./pg_install, maybe useful for package managers
|
||||
POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/
|
||||
|
||||
OPENSSL_PREFIX_DIR := /usr/local/openssl
|
||||
ICU_PREFIX_DIR := /usr/local/icu
|
||||
|
||||
#
|
||||
@@ -25,9 +26,11 @@ endif
|
||||
ifeq ($(shell test -e /home/nonroot/.docker_build && echo -n yes),yes)
|
||||
# Exclude static build openssl, icu for local build (MacOS, Linux)
|
||||
# Only keep for build type release and debug
|
||||
PG_CFLAGS += -I$(OPENSSL_PREFIX_DIR)/include
|
||||
PG_CONFIGURE_OPTS += --with-icu
|
||||
PG_CONFIGURE_OPTS += ICU_CFLAGS='-I/$(ICU_PREFIX_DIR)/include -DU_STATIC_IMPLEMENTATION'
|
||||
PG_CONFIGURE_OPTS += ICU_LIBS='-L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -licui18n -licuuc -licudata -lstdc++ -Wl,-Bdynamic -lm'
|
||||
PG_CONFIGURE_OPTS += LDFLAGS='-L$(OPENSSL_PREFIX_DIR)/lib -L$(OPENSSL_PREFIX_DIR)/lib64 -L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -Wl,-Bstatic -lssl -lcrypto -Wl,-Bdynamic -lrt -lm -ldl -lpthread'
|
||||
endif
|
||||
|
||||
UNAME_S := $(shell uname -s)
|
||||
|
||||
@@ -115,7 +115,7 @@ RUN set -e \
|
||||
|
||||
# Keep the version the same as in compute/compute-node.Dockerfile and
|
||||
# test_runner/regress/test_compute_metrics.py.
|
||||
ENV SQL_EXPORTER_VERSION=0.17.0
|
||||
ENV SQL_EXPORTER_VERSION=0.16.0
|
||||
RUN curl -fsSL \
|
||||
"https://github.com/burningalchemist/sql_exporter/releases/download/${SQL_EXPORTER_VERSION}/sql_exporter-${SQL_EXPORTER_VERSION}.linux-$(case "$(uname -m)" in x86_64) echo amd64;; aarch64) echo arm64;; esac).tar.gz" \
|
||||
--output sql_exporter.tar.gz \
|
||||
@@ -190,6 +190,21 @@ RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JS
|
||||
&& make install \
|
||||
&& rm -rf ../lcov.tar.gz
|
||||
|
||||
# Compile and install the static OpenSSL library
|
||||
ENV OPENSSL_VERSION=1.1.1w
|
||||
ENV OPENSSL_PREFIX=/usr/local/openssl
|
||||
RUN wget -O /tmp/openssl-${OPENSSL_VERSION}.tar.gz https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz && \
|
||||
echo "cf3098950cb4d853ad95c0841f1f9c6d3dc102dccfcacd521d93925208b76ac8 /tmp/openssl-${OPENSSL_VERSION}.tar.gz" | sha256sum --check && \
|
||||
cd /tmp && \
|
||||
tar xzvf /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \
|
||||
rm /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \
|
||||
cd /tmp/openssl-${OPENSSL_VERSION} && \
|
||||
./config --prefix=${OPENSSL_PREFIX} -static --static no-shared -fPIC && \
|
||||
make -j "$(nproc)" && \
|
||||
make install && \
|
||||
cd /tmp && \
|
||||
rm -rf /tmp/openssl-${OPENSSL_VERSION}
|
||||
|
||||
# Use the same version of libicu as the compute nodes so that
|
||||
# clusters created using inidb on pageserver can be used by computes.
|
||||
#
|
||||
@@ -243,7 +258,7 @@ WORKDIR /home/nonroot
|
||||
|
||||
# Rust
|
||||
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
|
||||
ENV RUSTC_VERSION=1.84.0
|
||||
ENV RUSTC_VERSION=1.83.0
|
||||
ENV RUSTUP_HOME="/home/nonroot/.rustup"
|
||||
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
|
||||
ARG RUSTFILT_VERSION=0.2.1
|
||||
|
||||
@@ -104,18 +104,16 @@ RUN cd postgres && \
|
||||
esac; \
|
||||
done;
|
||||
|
||||
# Set PATH for all the subsequent build steps
|
||||
ENV PATH="/usr/local/pgsql/bin:$PATH"
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "postgis-build"
|
||||
# Build PostGIS from the upstream PostGIS mirror.
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS postgis-build
|
||||
FROM build-deps AS postgis-build
|
||||
ARG DEBIAN_VERSION
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
RUN apt update && \
|
||||
apt install --no-install-recommends --no-install-suggests -y \
|
||||
gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \
|
||||
@@ -153,6 +151,8 @@ RUN case "${DEBIAN_VERSION}" in \
|
||||
DESTDIR=/sfcgal ninja install -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
ninja clean && cp -R /sfcgal/* /
|
||||
|
||||
ENV PATH="/usr/local/pgsql/bin:$PATH"
|
||||
|
||||
# Postgis 3.5.0 supports v17
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v17") \
|
||||
@@ -170,6 +170,7 @@ RUN case "${PG_VERSION}" in \
|
||||
wget https://download.osgeo.org/postgis/source/postgis-${POSTGIS_VERSION}.tar.gz -O postgis.tar.gz && \
|
||||
echo "${POSTGIS_CHECKSUM} postgis.tar.gz" | sha256sum --check && \
|
||||
mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
|
||||
./autogen.sh && \
|
||||
./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
@@ -219,7 +220,11 @@ RUN case "${PG_VERSION}" in \
|
||||
cmake -GNinja -DCMAKE_BUILD_TYPE=Release .. && \
|
||||
ninja -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
ninja -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
|
||||
cp /usr/local/pgsql/share/extension/pgrouting.control /extensions/postgis && \
|
||||
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
|
||||
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/postgis.tar.zst -T -
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -227,8 +232,9 @@ RUN case "${PG_VERSION}" in \
|
||||
# Build plv8
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS plv8-build
|
||||
FROM build-deps AS plv8-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
COPY compute/patches/plv8-3.1.10.patch /plv8-3.1.10.patch
|
||||
|
||||
@@ -263,6 +269,7 @@ RUN case "${PG_VERSION}" in \
|
||||
# generate and copy upgrade scripts
|
||||
mkdir -p upgrade && ./generate_upgrade.sh ${PLV8_TAG#v} && \
|
||||
cp upgrade/* /usr/local/pgsql/share/extension/ && \
|
||||
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||
make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
rm -rf /plv8-* && \
|
||||
find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \
|
||||
@@ -289,8 +296,9 @@ RUN case "${PG_VERSION}" in \
|
||||
# Build h3_pg
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS h3-pg-build
|
||||
FROM build-deps AS h3-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# not version-specific
|
||||
# last release v4.1.0 - Jan 18, 2023
|
||||
@@ -311,6 +319,7 @@ RUN mkdir -p /h3/usr/ && \
|
||||
RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \
|
||||
echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \
|
||||
mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . && \
|
||||
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \
|
||||
@@ -322,16 +331,17 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3
|
||||
# compile unit extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS unit-pg-build
|
||||
FROM build-deps AS unit-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# not version-specific
|
||||
# last release 7.9 - Sep 15, 2024
|
||||
RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.9.tar.gz -O postgresql-unit.tar.gz && \
|
||||
echo "e46de6245dcc8b2c2ecf29873dbd43b2b346773f31dd5ce4b8315895a052b456 postgresql-unit.tar.gz" | sha256sum --check && \
|
||||
mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
# unit extension's "create extension" script relies on absolute install path to fill some reference tables.
|
||||
# We move the extension from '/usr/local/pgsql/' to '/usr/local/' after it is build. So we need to adjust the path.
|
||||
# This one-liner removes pgsql/ part of the path.
|
||||
@@ -345,8 +355,9 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.9.tar.gz -
|
||||
# compile pgvector extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS vector-pg-build
|
||||
FROM build-deps AS vector-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
COPY compute/patches/pgvector.patch /pgvector.patch
|
||||
|
||||
@@ -360,8 +371,8 @@ RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O
|
||||
echo "867a2c328d4928a5a9d6f052cd3bc78c7d60228a9b914ad32aa3db88e9de27b0 pgvector.tar.gz" | sha256sum --check && \
|
||||
mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \
|
||||
patch -p1 < /pgvector.patch && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
|
||||
|
||||
#########################################################################################
|
||||
@@ -370,15 +381,16 @@ RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O
|
||||
# compile pgjwt extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS pgjwt-pg-build
|
||||
FROM build-deps AS pgjwt-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# not version-specific
|
||||
# doesn't use releases, last commit f3d82fd - Mar 2, 2023
|
||||
RUN wget https://github.com/michelp/pgjwt/archive/f3d82fd30151e754e19ce5d6a06c71c20689ce3d.tar.gz -O pgjwt.tar.gz && \
|
||||
echo "dae8ed99eebb7593b43013f6532d772b12dfecd55548d2673f2dfd0163f6d2b9 pgjwt.tar.gz" | sha256sum --check && \
|
||||
mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control
|
||||
|
||||
#########################################################################################
|
||||
@@ -387,16 +399,17 @@ RUN wget https://github.com/michelp/pgjwt/archive/f3d82fd30151e754e19ce5d6a06c71
|
||||
# compile hypopg extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS hypopg-pg-build
|
||||
FROM build-deps AS hypopg-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# HypoPG 1.4.1 supports v17
|
||||
# last release 1.4.1 - Apr 28, 2024
|
||||
RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.1.tar.gz -O hypopg.tar.gz && \
|
||||
echo "9afe6357fd389d8d33fad81703038ce520b09275ec00153c6c89282bcdedd6bc hypopg.tar.gz" | sha256sum --check && \
|
||||
mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control
|
||||
|
||||
#########################################################################################
|
||||
@@ -405,16 +418,17 @@ RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.1.tar.gz -O hypo
|
||||
# compile pg_hashids extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS pg-hashids-pg-build
|
||||
FROM build-deps AS pg-hashids-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# not version-specific
|
||||
# last release v1.2.1 -Jan 12, 2018
|
||||
RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \
|
||||
echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_hashids.control
|
||||
|
||||
#########################################################################################
|
||||
@@ -423,8 +437,9 @@ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz
|
||||
# compile rum extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS rum-pg-build
|
||||
FROM build-deps AS rum-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
COPY compute/patches/rum.patch /rum.patch
|
||||
|
||||
@@ -435,8 +450,8 @@ RUN wget https://github.com/postgrespro/rum/archive/cb1edffc57736cd2a4455f8d0fea
|
||||
echo "65e0a752e99f4c3226400c9b899f997049e93503db8bf5c8072efa136d32fd83 rum.tar.gz" | sha256sum --check && \
|
||||
mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . && \
|
||||
patch -p1 < /rum.patch && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control
|
||||
|
||||
#########################################################################################
|
||||
@@ -445,16 +460,17 @@ RUN wget https://github.com/postgrespro/rum/archive/cb1edffc57736cd2a4455f8d0fea
|
||||
# compile pgTAP extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS pgtap-pg-build
|
||||
FROM build-deps AS pgtap-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# pgtap 1.3.3 supports v17
|
||||
# last release v1.3.3 - Apr 8, 2024
|
||||
RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.3.3.tar.gz -O pgtap.tar.gz && \
|
||||
echo "325ea79d0d2515bce96bce43f6823dcd3effbd6c54cb2a4d6c2384fffa3a14c7 pgtap.tar.gz" | sha256sum --check && \
|
||||
mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgtap.control
|
||||
|
||||
#########################################################################################
|
||||
@@ -463,16 +479,17 @@ RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.3.3.tar.gz -O pgta
|
||||
# compile ip4r extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS ip4r-pg-build
|
||||
FROM build-deps AS ip4r-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# not version-specific
|
||||
# last release v2.4.2 - Jul 29, 2023
|
||||
RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \
|
||||
echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \
|
||||
mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/ip4r.control
|
||||
|
||||
#########################################################################################
|
||||
@@ -481,16 +498,17 @@ RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O i
|
||||
# compile Prefix extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS prefix-pg-build
|
||||
FROM build-deps AS prefix-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# not version-specific
|
||||
# last release v1.2.10 - Jul 5, 2023
|
||||
RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \
|
||||
echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \
|
||||
mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/prefix.control
|
||||
|
||||
#########################################################################################
|
||||
@@ -499,16 +517,17 @@ RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O p
|
||||
# compile hll extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS hll-pg-build
|
||||
FROM build-deps AS hll-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# not version-specific
|
||||
# last release v2.18 - Aug 29, 2023
|
||||
RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \
|
||||
echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \
|
||||
mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/hll.control
|
||||
|
||||
#########################################################################################
|
||||
@@ -517,16 +536,17 @@ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar
|
||||
# compile plpgsql_check extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS plpgsql-check-pg-build
|
||||
FROM build-deps AS plpgsql-check-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# plpgsql_check v2.7.11 supports v17
|
||||
# last release v2.7.11 - Sep 16, 2024
|
||||
RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.7.11.tar.gz -O plpgsql_check.tar.gz && \
|
||||
echo "208933f8dbe8e0d2628eb3851e9f52e6892b8e280c63700c0f1ce7883625d172 plpgsql_check.tar.gz" | sha256sum --check && \
|
||||
mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control
|
||||
|
||||
#########################################################################################
|
||||
@@ -535,8 +555,11 @@ RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.7.11.tar.gz
|
||||
# compile timescaledb extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS timescaledb-pg-build
|
||||
FROM build-deps AS timescaledb-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ARG PG_VERSION
|
||||
ENV PATH="/usr/local/pgsql/bin:$PATH"
|
||||
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v14" | "v15") \
|
||||
@@ -567,8 +590,11 @@ RUN case "${PG_VERSION}" in \
|
||||
# compile pg_hint_plan extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS pg-hint-plan-pg-build
|
||||
FROM build-deps AS pg-hint-plan-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ARG PG_VERSION
|
||||
ENV PATH="/usr/local/pgsql/bin:$PATH"
|
||||
|
||||
# version-specific, has separate releases for each version
|
||||
RUN case "${PG_VERSION}" in \
|
||||
@@ -606,12 +632,14 @@ RUN case "${PG_VERSION}" in \
|
||||
# compile pg_cron extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS pg-cron-pg-build
|
||||
FROM build-deps AS pg-cron-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# This is an experimental extension that we do not support on prod yet.
|
||||
# !Do not remove!
|
||||
# We set it in shared_preload_libraries and computes will fail to start if library is not found.
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.4.tar.gz -O pg_cron.tar.gz && \
|
||||
echo "52d1850ee7beb85a4cb7185731ef4e5a90d1de216709d8988324b0d02e76af61 pg_cron.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \
|
||||
@@ -625,8 +653,9 @@ RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.4.tar.gz -O
|
||||
# compile rdkit extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS rdkit-pg-build
|
||||
FROM build-deps AS rdkit-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN apt update && \
|
||||
apt install --no-install-recommends --no-install-suggests -y \
|
||||
@@ -644,13 +673,7 @@ RUN apt update && \
|
||||
# Use new version only for v17
|
||||
# because Release_2024_09_1 has some backward incompatible changes
|
||||
# https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1
|
||||
|
||||
# XXX: /usr/local/pgsql/bin is already in PATH, and that should be enough to find
|
||||
# pg_config. For some reason the rdkit cmake script doesn't work with just that,
|
||||
# however. By also adding /usr/local/pgsql, it works, which is weird because there
|
||||
# are no executables in that directory.
|
||||
ENV PATH="/usr/local/pgsql:$PATH"
|
||||
|
||||
ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v17") \
|
||||
export RDKIT_VERSION=Release_2024_09_1 \
|
||||
@@ -703,11 +726,13 @@ RUN case "${PG_VERSION}" in \
|
||||
# compile pg_uuidv7 extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS pg-uuidv7-pg-build
|
||||
FROM build-deps AS pg-uuidv7-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# not version-specific
|
||||
# last release v1.6.0 - Oct 9, 2024
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.6.0.tar.gz -O pg_uuidv7.tar.gz && \
|
||||
echo "0fa6c710929d003f6ce276a7de7a864e9d1667b2d78be3dc2c07f2409eb55867 pg_uuidv7.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
|
||||
@@ -721,11 +746,13 @@ RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.6.0.tar.gz
|
||||
# compile pg_roaringbitmap extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS pg-roaringbitmap-pg-build
|
||||
FROM build-deps AS pg-roaringbitmap-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# not version-specific
|
||||
# last release v0.5.4 - Jun 28, 2022
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
|
||||
echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
|
||||
@@ -739,14 +766,16 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4
|
||||
# compile pg_semver extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS pg-semver-pg-build
|
||||
FROM build-deps AS pg-semver-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# Release 0.40.0 breaks backward compatibility with previous versions
|
||||
# see release note https://github.com/theory/pg-semver/releases/tag/v0.40.0
|
||||
# Use new version only for v17
|
||||
#
|
||||
# last release v0.40.0 - Jul 22, 2024
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v17") \
|
||||
export SEMVER_VERSION=0.40.0 \
|
||||
@@ -773,11 +802,13 @@ RUN case "${PG_VERSION}" in \
|
||||
# compile pg_embedding extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS pg-embedding-pg-build
|
||||
FROM build-deps AS pg-embedding-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# This is our extension, support stopped in favor of pgvector
|
||||
# TODO: deprecate it
|
||||
ARG PG_VERSION
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v14" | "v15") \
|
||||
export PG_EMBEDDING_VERSION=0.3.5 \
|
||||
@@ -798,19 +829,26 @@ RUN case "${PG_VERSION}" in \
|
||||
# compile anon extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS pg-anon-pg-build
|
||||
FROM build-deps AS pg-anon-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# This is an experimental extension, never got to real production.
|
||||
# !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found.
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
RUN case "${PG_VERSION}" in "v17") \
|
||||
echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \
|
||||
esac && \
|
||||
wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
|
||||
echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
|
||||
mkdir -p /extensions/anon && cp /usr/local/pgsql/share/extension/anon.control /extensions/anon && \
|
||||
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
|
||||
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/anon.tar.zst -T -
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -818,8 +856,9 @@ RUN case "${PG_VERSION}" in "v17") \
|
||||
# This layer is used to build `pgrx` deps
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS rust-extensions-build
|
||||
FROM build-deps AS rust-extensions-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN apt update && \
|
||||
apt install --no-install-recommends --no-install-suggests -y curl libclang-dev && \
|
||||
@@ -827,7 +866,7 @@ RUN apt update && \
|
||||
useradd -ms /bin/bash nonroot -b /home
|
||||
|
||||
ENV HOME=/home/nonroot
|
||||
ENV PATH="/home/nonroot/.cargo/bin:$PATH"
|
||||
ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
|
||||
USER nonroot
|
||||
WORKDIR /home/nonroot
|
||||
|
||||
@@ -854,8 +893,9 @@ USER root
|
||||
# and eventually get merged with `rust-extensions-build`
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS rust-extensions-build-pgrx12
|
||||
FROM build-deps AS rust-extensions-build-pgrx12
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN apt update && \
|
||||
apt install --no-install-recommends --no-install-suggests -y curl libclang-dev && \
|
||||
@@ -863,7 +903,7 @@ RUN apt update && \
|
||||
useradd -ms /bin/bash nonroot -b /home
|
||||
|
||||
ENV HOME=/home/nonroot
|
||||
ENV PATH="/home/nonroot/.cargo/bin:$PATH"
|
||||
ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
|
||||
USER nonroot
|
||||
WORKDIR /home/nonroot
|
||||
|
||||
@@ -936,9 +976,22 @@ RUN apt update && apt install --no-install-recommends --no-install-suggests -y p
|
||||
|
||||
FROM rust-extensions-build-pgrx12 AS pg-jsonschema-pg-build
|
||||
ARG PG_VERSION
|
||||
# version 0.3.3 supports v17
|
||||
# last release v0.3.3 - Oct 16, 2024
|
||||
RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.3.tar.gz -O pg_jsonschema.tar.gz && \
|
||||
echo "40c2cffab4187e0233cb8c3bde013be92218c282f95f4469c5282f6b30d64eac pg_jsonschema.tar.gz" | sha256sum --check && \
|
||||
#
|
||||
# there were no breaking changes
|
||||
# so we can use the same version for all postgres versions
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v14" | "v15" | "v16" | "v17") \
|
||||
export PG_JSONSCHEMA_VERSION=0.3.3 \
|
||||
export PG_JSONSCHEMA_CHECKSUM=40c2cffab4187e0233cb8c3bde013be92218c282f95f4469c5282f6b30d64eac \
|
||||
;; \
|
||||
*) \
|
||||
echo "unexpected PostgreSQL version" && exit 1 \
|
||||
;; \
|
||||
esac && \
|
||||
wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v${PG_JSONSCHEMA_VERSION}.tar.gz -O pg_jsonschema.tar.gz && \
|
||||
echo "${PG_JSONSCHEMA_CHECKSUM} pg_jsonschema.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
|
||||
# see commit 252b3685a27a0f4c31a0f91e983c6314838e89e8
|
||||
# `unsafe-postgres` feature allows to build pgx extensions
|
||||
@@ -959,9 +1012,22 @@ RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.3.tar.
|
||||
FROM rust-extensions-build-pgrx12 AS pg-graphql-pg-build
|
||||
ARG PG_VERSION
|
||||
|
||||
# version 1.5.9 supports v17
|
||||
# last release v1.5.9 - Oct 16, 2024
|
||||
RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.9.tar.gz -O pg_graphql.tar.gz && \
|
||||
echo "cf768385a41278be1333472204fc0328118644ae443182cf52f7b9b23277e497 pg_graphql.tar.gz" | sha256sum --check && \
|
||||
#
|
||||
# there were no breaking changes
|
||||
# so we can use the same version for all postgres versions
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v14" | "v15" | "v16" | "v17") \
|
||||
export PG_GRAPHQL_VERSION=1.5.9 \
|
||||
export PG_GRAPHQL_CHECKSUM=cf768385a41278be1333472204fc0328118644ae443182cf52f7b9b23277e497 \
|
||||
;; \
|
||||
*) \
|
||||
echo "unexpected PostgreSQL version" && exit 1 \
|
||||
;; \
|
||||
esac && \
|
||||
wget https://github.com/supabase/pg_graphql/archive/refs/tags/v${PG_GRAPHQL_VERSION}.tar.gz -O pg_graphql.tar.gz && \
|
||||
echo "${PG_GRAPHQL_CHECKSUM} pg_graphql.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
|
||||
sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
cargo pgrx install --release && \
|
||||
@@ -1025,8 +1091,8 @@ ARG PG_VERSION
|
||||
# NOTE: local_proxy depends on the version of pg_session_jwt
|
||||
# Do not update without approve from proxy team
|
||||
# Make sure the version is reflected in proxy/src/serverless/local_conn_pool.rs
|
||||
RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0.tar.gz -O pg_session_jwt.tar.gz && \
|
||||
echo "5ace028e591f2e000ca10afa5b1ca62203ebff014c2907c0ec3b29c36f28a1bb pg_session_jwt.tar.gz" | sha256sum --check && \
|
||||
RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.1.2-v17.tar.gz -O pg_session_jwt.tar.gz && \
|
||||
echo "c8ecbed9cb8c6441bce5134a176002b043018adf9d05a08e457dda233090a86e pg_session_jwt.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "=0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
cargo pgrx install --release
|
||||
@@ -1038,11 +1104,13 @@ RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM pg-build AS wal2json-pg-build
|
||||
FROM build-deps AS wal2json-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# wal2json wal2json_2_6 supports v17
|
||||
# last release wal2json_2_6 - Apr 25, 2024
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar.gz -O wal2json.tar.gz && \
|
||||
echo "18b4bdec28c74a8fc98a11c72de38378a760327ef8e5e42e975b0029eb96ba0d wal2json.tar.gz" | sha256sum --check && \
|
||||
mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json.tar.gz --strip-components=1 -C . && \
|
||||
@@ -1055,11 +1123,13 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar.
|
||||
# compile pg_ivm extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS pg-ivm-build
|
||||
FROM build-deps AS pg-ivm-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# pg_ivm v1.9 supports v17
|
||||
# last release v1.9 - Jul 31
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_ivm.tar.gz && \
|
||||
echo "59e15722939f274650abf637f315dd723c87073496ca77236b044cb205270d8b pg_ivm.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \
|
||||
@@ -1073,11 +1143,13 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_iv
|
||||
# compile pg_partman extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM pg-build AS pg-partman-build
|
||||
FROM build-deps AS pg-partman-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# should support v17 https://github.com/pgpartman/pg_partman/discussions/693
|
||||
# last release 5.1.0 Apr 2, 2024
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz -O pg_partman.tar.gz && \
|
||||
echo "3e3a27d7ff827295d5c55ef72f07a49062d6204b3cb0b9a048645d6db9f3cb9f pg_partman.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \
|
||||
@@ -1093,12 +1165,24 @@ RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz
|
||||
#########################################################################################
|
||||
FROM rust-extensions-build AS pg-mooncake-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.0/pg_mooncake-0.1.0.tar.gz -O pg_mooncake.tar.gz && \
|
||||
echo "eafd059b77f541f11525eb8affcd66a176968cbd8fe7c0d436e733f2aa4da59f pg_mooncake.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_mooncake-src && cd pg_mooncake-src && tar xzf ../pg_mooncake.tar.gz --strip-components=1 -C . && \
|
||||
make release -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make install -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
# The topmost commit in the `neon` branch at the time of writing this
|
||||
# https://github.com/Mooncake-Labs/pg_mooncake/commits/neon/
|
||||
# https://github.com/Mooncake-Labs/pg_mooncake/commit/077c92c452bb6896a7b7776ee95f039984f076af
|
||||
ENV PG_MOONCAKE_VERSION=077c92c452bb6896a7b7776ee95f039984f076af
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
|
||||
RUN case "${PG_VERSION}" in \
|
||||
'v14') \
|
||||
echo "pg_mooncake is not supported on Postgres ${PG_VERSION}" && exit 0;; \
|
||||
esac && \
|
||||
git clone --depth 1 --branch neon https://github.com/Mooncake-Labs/pg_mooncake.git pg_mooncake-src && \
|
||||
cd pg_mooncake-src && \
|
||||
git checkout "${PG_MOONCAKE_VERSION}" && \
|
||||
git submodule update --init --depth 1 --recursive && \
|
||||
make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_mooncake.control
|
||||
|
||||
#########################################################################################
|
||||
@@ -1108,8 +1192,11 @@ RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.0/p
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM pg-build AS pg-repack-build
|
||||
FROM build-deps AS pg-repack-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
|
||||
RUN wget https://github.com/reorg/pg_repack/archive/refs/tags/ver_1.5.2.tar.gz -O pg_repack.tar.gz && \
|
||||
echo '4516cad42251ed3ad53ff619733004db47d5755acac83f75924cd94d1c4fb681 pg_repack.tar.gz' | sha256sum --check && \
|
||||
@@ -1180,6 +1267,20 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
|
||||
-C pgxn/neon_rmgr \
|
||||
-s install && \
|
||||
case "${PG_VERSION}" in \
|
||||
"v14" | "v15") \
|
||||
;; \
|
||||
"v16" | "v17") \
|
||||
echo "Skipping HNSW for PostgreSQL ${PG_VERSION}" && exit 0 \
|
||||
;; \
|
||||
*) \
|
||||
echo "unexpected PostgreSQL version" && exit 1 \
|
||||
;; \
|
||||
esac && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
|
||||
-C pgxn/hnsw \
|
||||
-s install
|
||||
|
||||
#########################################################################################
|
||||
@@ -1196,6 +1297,17 @@ USER nonroot
|
||||
COPY --chown=nonroot . .
|
||||
RUN mold -run cargo build --locked --profile release-line-debug-size-lto --bin compute_ctl --bin fast_import --bin local_proxy
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Final compute-tools image
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM debian:$DEBIAN_FLAVOR AS compute-tools-image
|
||||
|
||||
COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
|
||||
COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/fast_import /usr/local/bin/fast_import
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pgbouncer"
|
||||
@@ -1232,11 +1344,11 @@ RUN set -e \
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM quay.io/prometheuscommunity/postgres-exporter:v0.16.0 AS postgres-exporter
|
||||
FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter
|
||||
|
||||
# Keep the version the same as in build-tools.Dockerfile and
|
||||
# test_runner/regress/test_compute_metrics.py.
|
||||
FROM burningalchemist/sql_exporter:0.17.0 AS sql-exporter
|
||||
FROM burningalchemist/sql_exporter:0.16.0 AS sql-exporter
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
|
||||
@@ -15,7 +15,6 @@ aws-config.workspace = true
|
||||
aws-sdk-s3.workspace = true
|
||||
aws-sdk-kms.workspace = true
|
||||
anyhow.workspace = true
|
||||
axum = { workspace = true, features = [] }
|
||||
camino.workspace = true
|
||||
chrono.workspace = true
|
||||
cfg-if.workspace = true
|
||||
@@ -23,7 +22,7 @@ clap.workspace = true
|
||||
fail.workspace = true
|
||||
flate2.workspace = true
|
||||
futures.workspace = true
|
||||
http.workspace = true
|
||||
hyper0 = { workspace = true, features = ["full"] }
|
||||
metrics.workspace = true
|
||||
nix.workspace = true
|
||||
notify.workspace = true
|
||||
@@ -38,8 +37,6 @@ serde_with.workspace = true
|
||||
serde_json.workspace = true
|
||||
signal-hook.workspace = true
|
||||
tar.workspace = true
|
||||
tower.workspace = true
|
||||
tower-http.workspace = true
|
||||
reqwest = { workspace = true, features = ["json"] }
|
||||
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
|
||||
tokio-postgres.workspace = true
|
||||
|
||||
@@ -60,7 +60,7 @@ use compute_tools::compute::{
|
||||
};
|
||||
use compute_tools::configurator::launch_configurator;
|
||||
use compute_tools::extension_server::get_pg_version_string;
|
||||
use compute_tools::http::launch_http_server;
|
||||
use compute_tools::http::api::launch_http_server;
|
||||
use compute_tools::logger::*;
|
||||
use compute_tools::monitor::launch_monitor;
|
||||
use compute_tools::params::*;
|
||||
@@ -111,6 +111,11 @@ fn main() -> Result<()> {
|
||||
fn init() -> Result<(String, clap::ArgMatches)> {
|
||||
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
|
||||
|
||||
opentelemetry::global::set_error_handler(|err| {
|
||||
tracing::info!("OpenTelemetry error: {err}");
|
||||
})
|
||||
.expect("global error handler lock poisoned");
|
||||
|
||||
let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
|
||||
thread::spawn(move || {
|
||||
for sig in signals.forever() {
|
||||
@@ -488,10 +493,7 @@ fn start_postgres(
|
||||
let mut pg = None;
|
||||
if !prestartup_failed {
|
||||
pg = match compute.start_compute() {
|
||||
Ok(pg) => {
|
||||
info!(postmaster_pid = %pg.0.id(), "Postgres was started");
|
||||
Some(pg)
|
||||
}
|
||||
Ok(pg) => Some(pg),
|
||||
Err(err) => {
|
||||
error!("could not start the compute node: {:#}", err);
|
||||
compute.set_failed_status(err);
|
||||
@@ -589,8 +591,6 @@ fn wait_postgres(pg: Option<PostgresHandle>) -> Result<WaitPostgresResult> {
|
||||
// propagate to Postgres and it will be shut down as well.
|
||||
let mut exit_code = None;
|
||||
if let Some((mut pg, logs_handle)) = pg {
|
||||
info!(postmaster_pid = %pg.id(), "Waiting for Postgres to exit");
|
||||
|
||||
let ecode = pg
|
||||
.wait()
|
||||
.expect("failed to start waiting on Postgres process");
|
||||
|
||||
@@ -17,7 +17,7 @@
|
||||
//!
|
||||
//! # Local Testing
|
||||
//!
|
||||
//! - Comment out most of the pgxns in compute-node.Dockerfile to speed up the build.
|
||||
//! - Comment out most of the pgxns in The Dockerfile.compute-tools to speed up the build.
|
||||
//! - Build the image with the following command:
|
||||
//!
|
||||
//! ```bash
|
||||
|
||||
@@ -36,11 +36,11 @@ pub async fn get_dbs_and_roles(compute: &Arc<ComputeNode>) -> anyhow::Result<Cat
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum SchemaDumpError {
|
||||
#[error("database does not exist")]
|
||||
#[error("Database does not exist.")]
|
||||
DatabaseDoesNotExist,
|
||||
#[error("failed to execute pg_dump")]
|
||||
#[error("Failed to execute pg_dump.")]
|
||||
IO(#[from] std::io::Error),
|
||||
#[error("unexpected I/O error")]
|
||||
#[error("Unexpected error.")]
|
||||
Unexpected,
|
||||
}
|
||||
|
||||
|
||||
606
compute_tools/src/http/api.rs
Normal file
606
compute_tools/src/http/api.rs
Normal file
@@ -0,0 +1,606 @@
|
||||
use std::convert::Infallible;
|
||||
use std::net::IpAddr;
|
||||
use std::net::Ipv6Addr;
|
||||
use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
|
||||
use crate::catalog::SchemaDumpError;
|
||||
use crate::catalog::{get_database_schema, get_dbs_and_roles};
|
||||
use crate::compute::forward_termination_signal;
|
||||
use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
|
||||
use crate::installed_extensions;
|
||||
use compute_api::requests::{ConfigurationRequest, ExtensionInstallRequest, SetRoleGrantsRequest};
|
||||
use compute_api::responses::{
|
||||
ComputeStatus, ComputeStatusResponse, ExtensionInstallResult, GenericAPIError,
|
||||
SetRoleGrantsResponse,
|
||||
};
|
||||
|
||||
use anyhow::Result;
|
||||
use hyper::header::CONTENT_TYPE;
|
||||
use hyper::service::{make_service_fn, service_fn};
|
||||
use hyper::{Body, Method, Request, Response, Server, StatusCode};
|
||||
use metrics::proto::MetricFamily;
|
||||
use metrics::Encoder;
|
||||
use metrics::TextEncoder;
|
||||
use tokio::task;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, error, info, warn};
|
||||
use tracing_utils::http::OtelName;
|
||||
use utils::failpoint_support::failpoints_handler;
|
||||
use utils::http::error::ApiError;
|
||||
use utils::http::request::must_get_query_param;
|
||||
|
||||
fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
|
||||
ComputeStatusResponse {
|
||||
start_time: state.start_time,
|
||||
tenant: state
|
||||
.pspec
|
||||
.as_ref()
|
||||
.map(|pspec| pspec.tenant_id.to_string()),
|
||||
timeline: state
|
||||
.pspec
|
||||
.as_ref()
|
||||
.map(|pspec| pspec.timeline_id.to_string()),
|
||||
status: state.status,
|
||||
last_active: state.last_active,
|
||||
error: state.error.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
// Service function to handle all available routes.
|
||||
async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body> {
|
||||
//
|
||||
// NOTE: The URI path is currently included in traces. That's OK because
|
||||
// it doesn't contain any variable parts or sensitive information. But
|
||||
// please keep that in mind if you change the routing here.
|
||||
//
|
||||
match (req.method(), req.uri().path()) {
|
||||
// Serialized compute state.
|
||||
(&Method::GET, "/status") => {
|
||||
debug!("serving /status GET request");
|
||||
let state = compute.state.lock().unwrap();
|
||||
let status_response = status_response_from_state(&state);
|
||||
Response::new(Body::from(serde_json::to_string(&status_response).unwrap()))
|
||||
}
|
||||
|
||||
// Startup metrics in JSON format. Keep /metrics reserved for a possible
|
||||
// future use for Prometheus metrics format.
|
||||
(&Method::GET, "/metrics.json") => {
|
||||
info!("serving /metrics.json GET request");
|
||||
let metrics = compute.state.lock().unwrap().metrics.clone();
|
||||
Response::new(Body::from(serde_json::to_string(&metrics).unwrap()))
|
||||
}
|
||||
|
||||
// Prometheus metrics
|
||||
(&Method::GET, "/metrics") => {
|
||||
debug!("serving /metrics GET request");
|
||||
|
||||
// When we call TextEncoder::encode() below, it will immediately
|
||||
// return an error if a metric family has no metrics, so we need to
|
||||
// preemptively filter out metric families with no metrics.
|
||||
let metrics = installed_extensions::collect()
|
||||
.into_iter()
|
||||
.filter(|m| !m.get_metric().is_empty())
|
||||
.collect::<Vec<MetricFamily>>();
|
||||
|
||||
let encoder = TextEncoder::new();
|
||||
let mut buffer = vec![];
|
||||
|
||||
if let Err(err) = encoder.encode(&metrics, &mut buffer) {
|
||||
let msg = format!("error handling /metrics request: {err}");
|
||||
error!(msg);
|
||||
return render_json_error(&msg, StatusCode::INTERNAL_SERVER_ERROR);
|
||||
}
|
||||
|
||||
match Response::builder()
|
||||
.status(StatusCode::OK)
|
||||
.header(CONTENT_TYPE, encoder.format_type())
|
||||
.body(Body::from(buffer))
|
||||
{
|
||||
Ok(response) => response,
|
||||
Err(err) => {
|
||||
let msg = format!("error handling /metrics request: {err}");
|
||||
error!(msg);
|
||||
render_json_error(&msg, StatusCode::INTERNAL_SERVER_ERROR)
|
||||
}
|
||||
}
|
||||
}
|
||||
// Collect Postgres current usage insights
|
||||
(&Method::GET, "/insights") => {
|
||||
info!("serving /insights GET request");
|
||||
let status = compute.get_status();
|
||||
if status != ComputeStatus::Running {
|
||||
let msg = format!("compute is not running, current status: {:?}", status);
|
||||
error!(msg);
|
||||
return Response::new(Body::from(msg));
|
||||
}
|
||||
|
||||
let insights = compute.collect_insights().await;
|
||||
Response::new(Body::from(insights))
|
||||
}
|
||||
|
||||
(&Method::POST, "/check_writability") => {
|
||||
info!("serving /check_writability POST request");
|
||||
let status = compute.get_status();
|
||||
if status != ComputeStatus::Running {
|
||||
let msg = format!(
|
||||
"invalid compute status for check_writability request: {:?}",
|
||||
status
|
||||
);
|
||||
error!(msg);
|
||||
return Response::new(Body::from(msg));
|
||||
}
|
||||
|
||||
let res = crate::checker::check_writability(compute).await;
|
||||
match res {
|
||||
Ok(_) => Response::new(Body::from("true")),
|
||||
Err(e) => {
|
||||
error!("check_writability failed: {}", e);
|
||||
Response::new(Body::from(e.to_string()))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
(&Method::POST, "/extensions") => {
|
||||
info!("serving /extensions POST request");
|
||||
let status = compute.get_status();
|
||||
if status != ComputeStatus::Running {
|
||||
let msg = format!(
|
||||
"invalid compute status for extensions request: {:?}",
|
||||
status
|
||||
);
|
||||
error!(msg);
|
||||
return render_json_error(&msg, StatusCode::PRECONDITION_FAILED);
|
||||
}
|
||||
|
||||
let request = hyper::body::to_bytes(req.into_body()).await.unwrap();
|
||||
let request = serde_json::from_slice::<ExtensionInstallRequest>(&request).unwrap();
|
||||
let res = compute
|
||||
.install_extension(&request.extension, &request.database, request.version)
|
||||
.await;
|
||||
match res {
|
||||
Ok(version) => render_json(Body::from(
|
||||
serde_json::to_string(&ExtensionInstallResult {
|
||||
extension: request.extension,
|
||||
version,
|
||||
})
|
||||
.unwrap(),
|
||||
)),
|
||||
Err(e) => {
|
||||
error!("install_extension failed: {}", e);
|
||||
render_json_error(&e.to_string(), StatusCode::INTERNAL_SERVER_ERROR)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
(&Method::GET, "/info") => {
|
||||
let num_cpus = num_cpus::get_physical();
|
||||
info!("serving /info GET request. num_cpus: {}", num_cpus);
|
||||
Response::new(Body::from(
|
||||
serde_json::json!({
|
||||
"num_cpus": num_cpus,
|
||||
})
|
||||
.to_string(),
|
||||
))
|
||||
}
|
||||
|
||||
// Accept spec in JSON format and request compute configuration. If
|
||||
// anything goes wrong after we set the compute status to `ConfigurationPending`
|
||||
// and update compute state with new spec, we basically leave compute
|
||||
// in the potentially wrong state. That said, it's control-plane's
|
||||
// responsibility to watch compute state after reconfiguration request
|
||||
// and to clean restart in case of errors.
|
||||
(&Method::POST, "/configure") => {
|
||||
info!("serving /configure POST request");
|
||||
match handle_configure_request(req, compute).await {
|
||||
Ok(msg) => Response::new(Body::from(msg)),
|
||||
Err((msg, code)) => {
|
||||
error!("error handling /configure request: {msg}");
|
||||
render_json_error(&msg, code)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
(&Method::POST, "/terminate") => {
|
||||
info!("serving /terminate POST request");
|
||||
match handle_terminate_request(compute).await {
|
||||
Ok(()) => Response::new(Body::empty()),
|
||||
Err((msg, code)) => {
|
||||
error!("error handling /terminate request: {msg}");
|
||||
render_json_error(&msg, code)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
(&Method::GET, "/dbs_and_roles") => {
|
||||
info!("serving /dbs_and_roles GET request",);
|
||||
match get_dbs_and_roles(compute).await {
|
||||
Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())),
|
||||
Err(_) => {
|
||||
render_json_error("can't get dbs and roles", StatusCode::INTERNAL_SERVER_ERROR)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
(&Method::GET, "/database_schema") => {
|
||||
let database = match must_get_query_param(&req, "database") {
|
||||
Err(e) => return e.into_response(),
|
||||
Ok(database) => database,
|
||||
};
|
||||
info!("serving /database_schema GET request with database: {database}",);
|
||||
match get_database_schema(compute, &database).await {
|
||||
Ok(res) => render_plain(Body::wrap_stream(res)),
|
||||
Err(SchemaDumpError::DatabaseDoesNotExist) => {
|
||||
render_json_error("database does not exist", StatusCode::NOT_FOUND)
|
||||
}
|
||||
Err(e) => {
|
||||
error!("can't get schema dump: {}", e);
|
||||
render_json_error("can't get schema dump", StatusCode::INTERNAL_SERVER_ERROR)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
(&Method::POST, "/grants") => {
|
||||
info!("serving /grants POST request");
|
||||
let status = compute.get_status();
|
||||
if status != ComputeStatus::Running {
|
||||
let msg = format!(
|
||||
"invalid compute status for set_role_grants request: {:?}",
|
||||
status
|
||||
);
|
||||
error!(msg);
|
||||
return render_json_error(&msg, StatusCode::PRECONDITION_FAILED);
|
||||
}
|
||||
|
||||
let request = hyper::body::to_bytes(req.into_body()).await.unwrap();
|
||||
let request = serde_json::from_slice::<SetRoleGrantsRequest>(&request).unwrap();
|
||||
|
||||
let res = compute
|
||||
.set_role_grants(
|
||||
&request.database,
|
||||
&request.schema,
|
||||
&request.privileges,
|
||||
&request.role,
|
||||
)
|
||||
.await;
|
||||
match res {
|
||||
Ok(()) => render_json(Body::from(
|
||||
serde_json::to_string(&SetRoleGrantsResponse {
|
||||
database: request.database,
|
||||
schema: request.schema,
|
||||
role: request.role,
|
||||
privileges: request.privileges,
|
||||
})
|
||||
.unwrap(),
|
||||
)),
|
||||
Err(e) => render_json_error(
|
||||
&format!("could not grant role privileges to the schema: {e}"),
|
||||
// TODO: can we filter on role/schema not found errors
|
||||
// and return appropriate error code?
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
// get the list of installed extensions
|
||||
// currently only used in python tests
|
||||
// TODO: call it from cplane
|
||||
(&Method::GET, "/installed_extensions") => {
|
||||
info!("serving /installed_extensions GET request");
|
||||
let status = compute.get_status();
|
||||
if status != ComputeStatus::Running {
|
||||
let msg = format!(
|
||||
"invalid compute status for extensions request: {:?}",
|
||||
status
|
||||
);
|
||||
error!(msg);
|
||||
return Response::new(Body::from(msg));
|
||||
}
|
||||
|
||||
let conf = compute.get_conn_conf(None);
|
||||
let res =
|
||||
task::spawn_blocking(move || installed_extensions::get_installed_extensions(conf))
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
match res {
|
||||
Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())),
|
||||
Err(e) => render_json_error(
|
||||
&format!("could not get list of installed extensions: {}", e),
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
(&Method::POST, "/failpoints") if cfg!(feature = "testing") => {
|
||||
match failpoints_handler(req, CancellationToken::new()).await {
|
||||
Ok(r) => r,
|
||||
Err(ApiError::BadRequest(e)) => {
|
||||
render_json_error(&e.to_string(), StatusCode::BAD_REQUEST)
|
||||
}
|
||||
Err(_) => {
|
||||
render_json_error("Internal server error", StatusCode::INTERNAL_SERVER_ERROR)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// download extension files from remote extension storage on demand
|
||||
(&Method::POST, route) if route.starts_with("/extension_server/") => {
|
||||
info!("serving {:?} POST request", route);
|
||||
info!("req.uri {:?}", req.uri());
|
||||
|
||||
// don't even try to download extensions
|
||||
// if no remote storage is configured
|
||||
if compute.ext_remote_storage.is_none() {
|
||||
info!("no extensions remote storage configured");
|
||||
let mut resp = Response::new(Body::from("no remote storage configured"));
|
||||
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
|
||||
return resp;
|
||||
}
|
||||
|
||||
let mut is_library = false;
|
||||
if let Some(params) = req.uri().query() {
|
||||
info!("serving {:?} POST request with params: {}", route, params);
|
||||
if params == "is_library=true" {
|
||||
is_library = true;
|
||||
} else {
|
||||
let mut resp = Response::new(Body::from("Wrong request parameters"));
|
||||
*resp.status_mut() = StatusCode::BAD_REQUEST;
|
||||
return resp;
|
||||
}
|
||||
}
|
||||
let filename = route.split('/').last().unwrap().to_string();
|
||||
info!("serving /extension_server POST request, filename: {filename:?} is_library: {is_library}");
|
||||
|
||||
// get ext_name and path from spec
|
||||
// don't lock compute_state for too long
|
||||
let ext = {
|
||||
let compute_state = compute.state.lock().unwrap();
|
||||
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
let spec = &pspec.spec;
|
||||
|
||||
// debug only
|
||||
info!("spec: {:?}", spec);
|
||||
|
||||
let remote_extensions = match spec.remote_extensions.as_ref() {
|
||||
Some(r) => r,
|
||||
None => {
|
||||
info!("no remote extensions spec was provided");
|
||||
let mut resp = Response::new(Body::from("no remote storage configured"));
|
||||
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
|
||||
return resp;
|
||||
}
|
||||
};
|
||||
|
||||
remote_extensions.get_ext(
|
||||
&filename,
|
||||
is_library,
|
||||
&compute.build_tag,
|
||||
&compute.pgversion,
|
||||
)
|
||||
};
|
||||
|
||||
match ext {
|
||||
Ok((ext_name, ext_path)) => {
|
||||
match compute.download_extension(ext_name, ext_path).await {
|
||||
Ok(_) => Response::new(Body::from("OK")),
|
||||
Err(e) => {
|
||||
error!("extension download failed: {}", e);
|
||||
let mut resp = Response::new(Body::from(e.to_string()));
|
||||
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
|
||||
resp
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("extension download failed to find extension: {}", e);
|
||||
let mut resp = Response::new(Body::from("failed to find file"));
|
||||
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
|
||||
resp
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Return the `404 Not Found` for any other routes.
|
||||
_ => {
|
||||
let mut not_found = Response::new(Body::from("404 Not Found"));
|
||||
*not_found.status_mut() = StatusCode::NOT_FOUND;
|
||||
not_found
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn handle_configure_request(
|
||||
req: Request<Body>,
|
||||
compute: &Arc<ComputeNode>,
|
||||
) -> Result<String, (String, StatusCode)> {
|
||||
if !compute.live_config_allowed {
|
||||
return Err((
|
||||
"live configuration is not allowed for this compute node".to_string(),
|
||||
StatusCode::PRECONDITION_FAILED,
|
||||
));
|
||||
}
|
||||
|
||||
let body_bytes = hyper::body::to_bytes(req.into_body()).await.unwrap();
|
||||
let spec_raw = String::from_utf8(body_bytes.to_vec()).unwrap();
|
||||
if let Ok(request) = serde_json::from_str::<ConfigurationRequest>(&spec_raw) {
|
||||
let spec = request.spec;
|
||||
|
||||
let parsed_spec = match ParsedSpec::try_from(spec) {
|
||||
Ok(ps) => ps,
|
||||
Err(msg) => return Err((msg, StatusCode::BAD_REQUEST)),
|
||||
};
|
||||
|
||||
// XXX: wrap state update under lock in code blocks. Otherwise,
|
||||
// we will try to `Send` `mut state` into the spawned thread
|
||||
// bellow, which will cause error:
|
||||
// ```
|
||||
// error: future cannot be sent between threads safely
|
||||
// ```
|
||||
{
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running {
|
||||
let msg = format!(
|
||||
"invalid compute status for configuration request: {:?}",
|
||||
state.status.clone()
|
||||
);
|
||||
return Err((msg, StatusCode::PRECONDITION_FAILED));
|
||||
}
|
||||
state.pspec = Some(parsed_spec);
|
||||
state.set_status(ComputeStatus::ConfigurationPending, &compute.state_changed);
|
||||
drop(state);
|
||||
info!("set new spec and notified waiters");
|
||||
}
|
||||
|
||||
// Spawn a blocking thread to wait for compute to become Running.
|
||||
// This is needed to do not block the main pool of workers and
|
||||
// be able to serve other requests while some particular request
|
||||
// is waiting for compute to finish configuration.
|
||||
let c = compute.clone();
|
||||
task::spawn_blocking(move || {
|
||||
let mut state = c.state.lock().unwrap();
|
||||
while state.status != ComputeStatus::Running {
|
||||
state = c.state_changed.wait(state).unwrap();
|
||||
info!(
|
||||
"waiting for compute to become Running, current status: {:?}",
|
||||
state.status
|
||||
);
|
||||
|
||||
if state.status == ComputeStatus::Failed {
|
||||
let err = state.error.as_ref().map_or("unknown error", |x| x);
|
||||
let msg = format!("compute configuration failed: {:?}", err);
|
||||
return Err((msg, StatusCode::INTERNAL_SERVER_ERROR));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
.unwrap()?;
|
||||
|
||||
// Return current compute state if everything went well.
|
||||
let state = compute.state.lock().unwrap().clone();
|
||||
let status_response = status_response_from_state(&state);
|
||||
Ok(serde_json::to_string(&status_response).unwrap())
|
||||
} else {
|
||||
Err(("invalid spec".to_string(), StatusCode::BAD_REQUEST))
|
||||
}
|
||||
}
|
||||
|
||||
fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
|
||||
let error = GenericAPIError {
|
||||
error: e.to_string(),
|
||||
};
|
||||
Response::builder()
|
||||
.status(status)
|
||||
.header(CONTENT_TYPE, "application/json")
|
||||
.body(Body::from(serde_json::to_string(&error).unwrap()))
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn render_json(body: Body) -> Response<Body> {
|
||||
Response::builder()
|
||||
.header(CONTENT_TYPE, "application/json")
|
||||
.body(body)
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn render_plain(body: Body) -> Response<Body> {
|
||||
Response::builder()
|
||||
.header(CONTENT_TYPE, "text/plain")
|
||||
.body(body)
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
async fn handle_terminate_request(compute: &Arc<ComputeNode>) -> Result<(), (String, StatusCode)> {
|
||||
{
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
if state.status == ComputeStatus::Terminated {
|
||||
return Ok(());
|
||||
}
|
||||
if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running {
|
||||
let msg = format!(
|
||||
"invalid compute status for termination request: {}",
|
||||
state.status
|
||||
);
|
||||
return Err((msg, StatusCode::PRECONDITION_FAILED));
|
||||
}
|
||||
state.set_status(ComputeStatus::TerminationPending, &compute.state_changed);
|
||||
drop(state);
|
||||
}
|
||||
|
||||
forward_termination_signal();
|
||||
info!("sent signal and notified waiters");
|
||||
|
||||
// Spawn a blocking thread to wait for compute to become Terminated.
|
||||
// This is needed to do not block the main pool of workers and
|
||||
// be able to serve other requests while some particular request
|
||||
// is waiting for compute to finish configuration.
|
||||
let c = compute.clone();
|
||||
task::spawn_blocking(move || {
|
||||
let mut state = c.state.lock().unwrap();
|
||||
while state.status != ComputeStatus::Terminated {
|
||||
state = c.state_changed.wait(state).unwrap();
|
||||
info!(
|
||||
"waiting for compute to become {}, current status: {:?}",
|
||||
ComputeStatus::Terminated,
|
||||
state.status
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
.unwrap()?;
|
||||
info!("terminated Postgres");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Main Hyper HTTP server function that runs it and blocks waiting on it forever.
|
||||
#[tokio::main]
|
||||
async fn serve(port: u16, state: Arc<ComputeNode>) {
|
||||
// this usually binds to both IPv4 and IPv6 on linux
|
||||
// see e.g. https://github.com/rust-lang/rust/pull/34440
|
||||
let addr = SocketAddr::new(IpAddr::from(Ipv6Addr::UNSPECIFIED), port);
|
||||
|
||||
let make_service = make_service_fn(move |_conn| {
|
||||
let state = state.clone();
|
||||
async move {
|
||||
Ok::<_, Infallible>(service_fn(move |req: Request<Body>| {
|
||||
let state = state.clone();
|
||||
async move {
|
||||
Ok::<_, Infallible>(
|
||||
// NOTE: We include the URI path in the string. It
|
||||
// doesn't contain any variable parts or sensitive
|
||||
// information in this API.
|
||||
tracing_utils::http::tracing_handler(
|
||||
req,
|
||||
|req| routes(req, &state),
|
||||
OtelName::UriPath,
|
||||
)
|
||||
.await,
|
||||
)
|
||||
}
|
||||
}))
|
||||
}
|
||||
});
|
||||
|
||||
info!("starting HTTP server on {}", addr);
|
||||
|
||||
let server = Server::bind(&addr).serve(make_service);
|
||||
|
||||
// Run this server forever
|
||||
if let Err(e) = server.await {
|
||||
error!("server error: {}", e);
|
||||
}
|
||||
}
|
||||
|
||||
/// Launch a separate Hyper HTTP API server thread and return its `JoinHandle`.
|
||||
pub fn launch_http_server(port: u16, state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
|
||||
let state = Arc::clone(state);
|
||||
|
||||
Ok(thread::Builder::new()
|
||||
.name("http-endpoint".into())
|
||||
.spawn(move || serve(port, state))?)
|
||||
}
|
||||
@@ -1,48 +0,0 @@
|
||||
use std::ops::{Deref, DerefMut};
|
||||
|
||||
use axum::{
|
||||
async_trait,
|
||||
extract::{rejection::JsonRejection, FromRequest, Request},
|
||||
};
|
||||
use compute_api::responses::GenericAPIError;
|
||||
use http::StatusCode;
|
||||
|
||||
/// Custom `Json` extractor, so that we can format errors into
|
||||
/// `JsonResponse<GenericAPIError>`.
|
||||
#[derive(Debug, Clone, Copy, Default)]
|
||||
pub(crate) struct Json<T>(pub T);
|
||||
|
||||
#[async_trait]
|
||||
impl<S, T> FromRequest<S> for Json<T>
|
||||
where
|
||||
axum::Json<T>: FromRequest<S, Rejection = JsonRejection>,
|
||||
S: Send + Sync,
|
||||
{
|
||||
type Rejection = (StatusCode, axum::Json<GenericAPIError>);
|
||||
|
||||
async fn from_request(req: Request, state: &S) -> Result<Self, Self::Rejection> {
|
||||
match axum::Json::<T>::from_request(req, state).await {
|
||||
Ok(value) => Ok(Self(value.0)),
|
||||
Err(rejection) => Err((
|
||||
rejection.status(),
|
||||
axum::Json(GenericAPIError {
|
||||
error: rejection.body_text().to_lowercase(),
|
||||
}),
|
||||
)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Deref for Json<T> {
|
||||
type Target = T;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> DerefMut for Json<T> {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
&mut self.0
|
||||
}
|
||||
}
|
||||
@@ -1,7 +0,0 @@
|
||||
pub(crate) mod json;
|
||||
pub(crate) mod path;
|
||||
pub(crate) mod query;
|
||||
|
||||
pub(crate) use json::Json;
|
||||
pub(crate) use path::Path;
|
||||
pub(crate) use query::Query;
|
||||
@@ -1,48 +0,0 @@
|
||||
use std::ops::{Deref, DerefMut};
|
||||
|
||||
use axum::{
|
||||
async_trait,
|
||||
extract::{rejection::PathRejection, FromRequestParts},
|
||||
};
|
||||
use compute_api::responses::GenericAPIError;
|
||||
use http::{request::Parts, StatusCode};
|
||||
|
||||
/// Custom `Path` extractor, so that we can format errors into
|
||||
/// `JsonResponse<GenericAPIError>`.
|
||||
#[derive(Debug, Clone, Copy, Default)]
|
||||
pub(crate) struct Path<T>(pub T);
|
||||
|
||||
#[async_trait]
|
||||
impl<S, T> FromRequestParts<S> for Path<T>
|
||||
where
|
||||
axum::extract::Path<T>: FromRequestParts<S, Rejection = PathRejection>,
|
||||
S: Send + Sync,
|
||||
{
|
||||
type Rejection = (StatusCode, axum::Json<GenericAPIError>);
|
||||
|
||||
async fn from_request_parts(parts: &mut Parts, state: &S) -> Result<Self, Self::Rejection> {
|
||||
match axum::extract::Path::<T>::from_request_parts(parts, state).await {
|
||||
Ok(value) => Ok(Self(value.0)),
|
||||
Err(rejection) => Err((
|
||||
rejection.status(),
|
||||
axum::Json(GenericAPIError {
|
||||
error: rejection.body_text().to_ascii_lowercase(),
|
||||
}),
|
||||
)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Deref for Path<T> {
|
||||
type Target = T;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> DerefMut for Path<T> {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
&mut self.0
|
||||
}
|
||||
}
|
||||
@@ -1,48 +0,0 @@
|
||||
use std::ops::{Deref, DerefMut};
|
||||
|
||||
use axum::{
|
||||
async_trait,
|
||||
extract::{rejection::QueryRejection, FromRequestParts},
|
||||
};
|
||||
use compute_api::responses::GenericAPIError;
|
||||
use http::{request::Parts, StatusCode};
|
||||
|
||||
/// Custom `Query` extractor, so that we can format errors into
|
||||
/// `JsonResponse<GenericAPIError>`.
|
||||
#[derive(Debug, Clone, Copy, Default)]
|
||||
pub(crate) struct Query<T>(pub T);
|
||||
|
||||
#[async_trait]
|
||||
impl<S, T> FromRequestParts<S> for Query<T>
|
||||
where
|
||||
axum::extract::Query<T>: FromRequestParts<S, Rejection = QueryRejection>,
|
||||
S: Send + Sync,
|
||||
{
|
||||
type Rejection = (StatusCode, axum::Json<GenericAPIError>);
|
||||
|
||||
async fn from_request_parts(parts: &mut Parts, state: &S) -> Result<Self, Self::Rejection> {
|
||||
match axum::extract::Query::<T>::from_request_parts(parts, state).await {
|
||||
Ok(value) => Ok(Self(value.0)),
|
||||
Err(rejection) => Err((
|
||||
rejection.status(),
|
||||
axum::Json(GenericAPIError {
|
||||
error: rejection.body_text().to_ascii_lowercase(),
|
||||
}),
|
||||
)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Deref for Query<T> {
|
||||
type Target = T;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> DerefMut for Query<T> {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
&mut self.0
|
||||
}
|
||||
}
|
||||
@@ -1,56 +1 @@
|
||||
use axum::{body::Body, response::Response};
|
||||
use compute_api::responses::{ComputeStatus, GenericAPIError};
|
||||
use http::{header::CONTENT_TYPE, StatusCode};
|
||||
use serde::Serialize;
|
||||
use tracing::error;
|
||||
|
||||
pub use server::launch_http_server;
|
||||
|
||||
mod extract;
|
||||
mod routes;
|
||||
mod server;
|
||||
|
||||
/// Convenience response builder for JSON responses
|
||||
struct JsonResponse;
|
||||
|
||||
impl JsonResponse {
|
||||
/// Helper for actually creating a response
|
||||
fn create_response(code: StatusCode, body: impl Serialize) -> Response {
|
||||
Response::builder()
|
||||
.status(code)
|
||||
.header(CONTENT_TYPE.as_str(), "application/json")
|
||||
.body(Body::from(serde_json::to_string(&body).unwrap()))
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
/// Create a successful error response
|
||||
pub(self) fn success(code: StatusCode, body: impl Serialize) -> Response {
|
||||
assert!({
|
||||
let code = code.as_u16();
|
||||
|
||||
(200..300).contains(&code)
|
||||
});
|
||||
|
||||
Self::create_response(code, body)
|
||||
}
|
||||
|
||||
/// Create an error response
|
||||
pub(self) fn error(code: StatusCode, error: impl ToString) -> Response {
|
||||
assert!(code.as_u16() >= 400);
|
||||
|
||||
let message = error.to_string();
|
||||
error!(message);
|
||||
|
||||
Self::create_response(code, &GenericAPIError { error: message })
|
||||
}
|
||||
|
||||
/// Create an error response related to the compute being in an invalid state
|
||||
pub(self) fn invalid_status(status: ComputeStatus) -> Response {
|
||||
Self::create_response(
|
||||
StatusCode::PRECONDITION_FAILED,
|
||||
&GenericAPIError {
|
||||
error: format!("invalid compute status: {status}"),
|
||||
},
|
||||
)
|
||||
}
|
||||
}
|
||||
pub mod api;
|
||||
|
||||
@@ -37,7 +37,7 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ComputeMetrics"
|
||||
|
||||
/metrics:
|
||||
/metrics
|
||||
get:
|
||||
tags:
|
||||
- Info
|
||||
|
||||
@@ -1,20 +0,0 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use axum::{extract::State, response::Response};
|
||||
use compute_api::responses::ComputeStatus;
|
||||
use http::StatusCode;
|
||||
|
||||
use crate::{checker::check_writability, compute::ComputeNode, http::JsonResponse};
|
||||
|
||||
/// Check that the compute is currently running.
|
||||
pub(in crate::http) async fn is_writable(State(compute): State<Arc<ComputeNode>>) -> Response {
|
||||
let status = compute.get_status();
|
||||
if status != ComputeStatus::Running {
|
||||
return JsonResponse::invalid_status(status);
|
||||
}
|
||||
|
||||
match check_writability(&compute).await {
|
||||
Ok(_) => JsonResponse::success(StatusCode::OK, true),
|
||||
Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e),
|
||||
}
|
||||
}
|
||||
@@ -1,91 +0,0 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use axum::{extract::State, response::Response};
|
||||
use compute_api::{
|
||||
requests::ConfigurationRequest,
|
||||
responses::{ComputeStatus, ComputeStatusResponse},
|
||||
};
|
||||
use http::StatusCode;
|
||||
use tokio::task;
|
||||
use tracing::info;
|
||||
|
||||
use crate::{
|
||||
compute::{ComputeNode, ParsedSpec},
|
||||
http::{extract::Json, JsonResponse},
|
||||
};
|
||||
|
||||
// Accept spec in JSON format and request compute configuration. If anything
|
||||
// goes wrong after we set the compute status to `ConfigurationPending` and
|
||||
// update compute state with new spec, we basically leave compute in the
|
||||
// potentially wrong state. That said, it's control-plane's responsibility to
|
||||
// watch compute state after reconfiguration request and to clean restart in
|
||||
// case of errors.
|
||||
pub(in crate::http) async fn configure(
|
||||
State(compute): State<Arc<ComputeNode>>,
|
||||
request: Json<ConfigurationRequest>,
|
||||
) -> Response {
|
||||
if !compute.live_config_allowed {
|
||||
return JsonResponse::error(
|
||||
StatusCode::PRECONDITION_FAILED,
|
||||
"live configuration is not allowed for this compute node".to_string(),
|
||||
);
|
||||
}
|
||||
|
||||
let pspec = match ParsedSpec::try_from(request.spec.clone()) {
|
||||
Ok(p) => p,
|
||||
Err(e) => return JsonResponse::error(StatusCode::BAD_REQUEST, e),
|
||||
};
|
||||
|
||||
// XXX: wrap state update under lock in a code block. Otherwise, we will try
|
||||
// to `Send` `mut state` into the spawned thread bellow, which will cause
|
||||
// the following rustc error:
|
||||
//
|
||||
// error: future cannot be sent between threads safely
|
||||
{
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
if !matches!(state.status, ComputeStatus::Empty | ComputeStatus::Running) {
|
||||
return JsonResponse::invalid_status(state.status);
|
||||
}
|
||||
|
||||
state.pspec = Some(pspec);
|
||||
state.set_status(ComputeStatus::ConfigurationPending, &compute.state_changed);
|
||||
drop(state);
|
||||
}
|
||||
|
||||
// Spawn a blocking thread to wait for compute to become Running. This is
|
||||
// needed to do not block the main pool of workers and be able to serve
|
||||
// other requests while some particular request is waiting for compute to
|
||||
// finish configuration.
|
||||
let c = compute.clone();
|
||||
let completed = task::spawn_blocking(move || {
|
||||
let mut state = c.state.lock().unwrap();
|
||||
while state.status != ComputeStatus::Running {
|
||||
state = c.state_changed.wait(state).unwrap();
|
||||
info!(
|
||||
"waiting for compute to become {}, current status: {}",
|
||||
ComputeStatus::Running,
|
||||
state.status
|
||||
);
|
||||
|
||||
if state.status == ComputeStatus::Failed {
|
||||
let err = state.error.as_ref().map_or("unknown error", |x| x);
|
||||
let msg = format!("compute configuration failed: {:?}", err);
|
||||
return Err(msg);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
if let Err(e) = completed {
|
||||
return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e);
|
||||
}
|
||||
|
||||
// Return current compute state if everything went well.
|
||||
let state = compute.state.lock().unwrap().clone();
|
||||
let body = ComputeStatusResponse::from(&state);
|
||||
|
||||
JsonResponse::success(StatusCode::OK, body)
|
||||
}
|
||||
@@ -1,34 +0,0 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use axum::{body::Body, extract::State, response::Response};
|
||||
use http::{header::CONTENT_TYPE, StatusCode};
|
||||
use serde::Deserialize;
|
||||
|
||||
use crate::{
|
||||
catalog::{get_database_schema, SchemaDumpError},
|
||||
compute::ComputeNode,
|
||||
http::{extract::Query, JsonResponse},
|
||||
};
|
||||
|
||||
#[derive(Debug, Clone, Deserialize)]
|
||||
pub(in crate::http) struct DatabaseSchemaParams {
|
||||
database: String,
|
||||
}
|
||||
|
||||
/// Get a schema dump of the requested database.
|
||||
pub(in crate::http) async fn get_schema_dump(
|
||||
params: Query<DatabaseSchemaParams>,
|
||||
State(compute): State<Arc<ComputeNode>>,
|
||||
) -> Response {
|
||||
match get_database_schema(&compute, ¶ms.database).await {
|
||||
Ok(schema) => Response::builder()
|
||||
.status(StatusCode::OK)
|
||||
.header(CONTENT_TYPE.as_str(), "application/json")
|
||||
.body(Body::from_stream(schema))
|
||||
.unwrap(),
|
||||
Err(SchemaDumpError::DatabaseDoesNotExist) => {
|
||||
JsonResponse::error(StatusCode::NOT_FOUND, SchemaDumpError::DatabaseDoesNotExist)
|
||||
}
|
||||
Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e),
|
||||
}
|
||||
}
|
||||
@@ -1,16 +0,0 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use axum::{extract::State, response::Response};
|
||||
use http::StatusCode;
|
||||
|
||||
use crate::{catalog::get_dbs_and_roles, compute::ComputeNode, http::JsonResponse};
|
||||
|
||||
/// Get the databases and roles from the compute.
|
||||
pub(in crate::http) async fn get_catalog_objects(
|
||||
State(compute): State<Arc<ComputeNode>>,
|
||||
) -> Response {
|
||||
match get_dbs_and_roles(&compute).await {
|
||||
Ok(catalog_objects) => JsonResponse::success(StatusCode::OK, catalog_objects),
|
||||
Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e),
|
||||
}
|
||||
}
|
||||
@@ -1,67 +0,0 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use axum::{
|
||||
extract::State,
|
||||
response::{IntoResponse, Response},
|
||||
};
|
||||
use http::StatusCode;
|
||||
use serde::Deserialize;
|
||||
|
||||
use crate::{
|
||||
compute::ComputeNode,
|
||||
http::{
|
||||
extract::{Path, Query},
|
||||
JsonResponse,
|
||||
},
|
||||
};
|
||||
|
||||
#[derive(Debug, Clone, Deserialize)]
|
||||
pub(in crate::http) struct ExtensionServerParams {
|
||||
is_library: Option<bool>,
|
||||
}
|
||||
|
||||
/// Download a remote extension.
|
||||
pub(in crate::http) async fn download_extension(
|
||||
Path(filename): Path<String>,
|
||||
params: Query<ExtensionServerParams>,
|
||||
State(compute): State<Arc<ComputeNode>>,
|
||||
) -> Response {
|
||||
// Don't even try to download extensions if no remote storage is configured
|
||||
if compute.ext_remote_storage.is_none() {
|
||||
return JsonResponse::error(
|
||||
StatusCode::PRECONDITION_FAILED,
|
||||
"remote storage is not configured",
|
||||
);
|
||||
}
|
||||
|
||||
let ext = {
|
||||
let state = compute.state.lock().unwrap();
|
||||
let pspec = state.pspec.as_ref().unwrap();
|
||||
let spec = &pspec.spec;
|
||||
|
||||
let remote_extensions = match spec.remote_extensions.as_ref() {
|
||||
Some(r) => r,
|
||||
None => {
|
||||
return JsonResponse::error(
|
||||
StatusCode::CONFLICT,
|
||||
"information about remote extensions is unavailable",
|
||||
);
|
||||
}
|
||||
};
|
||||
|
||||
remote_extensions.get_ext(
|
||||
&filename,
|
||||
params.is_library.unwrap_or(false),
|
||||
&compute.build_tag,
|
||||
&compute.pgversion,
|
||||
)
|
||||
};
|
||||
|
||||
match ext {
|
||||
Ok((ext_name, ext_path)) => match compute.download_extension(ext_name, ext_path).await {
|
||||
Ok(_) => StatusCode::OK.into_response(),
|
||||
Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e),
|
||||
},
|
||||
Err(e) => JsonResponse::error(StatusCode::NOT_FOUND, e),
|
||||
}
|
||||
}
|
||||
@@ -1,45 +0,0 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use axum::{extract::State, response::Response};
|
||||
use compute_api::{
|
||||
requests::ExtensionInstallRequest,
|
||||
responses::{ComputeStatus, ExtensionInstallResponse},
|
||||
};
|
||||
use http::StatusCode;
|
||||
|
||||
use crate::{
|
||||
compute::ComputeNode,
|
||||
http::{extract::Json, JsonResponse},
|
||||
};
|
||||
|
||||
/// Install a extension.
|
||||
pub(in crate::http) async fn install_extension(
|
||||
State(compute): State<Arc<ComputeNode>>,
|
||||
request: Json<ExtensionInstallRequest>,
|
||||
) -> Response {
|
||||
let status = compute.get_status();
|
||||
if status != ComputeStatus::Running {
|
||||
return JsonResponse::invalid_status(status);
|
||||
}
|
||||
|
||||
match compute
|
||||
.install_extension(
|
||||
&request.extension,
|
||||
&request.database,
|
||||
request.version.to_string(),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(version) => JsonResponse::success(
|
||||
StatusCode::CREATED,
|
||||
Some(ExtensionInstallResponse {
|
||||
extension: request.extension.clone(),
|
||||
version,
|
||||
}),
|
||||
),
|
||||
Err(e) => JsonResponse::error(
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
format!("failed to install extension: {e}"),
|
||||
),
|
||||
}
|
||||
}
|
||||
@@ -1,35 +0,0 @@
|
||||
use axum::response::{IntoResponse, Response};
|
||||
use http::StatusCode;
|
||||
use tracing::info;
|
||||
use utils::failpoint_support::{apply_failpoint, ConfigureFailpointsRequest};
|
||||
|
||||
use crate::http::{extract::Json, JsonResponse};
|
||||
|
||||
/// Configure failpoints for testing purposes.
|
||||
pub(in crate::http) async fn configure_failpoints(
|
||||
failpoints: Json<ConfigureFailpointsRequest>,
|
||||
) -> Response {
|
||||
if !fail::has_failpoints() {
|
||||
return JsonResponse::error(
|
||||
StatusCode::PRECONDITION_FAILED,
|
||||
"Cannot manage failpoints because neon was compiled without failpoints support",
|
||||
);
|
||||
}
|
||||
|
||||
for fp in &*failpoints {
|
||||
info!("cfg failpoint: {} {}", fp.name, fp.actions);
|
||||
|
||||
// We recognize one extra "action" that's not natively recognized
|
||||
// by the failpoints crate: exit, to immediately kill the process
|
||||
let cfg_result = apply_failpoint(&fp.name, &fp.actions);
|
||||
|
||||
if let Err(e) = cfg_result {
|
||||
return JsonResponse::error(
|
||||
StatusCode::BAD_REQUEST,
|
||||
format!("failed to configure failpoints: {e}"),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
StatusCode::OK.into_response()
|
||||
}
|
||||
@@ -1,48 +0,0 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use axum::{extract::State, response::Response};
|
||||
use compute_api::{
|
||||
requests::SetRoleGrantsRequest,
|
||||
responses::{ComputeStatus, SetRoleGrantsResponse},
|
||||
};
|
||||
use http::StatusCode;
|
||||
|
||||
use crate::{
|
||||
compute::ComputeNode,
|
||||
http::{extract::Json, JsonResponse},
|
||||
};
|
||||
|
||||
/// Add grants for a role.
|
||||
pub(in crate::http) async fn add_grant(
|
||||
State(compute): State<Arc<ComputeNode>>,
|
||||
request: Json<SetRoleGrantsRequest>,
|
||||
) -> Response {
|
||||
let status = compute.get_status();
|
||||
if status != ComputeStatus::Running {
|
||||
return JsonResponse::invalid_status(status);
|
||||
}
|
||||
|
||||
match compute
|
||||
.set_role_grants(
|
||||
&request.database,
|
||||
&request.schema,
|
||||
&request.privileges,
|
||||
&request.role,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(()) => JsonResponse::success(
|
||||
StatusCode::CREATED,
|
||||
Some(SetRoleGrantsResponse {
|
||||
database: request.database.clone(),
|
||||
schema: request.schema.clone(),
|
||||
role: request.role.clone(),
|
||||
privileges: request.privileges.clone(),
|
||||
}),
|
||||
),
|
||||
Err(e) => JsonResponse::error(
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
format!("failed to grant role privileges to the schema: {e}"),
|
||||
),
|
||||
}
|
||||
}
|
||||
@@ -1,11 +0,0 @@
|
||||
use axum::response::Response;
|
||||
use compute_api::responses::InfoResponse;
|
||||
use http::StatusCode;
|
||||
|
||||
use crate::http::JsonResponse;
|
||||
|
||||
/// Get information about the physical characteristics about the compute.
|
||||
pub(in crate::http) async fn get_info() -> Response {
|
||||
let num_cpus = num_cpus::get_physical();
|
||||
JsonResponse::success(StatusCode::OK, &InfoResponse { num_cpus })
|
||||
}
|
||||
@@ -1,18 +0,0 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use axum::{extract::State, response::Response};
|
||||
use compute_api::responses::ComputeStatus;
|
||||
use http::StatusCode;
|
||||
|
||||
use crate::{compute::ComputeNode, http::JsonResponse};
|
||||
|
||||
/// Collect current Postgres usage insights.
|
||||
pub(in crate::http) async fn get_insights(State(compute): State<Arc<ComputeNode>>) -> Response {
|
||||
let status = compute.get_status();
|
||||
if status != ComputeStatus::Running {
|
||||
return JsonResponse::invalid_status(status);
|
||||
}
|
||||
|
||||
let insights = compute.collect_insights().await;
|
||||
JsonResponse::success(StatusCode::OK, insights)
|
||||
}
|
||||
@@ -1,33 +0,0 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use axum::{extract::State, response::Response};
|
||||
use compute_api::responses::ComputeStatus;
|
||||
use http::StatusCode;
|
||||
use tokio::task;
|
||||
|
||||
use crate::{compute::ComputeNode, http::JsonResponse, installed_extensions};
|
||||
|
||||
/// Get a list of installed extensions.
|
||||
pub(in crate::http) async fn get_installed_extensions(
|
||||
State(compute): State<Arc<ComputeNode>>,
|
||||
) -> Response {
|
||||
let status = compute.get_status();
|
||||
if status != ComputeStatus::Running {
|
||||
return JsonResponse::invalid_status(status);
|
||||
}
|
||||
|
||||
let conf = compute.get_conn_conf(None);
|
||||
let res = task::spawn_blocking(move || installed_extensions::get_installed_extensions(conf))
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
match res {
|
||||
Ok(installed_extensions) => {
|
||||
JsonResponse::success(StatusCode::OK, Some(installed_extensions))
|
||||
}
|
||||
Err(e) => JsonResponse::error(
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
format!("failed to get list of installed extensions: {e}"),
|
||||
),
|
||||
}
|
||||
}
|
||||
@@ -1,32 +0,0 @@
|
||||
use axum::{body::Body, response::Response};
|
||||
use http::header::CONTENT_TYPE;
|
||||
use http::StatusCode;
|
||||
use metrics::proto::MetricFamily;
|
||||
use metrics::Encoder;
|
||||
use metrics::TextEncoder;
|
||||
|
||||
use crate::{http::JsonResponse, installed_extensions};
|
||||
|
||||
/// Expose Prometheus metrics.
|
||||
pub(in crate::http) async fn get_metrics() -> Response {
|
||||
// When we call TextEncoder::encode() below, it will immediately return an
|
||||
// error if a metric family has no metrics, so we need to preemptively
|
||||
// filter out metric families with no metrics.
|
||||
let metrics = installed_extensions::collect()
|
||||
.into_iter()
|
||||
.filter(|m| !m.get_metric().is_empty())
|
||||
.collect::<Vec<MetricFamily>>();
|
||||
|
||||
let encoder = TextEncoder::new();
|
||||
let mut buffer = vec![];
|
||||
|
||||
if let Err(e) = encoder.encode(&metrics, &mut buffer) {
|
||||
return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e);
|
||||
}
|
||||
|
||||
Response::builder()
|
||||
.status(StatusCode::OK)
|
||||
.header(CONTENT_TYPE, encoder.format_type())
|
||||
.body(Body::from(buffer))
|
||||
.unwrap()
|
||||
}
|
||||
@@ -1,12 +0,0 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use axum::{extract::State, response::Response};
|
||||
use http::StatusCode;
|
||||
|
||||
use crate::{compute::ComputeNode, http::JsonResponse};
|
||||
|
||||
/// Get startup metrics.
|
||||
pub(in crate::http) async fn get_metrics(State(compute): State<Arc<ComputeNode>>) -> Response {
|
||||
let metrics = compute.state.lock().unwrap().metrics.clone();
|
||||
JsonResponse::success(StatusCode::OK, metrics)
|
||||
}
|
||||
@@ -1,38 +0,0 @@
|
||||
use compute_api::responses::ComputeStatusResponse;
|
||||
|
||||
use crate::compute::ComputeState;
|
||||
|
||||
pub(in crate::http) mod check_writability;
|
||||
pub(in crate::http) mod configure;
|
||||
pub(in crate::http) mod database_schema;
|
||||
pub(in crate::http) mod dbs_and_roles;
|
||||
pub(in crate::http) mod extension_server;
|
||||
pub(in crate::http) mod extensions;
|
||||
pub(in crate::http) mod failpoints;
|
||||
pub(in crate::http) mod grants;
|
||||
pub(in crate::http) mod info;
|
||||
pub(in crate::http) mod insights;
|
||||
pub(in crate::http) mod installed_extensions;
|
||||
pub(in crate::http) mod metrics;
|
||||
pub(in crate::http) mod metrics_json;
|
||||
pub(in crate::http) mod status;
|
||||
pub(in crate::http) mod terminate;
|
||||
|
||||
impl From<&ComputeState> for ComputeStatusResponse {
|
||||
fn from(state: &ComputeState) -> Self {
|
||||
ComputeStatusResponse {
|
||||
start_time: state.start_time,
|
||||
tenant: state
|
||||
.pspec
|
||||
.as_ref()
|
||||
.map(|pspec| pspec.tenant_id.to_string()),
|
||||
timeline: state
|
||||
.pspec
|
||||
.as_ref()
|
||||
.map(|pspec| pspec.timeline_id.to_string()),
|
||||
status: state.status,
|
||||
last_active: state.last_active,
|
||||
error: state.error.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,14 +0,0 @@
|
||||
use std::{ops::Deref, sync::Arc};
|
||||
|
||||
use axum::{extract::State, http::StatusCode, response::Response};
|
||||
use compute_api::responses::ComputeStatusResponse;
|
||||
|
||||
use crate::{compute::ComputeNode, http::JsonResponse};
|
||||
|
||||
/// Retrieve the state of the comute.
|
||||
pub(in crate::http) async fn get_status(State(compute): State<Arc<ComputeNode>>) -> Response {
|
||||
let state = compute.state.lock().unwrap();
|
||||
let body = ComputeStatusResponse::from(state.deref());
|
||||
|
||||
JsonResponse::success(StatusCode::OK, body)
|
||||
}
|
||||
@@ -1,58 +0,0 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use axum::{
|
||||
extract::State,
|
||||
response::{IntoResponse, Response},
|
||||
};
|
||||
use compute_api::responses::ComputeStatus;
|
||||
use http::StatusCode;
|
||||
use tokio::task;
|
||||
use tracing::info;
|
||||
|
||||
use crate::{
|
||||
compute::{forward_termination_signal, ComputeNode},
|
||||
http::JsonResponse,
|
||||
};
|
||||
|
||||
/// Terminate the compute.
|
||||
pub(in crate::http) async fn terminate(State(compute): State<Arc<ComputeNode>>) -> Response {
|
||||
{
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
if state.status == ComputeStatus::Terminated {
|
||||
return StatusCode::CREATED.into_response();
|
||||
}
|
||||
|
||||
if !matches!(state.status, ComputeStatus::Empty | ComputeStatus::Running) {
|
||||
return JsonResponse::invalid_status(state.status);
|
||||
}
|
||||
|
||||
state.set_status(ComputeStatus::TerminationPending, &compute.state_changed);
|
||||
drop(state);
|
||||
}
|
||||
|
||||
forward_termination_signal();
|
||||
info!("sent signal and notified waiters");
|
||||
|
||||
// Spawn a blocking thread to wait for compute to become Terminated.
|
||||
// This is needed to do not block the main pool of workers and
|
||||
// be able to serve other requests while some particular request
|
||||
// is waiting for compute to finish configuration.
|
||||
let c = compute.clone();
|
||||
task::spawn_blocking(move || {
|
||||
let mut state = c.state.lock().unwrap();
|
||||
while state.status != ComputeStatus::Terminated {
|
||||
state = c.state_changed.wait(state).unwrap();
|
||||
info!(
|
||||
"waiting for compute to become {}, current status: {:?}",
|
||||
ComputeStatus::Terminated,
|
||||
state.status
|
||||
);
|
||||
}
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
info!("terminated Postgres");
|
||||
|
||||
StatusCode::OK.into_response()
|
||||
}
|
||||
@@ -1,165 +0,0 @@
|
||||
use std::{
|
||||
net::{IpAddr, Ipv6Addr, SocketAddr},
|
||||
sync::{
|
||||
atomic::{AtomicU64, Ordering},
|
||||
Arc,
|
||||
},
|
||||
thread,
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use anyhow::Result;
|
||||
use axum::{
|
||||
response::{IntoResponse, Response},
|
||||
routing::{get, post},
|
||||
Router,
|
||||
};
|
||||
use http::StatusCode;
|
||||
use tokio::net::TcpListener;
|
||||
use tower::ServiceBuilder;
|
||||
use tower_http::{
|
||||
request_id::{MakeRequestId, PropagateRequestIdLayer, RequestId, SetRequestIdLayer},
|
||||
trace::TraceLayer,
|
||||
};
|
||||
use tracing::{debug, error, info, Span};
|
||||
|
||||
use super::routes::{
|
||||
check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
|
||||
grants, info as info_route, insights, installed_extensions, metrics, metrics_json, status,
|
||||
terminate,
|
||||
};
|
||||
use crate::compute::ComputeNode;
|
||||
|
||||
async fn handle_404() -> Response {
|
||||
StatusCode::NOT_FOUND.into_response()
|
||||
}
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
struct ComputeMakeRequestId(Arc<AtomicU64>);
|
||||
|
||||
impl MakeRequestId for ComputeMakeRequestId {
|
||||
fn make_request_id<B>(
|
||||
&mut self,
|
||||
_request: &http::Request<B>,
|
||||
) -> Option<tower_http::request_id::RequestId> {
|
||||
let request_id = self
|
||||
.0
|
||||
.fetch_add(1, Ordering::SeqCst)
|
||||
.to_string()
|
||||
.parse()
|
||||
.unwrap();
|
||||
|
||||
Some(RequestId::new(request_id))
|
||||
}
|
||||
}
|
||||
|
||||
/// Run the HTTP server and wait on it forever.
|
||||
#[tokio::main]
|
||||
async fn serve(port: u16, compute: Arc<ComputeNode>) {
|
||||
const X_REQUEST_ID: &str = "x-request-id";
|
||||
|
||||
let mut app = Router::new()
|
||||
.route("/check_writability", post(check_writability::is_writable))
|
||||
.route("/configure", post(configure::configure))
|
||||
.route("/database_schema", get(database_schema::get_schema_dump))
|
||||
.route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
|
||||
.route(
|
||||
"/extension_server/*filename",
|
||||
post(extension_server::download_extension),
|
||||
)
|
||||
.route("/extensions", post(extensions::install_extension))
|
||||
.route("/grants", post(grants::add_grant))
|
||||
.route("/info", get(info_route::get_info))
|
||||
.route("/insights", get(insights::get_insights))
|
||||
.route(
|
||||
"/installed_extensions",
|
||||
get(installed_extensions::get_installed_extensions),
|
||||
)
|
||||
.route("/metrics", get(metrics::get_metrics))
|
||||
.route("/metrics.json", get(metrics_json::get_metrics))
|
||||
.route("/status", get(status::get_status))
|
||||
.route("/terminate", post(terminate::terminate))
|
||||
.fallback(handle_404)
|
||||
.layer(
|
||||
ServiceBuilder::new()
|
||||
.layer(SetRequestIdLayer::x_request_id(
|
||||
ComputeMakeRequestId::default(),
|
||||
))
|
||||
.layer(
|
||||
TraceLayer::new_for_http()
|
||||
.on_request(|request: &http::Request<_>, _span: &Span| {
|
||||
let request_id = request
|
||||
.headers()
|
||||
.get(X_REQUEST_ID)
|
||||
.unwrap()
|
||||
.to_str()
|
||||
.unwrap();
|
||||
|
||||
match request.uri().path() {
|
||||
"/metrics" => {
|
||||
debug!(%request_id, "{} {}", request.method(), request.uri())
|
||||
}
|
||||
_ => info!(%request_id, "{} {}", request.method(), request.uri()),
|
||||
};
|
||||
})
|
||||
.on_response(
|
||||
|response: &http::Response<_>, latency: Duration, _span: &Span| {
|
||||
let request_id = response
|
||||
.headers()
|
||||
.get(X_REQUEST_ID)
|
||||
.unwrap()
|
||||
.to_str()
|
||||
.unwrap();
|
||||
|
||||
info!(
|
||||
%request_id,
|
||||
code = response.status().as_u16(),
|
||||
latency = latency.as_millis()
|
||||
)
|
||||
},
|
||||
),
|
||||
)
|
||||
.layer(PropagateRequestIdLayer::x_request_id()),
|
||||
)
|
||||
.with_state(compute);
|
||||
|
||||
// Add in any testing support
|
||||
if cfg!(feature = "testing") {
|
||||
use super::routes::failpoints;
|
||||
|
||||
app = app.route("/failpoints", post(failpoints::configure_failpoints))
|
||||
}
|
||||
|
||||
// This usually binds to both IPv4 and IPv6 on Linux, see
|
||||
// https://github.com/rust-lang/rust/pull/34440 for more information
|
||||
let addr = SocketAddr::new(IpAddr::from(Ipv6Addr::UNSPECIFIED), port);
|
||||
let listener = match TcpListener::bind(&addr).await {
|
||||
Ok(listener) => listener,
|
||||
Err(e) => {
|
||||
error!(
|
||||
"failed to bind the compute_ctl HTTP server to port {}: {}",
|
||||
port, e
|
||||
);
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
if let Ok(local_addr) = listener.local_addr() {
|
||||
info!("compute_ctl HTTP server listening on {}", local_addr);
|
||||
} else {
|
||||
info!("compute_ctl HTTP server listening on port {}", port);
|
||||
}
|
||||
|
||||
if let Err(e) = axum::serve(listener, app).await {
|
||||
error!("compute_ctl HTTP server error: {}", e);
|
||||
}
|
||||
}
|
||||
|
||||
/// Launch a separate HTTP server thread and return its `JoinHandle`.
|
||||
pub fn launch_http_server(port: u16, state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
|
||||
let state = Arc::clone(state);
|
||||
|
||||
Ok(thread::Builder::new()
|
||||
.name("http-server".into())
|
||||
.spawn(move || serve(port, state))?)
|
||||
}
|
||||
@@ -3,6 +3,8 @@
|
||||
#![deny(unsafe_code)]
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
|
||||
extern crate hyper0 as hyper;
|
||||
|
||||
pub mod checker;
|
||||
pub mod config;
|
||||
pub mod configurator;
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use anyhow::{Context, Result};
|
||||
use fail::fail_point;
|
||||
use postgres::{Client, Transaction};
|
||||
use postgres::Client;
|
||||
use tracing::info;
|
||||
|
||||
/// Runs a series of migrations on a target database
|
||||
@@ -20,9 +20,11 @@ impl<'m> MigrationRunner<'m> {
|
||||
|
||||
/// Get the current value neon_migration.migration_id
|
||||
fn get_migration_id(&mut self) -> Result<i64> {
|
||||
let query = "SELECT id FROM neon_migration.migration_id";
|
||||
let row = self
|
||||
.client
|
||||
.query_one("SELECT id FROM neon_migration.migration_id", &[])?;
|
||||
.query_one(query, &[])
|
||||
.context("run_migrations get migration_id")?;
|
||||
|
||||
Ok(row.get::<&str, i64>("id"))
|
||||
}
|
||||
@@ -32,7 +34,7 @@ impl<'m> MigrationRunner<'m> {
|
||||
/// This function has a fail point called compute-migration, which can be
|
||||
/// used if you would like to fail the application of a series of migrations
|
||||
/// at some point.
|
||||
fn update_migration_id(txn: &mut Transaction, migration_id: i64) -> Result<()> {
|
||||
fn update_migration_id(&mut self, migration_id: i64) -> Result<()> {
|
||||
// We use this fail point in order to check that failing in the
|
||||
// middle of applying a series of migrations fails in an expected
|
||||
// manner
|
||||
@@ -53,11 +55,12 @@ impl<'m> MigrationRunner<'m> {
|
||||
}
|
||||
}
|
||||
|
||||
txn.query(
|
||||
"UPDATE neon_migration.migration_id SET id = $1",
|
||||
&[&migration_id],
|
||||
)
|
||||
.with_context(|| format!("update neon_migration.migration_id to {migration_id}"))?;
|
||||
self.client
|
||||
.query(
|
||||
"UPDATE neon_migration.migration_id SET id = $1",
|
||||
&[&migration_id],
|
||||
)
|
||||
.context("run_migrations update id")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -78,50 +81,53 @@ impl<'m> MigrationRunner<'m> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Run an individual migration
|
||||
fn run_migration(txn: &mut Transaction, migration_id: i64, migration: &str) -> Result<()> {
|
||||
if migration.starts_with("-- SKIP") {
|
||||
info!("Skipping migration id={}", migration_id);
|
||||
|
||||
// Even though we are skipping the migration, updating the
|
||||
// migration ID should help keep logic easy to understand when
|
||||
// trying to understand the state of a cluster.
|
||||
Self::update_migration_id(txn, migration_id)?;
|
||||
} else {
|
||||
info!("Running migration id={}:\n{}\n", migration_id, migration);
|
||||
|
||||
txn.simple_query(migration)
|
||||
.with_context(|| format!("apply migration {migration_id}"))?;
|
||||
|
||||
Self::update_migration_id(txn, migration_id)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Run the configured set of migrations
|
||||
/// Run the configrured set of migrations
|
||||
pub fn run_migrations(mut self) -> Result<()> {
|
||||
self.prepare_database()
|
||||
.context("prepare database to handle migrations")?;
|
||||
self.prepare_database()?;
|
||||
|
||||
let mut current_migration = self.get_migration_id()? as usize;
|
||||
while current_migration < self.migrations.len() {
|
||||
// The index lags the migration ID by 1, so the current migration
|
||||
// ID is also the next index
|
||||
let migration_id = (current_migration + 1) as i64;
|
||||
macro_rules! migration_id {
|
||||
($cm:expr) => {
|
||||
($cm + 1) as i64
|
||||
};
|
||||
}
|
||||
|
||||
let mut txn = self
|
||||
.client
|
||||
.transaction()
|
||||
.with_context(|| format!("begin transaction for migration {migration_id}"))?;
|
||||
let migration = self.migrations[current_migration];
|
||||
|
||||
Self::run_migration(&mut txn, migration_id, self.migrations[current_migration])
|
||||
.with_context(|| format!("running migration {migration_id}"))?;
|
||||
if migration.starts_with("-- SKIP") {
|
||||
info!("Skipping migration id={}", migration_id!(current_migration));
|
||||
|
||||
txn.commit()
|
||||
.with_context(|| format!("commit transaction for migration {migration_id}"))?;
|
||||
// Even though we are skipping the migration, updating the
|
||||
// migration ID should help keep logic easy to understand when
|
||||
// trying to understand the state of a cluster.
|
||||
self.update_migration_id(migration_id!(current_migration))?;
|
||||
} else {
|
||||
info!(
|
||||
"Running migration id={}:\n{}\n",
|
||||
migration_id!(current_migration),
|
||||
migration
|
||||
);
|
||||
|
||||
info!("Finished migration id={}", migration_id);
|
||||
self.client
|
||||
.simple_query("BEGIN")
|
||||
.context("begin migration")?;
|
||||
|
||||
self.client.simple_query(migration).with_context(|| {
|
||||
format!(
|
||||
"run_migrations migration id={}",
|
||||
migration_id!(current_migration)
|
||||
)
|
||||
})?;
|
||||
|
||||
self.update_migration_id(migration_id!(current_migration))?;
|
||||
|
||||
self.client
|
||||
.simple_query("COMMIT")
|
||||
.context("commit migration")?;
|
||||
|
||||
info!("Finished migration id={}", migration_id!(current_migration));
|
||||
}
|
||||
|
||||
current_migration += 1;
|
||||
}
|
||||
|
||||
@@ -75,7 +75,7 @@ pub struct MutableApplyContext {
|
||||
pub dbs: HashMap<String, Database>,
|
||||
}
|
||||
|
||||
/// Apply the operations that belong to the given spec apply phase.
|
||||
/// Appply the operations that belong to the given spec apply phase.
|
||||
///
|
||||
/// Commands within a single phase are executed in order of Iterator yield.
|
||||
/// Commands of ApplySpecPhase::RunInEachDatabase will execute in the database
|
||||
@@ -498,19 +498,7 @@ async fn get_operations<'a>(
|
||||
),
|
||||
comment: None,
|
||||
},
|
||||
// Revoke some potentially blocking privileges (Neon-specific currently)
|
||||
Operation {
|
||||
query: format!(
|
||||
include_str!("sql/pre_drop_role_revoke_privileges.sql"),
|
||||
role_name = quoted,
|
||||
),
|
||||
comment: None,
|
||||
},
|
||||
// This now will only drop privileges of the role
|
||||
// TODO: this is obviously not 100% true because of the above case,
|
||||
// there could be still some privileges that are not revoked. Maybe this
|
||||
// only drops privileges that were granted *by this* role, not *to this* role,
|
||||
// but this has to be checked.
|
||||
Operation {
|
||||
query: format!("DROP OWNED BY {}", quoted),
|
||||
comment: None,
|
||||
|
||||
@@ -1,28 +0,0 @@
|
||||
SET SESSION ROLE neon_superuser;
|
||||
|
||||
DO $$
|
||||
DECLARE
|
||||
schema TEXT;
|
||||
revoke_query TEXT;
|
||||
BEGIN
|
||||
FOR schema IN
|
||||
SELECT schema_name
|
||||
FROM information_schema.schemata
|
||||
-- So far, we only had issues with 'public' schema. Probably, because we do some additional grants,
|
||||
-- e.g., make DB owner the owner of 'public' schema automatically (when created via API).
|
||||
-- See https://github.com/neondatabase/cloud/issues/13582 for the context.
|
||||
-- Still, keep the loop because i) it efficiently handles the case when there is no 'public' schema,
|
||||
-- ii) it's easy to add more schemas to the list if needed.
|
||||
WHERE schema_name IN ('public')
|
||||
LOOP
|
||||
revoke_query := format(
|
||||
'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM {role_name} GRANTED BY neon_superuser;',
|
||||
schema
|
||||
);
|
||||
|
||||
EXECUTE revoke_query;
|
||||
END LOOP;
|
||||
END;
|
||||
$$;
|
||||
|
||||
RESET ROLE;
|
||||
@@ -62,7 +62,7 @@ use crate::local_env::LocalEnv;
|
||||
use crate::postgresql_conf::PostgresConf;
|
||||
use crate::storage_controller::StorageController;
|
||||
|
||||
use compute_api::responses::{ComputeStatus, ComputeStatusResponse};
|
||||
use compute_api::responses::{ComputeState, ComputeStatus};
|
||||
use compute_api::spec::{Cluster, ComputeFeature, ComputeMode, ComputeSpec};
|
||||
|
||||
// contents of a endpoint.json file
|
||||
@@ -739,7 +739,7 @@ impl Endpoint {
|
||||
}
|
||||
|
||||
// Call the /status HTTP API
|
||||
pub async fn get_status(&self) -> Result<ComputeStatusResponse> {
|
||||
pub async fn get_status(&self) -> Result<ComputeState> {
|
||||
let client = reqwest::Client::new();
|
||||
|
||||
let response = client
|
||||
|
||||
@@ -483,6 +483,7 @@ impl LocalEnv {
|
||||
.iter()
|
||||
.find(|(mapped_tenant_id, _)| mapped_tenant_id == &tenant_id)
|
||||
.map(|&(_, timeline_id)| timeline_id)
|
||||
.map(TimelineId::from)
|
||||
}
|
||||
|
||||
pub fn timeline_name_mappings(&self) -> HashMap<TenantTimelineId, String> {
|
||||
|
||||
@@ -822,7 +822,10 @@ impl StorageController {
|
||||
self.dispatch(
|
||||
Method::PUT,
|
||||
format!("control/v1/tenant/{tenant_shard_id}/migrate"),
|
||||
Some(TenantShardMigrateRequest { node_id }),
|
||||
Some(TenantShardMigrateRequest {
|
||||
tenant_shard_id,
|
||||
node_id,
|
||||
}),
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -1,16 +1,12 @@
|
||||
use futures::StreamExt;
|
||||
use std::{
|
||||
collections::{HashMap, HashSet},
|
||||
str::FromStr,
|
||||
time::Duration,
|
||||
};
|
||||
use std::{str::FromStr, time::Duration};
|
||||
|
||||
use clap::{Parser, Subcommand};
|
||||
use pageserver_api::{
|
||||
controller_api::{
|
||||
AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse,
|
||||
SafekeeperDescribeResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest,
|
||||
TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
|
||||
SafekeeperDescribeResponse, ShardSchedulingPolicy, TenantCreateRequest,
|
||||
TenantDescribeResponse, TenantPolicyRequest,
|
||||
},
|
||||
models::{
|
||||
EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
|
||||
@@ -116,13 +112,6 @@ enum Command {
|
||||
#[arg(long)]
|
||||
node: NodeId,
|
||||
},
|
||||
/// Migrate the secondary location for a tenant shard to a specific pageserver.
|
||||
TenantShardMigrateSecondary {
|
||||
#[arg(long)]
|
||||
tenant_shard_id: TenantShardId,
|
||||
#[arg(long)]
|
||||
node: NodeId,
|
||||
},
|
||||
/// Cancel any ongoing reconciliation for this shard
|
||||
TenantShardCancelReconcile {
|
||||
#[arg(long)]
|
||||
@@ -157,12 +146,6 @@ enum Command {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
},
|
||||
TenantSetPreferredAz {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
#[arg(long)]
|
||||
preferred_az: Option<String>,
|
||||
},
|
||||
/// Uncleanly drop a tenant from the storage controller: this doesn't delete anything from pageservers. Appropriate
|
||||
/// if you e.g. used `tenant-warmup` by mistake on a tenant ID that doesn't really exist, or is in some other region.
|
||||
TenantDrop {
|
||||
@@ -412,12 +395,11 @@ async fn main() -> anyhow::Result<()> {
|
||||
resp.sort_by(|a, b| a.listen_http_addr.cmp(&b.listen_http_addr));
|
||||
|
||||
let mut table = comfy_table::Table::new();
|
||||
table.set_header(["Id", "Hostname", "AZ", "Scheduling", "Availability"]);
|
||||
table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
|
||||
for node in resp {
|
||||
table.add_row([
|
||||
format!("{}", node.id),
|
||||
node.listen_http_addr,
|
||||
node.availability_zone_id,
|
||||
format!("{:?}", node.scheduling),
|
||||
format!("{:?}", node.availability),
|
||||
]);
|
||||
@@ -490,7 +472,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
let mut table = comfy_table::Table::new();
|
||||
table.set_header([
|
||||
"TenantId",
|
||||
"Preferred AZ",
|
||||
"ShardCount",
|
||||
"StripeSize",
|
||||
"Placement",
|
||||
@@ -500,11 +481,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
let shard_zero = tenant.shards.into_iter().next().unwrap();
|
||||
table.add_row([
|
||||
format!("{}", tenant.tenant_id),
|
||||
shard_zero
|
||||
.preferred_az_id
|
||||
.as_ref()
|
||||
.cloned()
|
||||
.unwrap_or("".to_string()),
|
||||
format!("{}", shard_zero.tenant_shard_id.shard_count.literal()),
|
||||
format!("{:?}", tenant.stripe_size),
|
||||
format!("{:?}", tenant.policy),
|
||||
@@ -564,7 +540,10 @@ async fn main() -> anyhow::Result<()> {
|
||||
tenant_shard_id,
|
||||
node,
|
||||
} => {
|
||||
let req = TenantShardMigrateRequest { node_id: node };
|
||||
let req = TenantShardMigrateRequest {
|
||||
tenant_shard_id,
|
||||
node_id: node,
|
||||
};
|
||||
|
||||
storcon_client
|
||||
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
|
||||
@@ -574,20 +553,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
Command::TenantShardMigrateSecondary {
|
||||
tenant_shard_id,
|
||||
node,
|
||||
} => {
|
||||
let req = TenantShardMigrateRequest { node_id: node };
|
||||
|
||||
storcon_client
|
||||
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
|
||||
Method::PUT,
|
||||
format!("control/v1/tenant/{tenant_shard_id}/migrate_secondary"),
|
||||
Some(req),
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
Command::TenantShardCancelReconcile { tenant_shard_id } => {
|
||||
storcon_client
|
||||
.dispatch::<(), ()>(
|
||||
@@ -631,19 +596,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let nodes = storcon_client
|
||||
.dispatch::<(), Vec<NodeDescribeResponse>>(
|
||||
Method::GET,
|
||||
"control/v1/node".to_string(),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
let nodes = nodes
|
||||
.into_iter()
|
||||
.map(|n| (n.id, n))
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
println!("Tenant {tenant_id}");
|
||||
let mut table = comfy_table::Table::new();
|
||||
table.add_row(["Policy", &format!("{:?}", policy)]);
|
||||
@@ -652,14 +604,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
println!("{table}");
|
||||
println!("Shards:");
|
||||
let mut table = comfy_table::Table::new();
|
||||
table.set_header([
|
||||
"Shard",
|
||||
"Attached",
|
||||
"Attached AZ",
|
||||
"Secondary",
|
||||
"Last error",
|
||||
"status",
|
||||
]);
|
||||
table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]);
|
||||
for shard in shards {
|
||||
let secondary = shard
|
||||
.node_secondary
|
||||
@@ -682,18 +627,11 @@ async fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
let status = status_parts.join(",");
|
||||
|
||||
let attached_node = shard
|
||||
.node_attached
|
||||
.as_ref()
|
||||
.map(|id| nodes.get(id).expect("Shard references nonexistent node"));
|
||||
|
||||
table.add_row([
|
||||
format!("{}", shard.tenant_shard_id),
|
||||
attached_node
|
||||
.map(|n| format!("{} ({})", n.listen_http_addr, n.id))
|
||||
.unwrap_or(String::new()),
|
||||
attached_node
|
||||
.map(|n| n.availability_zone_id.clone())
|
||||
shard
|
||||
.node_attached
|
||||
.map(|n| format!("{}", n))
|
||||
.unwrap_or(String::new()),
|
||||
secondary,
|
||||
shard.last_error,
|
||||
@@ -702,66 +640,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
println!("{table}");
|
||||
}
|
||||
Command::TenantSetPreferredAz {
|
||||
tenant_id,
|
||||
preferred_az,
|
||||
} => {
|
||||
// First learn about the tenant's shards
|
||||
let describe_response = storcon_client
|
||||
.dispatch::<(), TenantDescribeResponse>(
|
||||
Method::GET,
|
||||
format!("control/v1/tenant/{tenant_id}"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Learn about nodes to validate the AZ ID
|
||||
let nodes = storcon_client
|
||||
.dispatch::<(), Vec<NodeDescribeResponse>>(
|
||||
Method::GET,
|
||||
"control/v1/node".to_string(),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
if let Some(preferred_az) = &preferred_az {
|
||||
let azs = nodes
|
||||
.into_iter()
|
||||
.map(|n| (n.availability_zone_id))
|
||||
.collect::<HashSet<_>>();
|
||||
if !azs.contains(preferred_az) {
|
||||
anyhow::bail!(
|
||||
"AZ {} not found on any node: known AZs are: {:?}",
|
||||
preferred_az,
|
||||
azs
|
||||
);
|
||||
}
|
||||
} else {
|
||||
// Make it obvious to the user that since they've omitted an AZ, we're clearing it
|
||||
eprintln!("Clearing preferred AZ for tenant {}", tenant_id);
|
||||
}
|
||||
|
||||
// Construct a request that modifies all the tenant's shards
|
||||
let req = ShardsPreferredAzsRequest {
|
||||
preferred_az_ids: describe_response
|
||||
.shards
|
||||
.into_iter()
|
||||
.map(|s| {
|
||||
(
|
||||
s.tenant_shard_id,
|
||||
preferred_az.clone().map(AvailabilityZone),
|
||||
)
|
||||
})
|
||||
.collect(),
|
||||
};
|
||||
storcon_client
|
||||
.dispatch::<ShardsPreferredAzsRequest, ()>(
|
||||
Method::PUT,
|
||||
"control/v1/preferred_azs".to_string(),
|
||||
Some(req),
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
Command::TenantWarmup { tenant_id } => {
|
||||
let describe_response = storcon_client
|
||||
.dispatch::<(), TenantDescribeResponse>(
|
||||
@@ -1037,7 +915,10 @@ async fn main() -> anyhow::Result<()> {
|
||||
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
|
||||
Method::PUT,
|
||||
format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id),
|
||||
Some(TenantShardMigrateRequest { node_id: mv.to }),
|
||||
Some(TenantShardMigrateRequest {
|
||||
tenant_shard_id: mv.tenant_shard_id,
|
||||
node_id: mv.to,
|
||||
}),
|
||||
)
|
||||
.await
|
||||
.map_err(|e| (mv.tenant_shard_id, mv.from, mv.to, e))
|
||||
@@ -1154,15 +1035,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
resp.sort_by(|a, b| a.id.cmp(&b.id));
|
||||
|
||||
let mut table = comfy_table::Table::new();
|
||||
table.set_header([
|
||||
"Id",
|
||||
"Version",
|
||||
"Host",
|
||||
"Port",
|
||||
"Http Port",
|
||||
"AZ Id",
|
||||
"Scheduling",
|
||||
]);
|
||||
table.set_header(["Id", "Version", "Host", "Port", "Http Port", "AZ Id"]);
|
||||
for sk in resp {
|
||||
table.add_row([
|
||||
format!("{}", sk.id),
|
||||
@@ -1170,8 +1043,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
sk.host,
|
||||
format!("{}", sk.port),
|
||||
format!("{}", sk.http_port),
|
||||
sk.availability_zone_id.clone(),
|
||||
String::from(sk.scheduling_policy),
|
||||
sk.availability_zone_id.to_string(),
|
||||
]);
|
||||
}
|
||||
println!("{table}");
|
||||
|
||||
@@ -7,11 +7,15 @@ Currently we build two main images:
|
||||
- [neondatabase/neon](https://hub.docker.com/repository/docker/neondatabase/neon) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
|
||||
- [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14. Built from [/compute-node/Dockerfile](/compute/compute-node.Dockerfile).
|
||||
|
||||
And additional intermediate image:
|
||||
|
||||
- [neondatabase/compute-tools](https://hub.docker.com/repository/docker/neondatabase/compute-tools) — compute node configuration management tools.
|
||||
|
||||
## Build pipeline
|
||||
|
||||
We build all images after a successful `release` tests run and push automatically to Docker Hub with two parallel CI jobs
|
||||
|
||||
1. `neondatabase/compute-node-v17` (and -16, -v15, -v14)
|
||||
1. `neondatabase/compute-tools` and `neondatabase/compute-node-v16` (and -v15 and -v14)
|
||||
|
||||
2. `neondatabase/neon`
|
||||
|
||||
|
||||
@@ -15,17 +15,6 @@ pub struct GenericAPIError {
|
||||
pub error: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub struct InfoResponse {
|
||||
pub num_cpus: usize,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub struct ExtensionInstallResponse {
|
||||
pub extension: PgIdent,
|
||||
pub version: ExtVersion,
|
||||
}
|
||||
|
||||
/// Response of the /status API
|
||||
#[derive(Serialize, Debug, Deserialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
@@ -39,6 +28,16 @@ pub struct ComputeStatusResponse {
|
||||
pub error: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub struct ComputeState {
|
||||
pub status: ComputeStatus,
|
||||
/// Timestamp of the last Postgres activity
|
||||
#[serde(serialize_with = "rfc3339_serialize")]
|
||||
pub last_active: Option<DateTime<Utc>>,
|
||||
pub error: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum ComputeStatus {
|
||||
@@ -79,7 +78,7 @@ impl Display for ComputeStatus {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn rfc3339_serialize<S>(x: &Option<DateTime<Utc>>, s: S) -> Result<S::Ok, S::Error>
|
||||
fn rfc3339_serialize<S>(x: &Option<DateTime<Utc>>, s: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: Serializer,
|
||||
{
|
||||
|
||||
@@ -87,7 +87,7 @@ impl Display for AvailabilityZone {
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ShardsPreferredAzsRequest {
|
||||
#[serde(flatten)]
|
||||
pub preferred_az_ids: HashMap<TenantShardId, Option<AvailabilityZone>>,
|
||||
pub preferred_az_ids: HashMap<TenantShardId, AvailabilityZone>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
@@ -144,8 +144,6 @@ pub struct NodeDescribeResponse {
|
||||
pub availability: NodeAvailabilityWrapper,
|
||||
pub scheduling: NodeSchedulingPolicy,
|
||||
|
||||
pub availability_zone_id: String,
|
||||
|
||||
pub listen_http_addr: String,
|
||||
pub listen_http_port: u16,
|
||||
|
||||
@@ -181,6 +179,7 @@ pub struct TenantDescribeResponseShard {
|
||||
/// specifies some constraints, e.g. asking it to get off particular node(s)
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantShardMigrateRequest {
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
pub node_id: NodeId,
|
||||
}
|
||||
|
||||
@@ -321,38 +320,6 @@ impl From<NodeSchedulingPolicy> for String {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
|
||||
pub enum SkSchedulingPolicy {
|
||||
Active,
|
||||
Disabled,
|
||||
Decomissioned,
|
||||
}
|
||||
|
||||
impl FromStr for SkSchedulingPolicy {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
Ok(match s {
|
||||
"active" => Self::Active,
|
||||
"disabled" => Self::Disabled,
|
||||
"decomissioned" => Self::Decomissioned,
|
||||
_ => return Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl From<SkSchedulingPolicy> for String {
|
||||
fn from(value: SkSchedulingPolicy) -> String {
|
||||
use SkSchedulingPolicy::*;
|
||||
match value {
|
||||
Active => "active",
|
||||
Disabled => "disabled",
|
||||
Decomissioned => "decomissioned",
|
||||
}
|
||||
.to_string()
|
||||
}
|
||||
}
|
||||
|
||||
/// Controls how tenant shards are mapped to locations on pageservers, e.g. whether
|
||||
/// to create secondary locations.
|
||||
#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
|
||||
@@ -369,16 +336,6 @@ pub enum PlacementPolicy {
|
||||
Detached,
|
||||
}
|
||||
|
||||
impl PlacementPolicy {
|
||||
pub fn want_secondaries(&self) -> usize {
|
||||
match self {
|
||||
PlacementPolicy::Attached(secondary_count) => *secondary_count,
|
||||
PlacementPolicy::Secondary => 1,
|
||||
PlacementPolicy::Detached => 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantShardMigrateResponse {}
|
||||
|
||||
@@ -430,7 +387,6 @@ pub struct SafekeeperDescribeResponse {
|
||||
pub port: i32,
|
||||
pub http_port: i32,
|
||||
pub availability_zone_id: String,
|
||||
pub scheduling_policy: SkSchedulingPolicy,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -706,7 +706,7 @@ pub fn repl_origin_key_range() -> Range<Key> {
|
||||
/// Non inherited range for vectored get.
|
||||
pub const NON_INHERITED_RANGE: Range<Key> = AUX_FILES_KEY..AUX_FILES_KEY.next();
|
||||
/// Sparse keyspace range for vectored get. Missing key error will be ignored for this range.
|
||||
pub const SPARSE_RANGE: Range<Key> = Key::metadata_key_range();
|
||||
pub const NON_INHERITED_SPARSE_RANGE: Range<Key> = Key::metadata_key_range();
|
||||
|
||||
impl Key {
|
||||
// AUX_FILES currently stores only data for logical replication (slots etc), and
|
||||
@@ -714,42 +714,7 @@ impl Key {
|
||||
// switch (and generally it likely should be optional), so ignore these.
|
||||
#[inline(always)]
|
||||
pub fn is_inherited_key(self) -> bool {
|
||||
if self.is_sparse() {
|
||||
self.is_inherited_sparse_key()
|
||||
} else {
|
||||
!NON_INHERITED_RANGE.contains(&self)
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn is_sparse(self) -> bool {
|
||||
self.field1 >= METADATA_KEY_BEGIN_PREFIX && self.field1 < METADATA_KEY_END_PREFIX
|
||||
}
|
||||
|
||||
/// Check if the key belongs to the inherited keyspace.
|
||||
fn is_inherited_sparse_key(self) -> bool {
|
||||
debug_assert!(self.is_sparse());
|
||||
self.field1 == RELATION_SIZE_PREFIX
|
||||
}
|
||||
|
||||
pub fn sparse_non_inherited_keyspace() -> Range<Key> {
|
||||
// The two keys are adjacent; if we will have non-adjancent keys in the future, we should return a keyspace
|
||||
debug_assert_eq!(AUX_KEY_PREFIX + 1, REPL_ORIGIN_KEY_PREFIX);
|
||||
Key {
|
||||
field1: AUX_KEY_PREFIX,
|
||||
field2: 0,
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: 0,
|
||||
}..Key {
|
||||
field1: REPL_ORIGIN_KEY_PREFIX + 1,
|
||||
field2: 0,
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: 0,
|
||||
}
|
||||
!NON_INHERITED_RANGE.contains(&self) && !NON_INHERITED_SPARSE_RANGE.contains(&self)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
|
||||
@@ -272,8 +272,6 @@ pub struct CompactInfoResponse {
|
||||
pub compact_key_range: Option<CompactKeyRange>,
|
||||
pub compact_lsn_range: Option<CompactLsnRange>,
|
||||
pub sub_compaction: bool,
|
||||
pub running: bool,
|
||||
pub job_id: usize,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
@@ -1400,8 +1398,6 @@ pub enum PagestreamFeMessage {
|
||||
GetPage(PagestreamGetPageRequest),
|
||||
DbSize(PagestreamDbSizeRequest),
|
||||
GetSlruSegment(PagestreamGetSlruSegmentRequest),
|
||||
#[cfg(feature = "testing")]
|
||||
Test(PagestreamTestRequest),
|
||||
}
|
||||
|
||||
// Wrapped in libpq CopyData
|
||||
@@ -1413,8 +1409,6 @@ pub enum PagestreamBeMessage {
|
||||
Error(PagestreamErrorResponse),
|
||||
DbSize(PagestreamDbSizeResponse),
|
||||
GetSlruSegment(PagestreamGetSlruSegmentResponse),
|
||||
#[cfg(feature = "testing")]
|
||||
Test(PagestreamTestResponse),
|
||||
}
|
||||
|
||||
// Keep in sync with `pagestore_client.h`
|
||||
@@ -1426,9 +1420,6 @@ enum PagestreamBeMessageTag {
|
||||
Error = 103,
|
||||
DbSize = 104,
|
||||
GetSlruSegment = 105,
|
||||
/// Test message discrimimant is unstable
|
||||
#[cfg(feature = "testing")]
|
||||
Test = 106,
|
||||
}
|
||||
impl TryFrom<u8> for PagestreamBeMessageTag {
|
||||
type Error = u8;
|
||||
@@ -1440,8 +1431,6 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
|
||||
103 => Ok(PagestreamBeMessageTag::Error),
|
||||
104 => Ok(PagestreamBeMessageTag::DbSize),
|
||||
105 => Ok(PagestreamBeMessageTag::GetSlruSegment),
|
||||
#[cfg(feature = "testing")]
|
||||
106 => Ok(PagestreamBeMessageTag::Test),
|
||||
_ => Err(value),
|
||||
}
|
||||
}
|
||||
@@ -1471,108 +1460,78 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
|
||||
// interface allows sending both LSNs, and let the pageserver do the right thing. There was no
|
||||
// difference in the responses between V1 and V2.
|
||||
//
|
||||
// V3 version of protocol adds request ID to all requests. This request ID is also included in response
|
||||
// as well as other fields from requests, which allows to verify that we receive response for our request.
|
||||
// We copy fields from request to response to make checking more reliable: request ID is formed from process ID
|
||||
// and local counter, so in principle there can be duplicated requests IDs if process PID is reused.
|
||||
//
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||
#[derive(Clone, Copy)]
|
||||
pub enum PagestreamProtocolVersion {
|
||||
V2,
|
||||
V3,
|
||||
}
|
||||
|
||||
pub type RequestId = u64;
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||
pub struct PagestreamRequest {
|
||||
pub reqid: RequestId,
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub struct PagestreamExistsRequest {
|
||||
pub request_lsn: Lsn,
|
||||
pub not_modified_since: Lsn,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||
pub struct PagestreamExistsRequest {
|
||||
pub hdr: PagestreamRequest,
|
||||
pub rel: RelTag,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub struct PagestreamNblocksRequest {
|
||||
pub hdr: PagestreamRequest,
|
||||
pub request_lsn: Lsn,
|
||||
pub not_modified_since: Lsn,
|
||||
pub rel: RelTag,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub struct PagestreamGetPageRequest {
|
||||
pub hdr: PagestreamRequest,
|
||||
pub request_lsn: Lsn,
|
||||
pub not_modified_since: Lsn,
|
||||
pub rel: RelTag,
|
||||
pub blkno: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub struct PagestreamDbSizeRequest {
|
||||
pub hdr: PagestreamRequest,
|
||||
pub request_lsn: Lsn,
|
||||
pub not_modified_since: Lsn,
|
||||
pub dbnode: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub struct PagestreamGetSlruSegmentRequest {
|
||||
pub hdr: PagestreamRequest,
|
||||
pub request_lsn: Lsn,
|
||||
pub not_modified_since: Lsn,
|
||||
pub kind: u8,
|
||||
pub segno: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamExistsResponse {
|
||||
pub req: PagestreamExistsRequest,
|
||||
pub exists: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamNblocksResponse {
|
||||
pub req: PagestreamNblocksRequest,
|
||||
pub n_blocks: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamGetPageResponse {
|
||||
pub req: PagestreamGetPageRequest,
|
||||
pub page: Bytes,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamGetSlruSegmentResponse {
|
||||
pub req: PagestreamGetSlruSegmentRequest,
|
||||
pub segment: Bytes,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamErrorResponse {
|
||||
pub req: PagestreamRequest,
|
||||
pub message: String,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamDbSizeResponse {
|
||||
pub req: PagestreamDbSizeRequest,
|
||||
pub db_size: i64,
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||
pub struct PagestreamTestRequest {
|
||||
pub hdr: PagestreamRequest,
|
||||
pub batch_key: u64,
|
||||
pub message: String,
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[derive(Debug)]
|
||||
pub struct PagestreamTestResponse {
|
||||
pub req: PagestreamTestRequest,
|
||||
}
|
||||
|
||||
// This is a cut-down version of TenantHistorySize from the pageserver crate, omitting fields
|
||||
// that require pageserver-internal types. It is sufficient to get the total size.
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
@@ -1586,16 +1545,15 @@ pub struct TenantHistorySize {
|
||||
|
||||
impl PagestreamFeMessage {
|
||||
/// Serialize a compute -> pageserver message. This is currently only used in testing
|
||||
/// tools. Always uses protocol version 3.
|
||||
/// tools. Always uses protocol version 2.
|
||||
pub fn serialize(&self) -> Bytes {
|
||||
let mut bytes = BytesMut::new();
|
||||
|
||||
match self {
|
||||
Self::Exists(req) => {
|
||||
bytes.put_u8(0);
|
||||
bytes.put_u64(req.hdr.reqid);
|
||||
bytes.put_u64(req.hdr.request_lsn.0);
|
||||
bytes.put_u64(req.hdr.not_modified_since.0);
|
||||
bytes.put_u64(req.request_lsn.0);
|
||||
bytes.put_u64(req.not_modified_since.0);
|
||||
bytes.put_u32(req.rel.spcnode);
|
||||
bytes.put_u32(req.rel.dbnode);
|
||||
bytes.put_u32(req.rel.relnode);
|
||||
@@ -1604,9 +1562,8 @@ impl PagestreamFeMessage {
|
||||
|
||||
Self::Nblocks(req) => {
|
||||
bytes.put_u8(1);
|
||||
bytes.put_u64(req.hdr.reqid);
|
||||
bytes.put_u64(req.hdr.request_lsn.0);
|
||||
bytes.put_u64(req.hdr.not_modified_since.0);
|
||||
bytes.put_u64(req.request_lsn.0);
|
||||
bytes.put_u64(req.not_modified_since.0);
|
||||
bytes.put_u32(req.rel.spcnode);
|
||||
bytes.put_u32(req.rel.dbnode);
|
||||
bytes.put_u32(req.rel.relnode);
|
||||
@@ -1615,9 +1572,8 @@ impl PagestreamFeMessage {
|
||||
|
||||
Self::GetPage(req) => {
|
||||
bytes.put_u8(2);
|
||||
bytes.put_u64(req.hdr.reqid);
|
||||
bytes.put_u64(req.hdr.request_lsn.0);
|
||||
bytes.put_u64(req.hdr.not_modified_since.0);
|
||||
bytes.put_u64(req.request_lsn.0);
|
||||
bytes.put_u64(req.not_modified_since.0);
|
||||
bytes.put_u32(req.rel.spcnode);
|
||||
bytes.put_u32(req.rel.dbnode);
|
||||
bytes.put_u32(req.rel.relnode);
|
||||
@@ -1627,65 +1583,38 @@ impl PagestreamFeMessage {
|
||||
|
||||
Self::DbSize(req) => {
|
||||
bytes.put_u8(3);
|
||||
bytes.put_u64(req.hdr.reqid);
|
||||
bytes.put_u64(req.hdr.request_lsn.0);
|
||||
bytes.put_u64(req.hdr.not_modified_since.0);
|
||||
bytes.put_u64(req.request_lsn.0);
|
||||
bytes.put_u64(req.not_modified_since.0);
|
||||
bytes.put_u32(req.dbnode);
|
||||
}
|
||||
|
||||
Self::GetSlruSegment(req) => {
|
||||
bytes.put_u8(4);
|
||||
bytes.put_u64(req.hdr.reqid);
|
||||
bytes.put_u64(req.hdr.request_lsn.0);
|
||||
bytes.put_u64(req.hdr.not_modified_since.0);
|
||||
bytes.put_u64(req.request_lsn.0);
|
||||
bytes.put_u64(req.not_modified_since.0);
|
||||
bytes.put_u8(req.kind);
|
||||
bytes.put_u32(req.segno);
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
Self::Test(req) => {
|
||||
bytes.put_u8(5);
|
||||
bytes.put_u64(req.hdr.reqid);
|
||||
bytes.put_u64(req.hdr.request_lsn.0);
|
||||
bytes.put_u64(req.hdr.not_modified_since.0);
|
||||
bytes.put_u64(req.batch_key);
|
||||
let message = req.message.as_bytes();
|
||||
bytes.put_u64(message.len() as u64);
|
||||
bytes.put_slice(message);
|
||||
}
|
||||
}
|
||||
|
||||
bytes.into()
|
||||
}
|
||||
|
||||
pub fn parse<R: std::io::Read>(
|
||||
body: &mut R,
|
||||
protocol_version: PagestreamProtocolVersion,
|
||||
) -> anyhow::Result<PagestreamFeMessage> {
|
||||
pub fn parse<R: std::io::Read>(body: &mut R) -> anyhow::Result<PagestreamFeMessage> {
|
||||
// these correspond to the NeonMessageTag enum in pagestore_client.h
|
||||
//
|
||||
// TODO: consider using protobuf or serde bincode for less error prone
|
||||
// serialization.
|
||||
let msg_tag = body.read_u8()?;
|
||||
let (reqid, request_lsn, not_modified_since) = match protocol_version {
|
||||
PagestreamProtocolVersion::V2 => (
|
||||
0,
|
||||
Lsn::from(body.read_u64::<BigEndian>()?),
|
||||
Lsn::from(body.read_u64::<BigEndian>()?),
|
||||
),
|
||||
PagestreamProtocolVersion::V3 => (
|
||||
body.read_u64::<BigEndian>()?,
|
||||
Lsn::from(body.read_u64::<BigEndian>()?),
|
||||
Lsn::from(body.read_u64::<BigEndian>()?),
|
||||
),
|
||||
};
|
||||
|
||||
// these two fields are the same for every request type
|
||||
let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
|
||||
let not_modified_since = Lsn::from(body.read_u64::<BigEndian>()?);
|
||||
|
||||
match msg_tag {
|
||||
0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
rel: RelTag {
|
||||
spcnode: body.read_u32::<BigEndian>()?,
|
||||
dbnode: body.read_u32::<BigEndian>()?,
|
||||
@@ -1694,11 +1623,8 @@ impl PagestreamFeMessage {
|
||||
},
|
||||
})),
|
||||
1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
rel: RelTag {
|
||||
spcnode: body.read_u32::<BigEndian>()?,
|
||||
dbnode: body.read_u32::<BigEndian>()?,
|
||||
@@ -1707,11 +1633,8 @@ impl PagestreamFeMessage {
|
||||
},
|
||||
})),
|
||||
2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
rel: RelTag {
|
||||
spcnode: body.read_u32::<BigEndian>()?,
|
||||
dbnode: body.read_u32::<BigEndian>()?,
|
||||
@@ -1721,174 +1644,61 @@ impl PagestreamFeMessage {
|
||||
blkno: body.read_u32::<BigEndian>()?,
|
||||
})),
|
||||
3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
dbnode: body.read_u32::<BigEndian>()?,
|
||||
})),
|
||||
4 => Ok(PagestreamFeMessage::GetSlruSegment(
|
||||
PagestreamGetSlruSegmentRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
kind: body.read_u8()?,
|
||||
segno: body.read_u32::<BigEndian>()?,
|
||||
},
|
||||
)),
|
||||
#[cfg(feature = "testing")]
|
||||
5 => Ok(PagestreamFeMessage::Test(PagestreamTestRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
batch_key: body.read_u64::<BigEndian>()?,
|
||||
message: {
|
||||
let len = body.read_u64::<BigEndian>()?;
|
||||
let mut buf = vec![0; len as usize];
|
||||
body.read_exact(&mut buf)?;
|
||||
String::from_utf8(buf)?
|
||||
},
|
||||
})),
|
||||
_ => bail!("unknown smgr message tag: {:?}", msg_tag),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PagestreamBeMessage {
|
||||
pub fn serialize(&self, protocol_version: PagestreamProtocolVersion) -> Bytes {
|
||||
pub fn serialize(&self) -> Bytes {
|
||||
let mut bytes = BytesMut::new();
|
||||
|
||||
use PagestreamBeMessageTag as Tag;
|
||||
match protocol_version {
|
||||
PagestreamProtocolVersion::V2 => {
|
||||
match self {
|
||||
Self::Exists(resp) => {
|
||||
bytes.put_u8(Tag::Exists as u8);
|
||||
bytes.put_u8(resp.exists as u8);
|
||||
}
|
||||
|
||||
Self::Nblocks(resp) => {
|
||||
bytes.put_u8(Tag::Nblocks as u8);
|
||||
bytes.put_u32(resp.n_blocks);
|
||||
}
|
||||
|
||||
Self::GetPage(resp) => {
|
||||
bytes.put_u8(Tag::GetPage as u8);
|
||||
bytes.put(&resp.page[..])
|
||||
}
|
||||
|
||||
Self::Error(resp) => {
|
||||
bytes.put_u8(Tag::Error as u8);
|
||||
bytes.put(resp.message.as_bytes());
|
||||
bytes.put_u8(0); // null terminator
|
||||
}
|
||||
Self::DbSize(resp) => {
|
||||
bytes.put_u8(Tag::DbSize as u8);
|
||||
bytes.put_i64(resp.db_size);
|
||||
}
|
||||
|
||||
Self::GetSlruSegment(resp) => {
|
||||
bytes.put_u8(Tag::GetSlruSegment as u8);
|
||||
bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
|
||||
bytes.put(&resp.segment[..]);
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
Self::Test(resp) => {
|
||||
bytes.put_u8(Tag::Test as u8);
|
||||
bytes.put_u64(resp.req.batch_key);
|
||||
let message = resp.req.message.as_bytes();
|
||||
bytes.put_u64(message.len() as u64);
|
||||
bytes.put_slice(message);
|
||||
}
|
||||
}
|
||||
match self {
|
||||
Self::Exists(resp) => {
|
||||
bytes.put_u8(Tag::Exists as u8);
|
||||
bytes.put_u8(resp.exists as u8);
|
||||
}
|
||||
PagestreamProtocolVersion::V3 => {
|
||||
match self {
|
||||
Self::Exists(resp) => {
|
||||
bytes.put_u8(Tag::Exists as u8);
|
||||
bytes.put_u64(resp.req.hdr.reqid);
|
||||
bytes.put_u64(resp.req.hdr.request_lsn.0);
|
||||
bytes.put_u64(resp.req.hdr.not_modified_since.0);
|
||||
bytes.put_u32(resp.req.rel.spcnode);
|
||||
bytes.put_u32(resp.req.rel.dbnode);
|
||||
bytes.put_u32(resp.req.rel.relnode);
|
||||
bytes.put_u8(resp.req.rel.forknum);
|
||||
bytes.put_u8(resp.exists as u8);
|
||||
}
|
||||
|
||||
Self::Nblocks(resp) => {
|
||||
bytes.put_u8(Tag::Nblocks as u8);
|
||||
bytes.put_u64(resp.req.hdr.reqid);
|
||||
bytes.put_u64(resp.req.hdr.request_lsn.0);
|
||||
bytes.put_u64(resp.req.hdr.not_modified_since.0);
|
||||
bytes.put_u32(resp.req.rel.spcnode);
|
||||
bytes.put_u32(resp.req.rel.dbnode);
|
||||
bytes.put_u32(resp.req.rel.relnode);
|
||||
bytes.put_u8(resp.req.rel.forknum);
|
||||
bytes.put_u32(resp.n_blocks);
|
||||
}
|
||||
Self::Nblocks(resp) => {
|
||||
bytes.put_u8(Tag::Nblocks as u8);
|
||||
bytes.put_u32(resp.n_blocks);
|
||||
}
|
||||
|
||||
Self::GetPage(resp) => {
|
||||
bytes.put_u8(Tag::GetPage as u8);
|
||||
bytes.put_u64(resp.req.hdr.reqid);
|
||||
bytes.put_u64(resp.req.hdr.request_lsn.0);
|
||||
bytes.put_u64(resp.req.hdr.not_modified_since.0);
|
||||
bytes.put_u32(resp.req.rel.spcnode);
|
||||
bytes.put_u32(resp.req.rel.dbnode);
|
||||
bytes.put_u32(resp.req.rel.relnode);
|
||||
bytes.put_u8(resp.req.rel.forknum);
|
||||
bytes.put_u32(resp.req.blkno);
|
||||
bytes.put(&resp.page[..])
|
||||
}
|
||||
Self::GetPage(resp) => {
|
||||
bytes.put_u8(Tag::GetPage as u8);
|
||||
bytes.put(&resp.page[..]);
|
||||
}
|
||||
|
||||
Self::Error(resp) => {
|
||||
bytes.put_u8(Tag::Error as u8);
|
||||
bytes.put_u64(resp.req.reqid);
|
||||
bytes.put_u64(resp.req.request_lsn.0);
|
||||
bytes.put_u64(resp.req.not_modified_since.0);
|
||||
bytes.put(resp.message.as_bytes());
|
||||
bytes.put_u8(0); // null terminator
|
||||
}
|
||||
Self::DbSize(resp) => {
|
||||
bytes.put_u8(Tag::DbSize as u8);
|
||||
bytes.put_u64(resp.req.hdr.reqid);
|
||||
bytes.put_u64(resp.req.hdr.request_lsn.0);
|
||||
bytes.put_u64(resp.req.hdr.not_modified_since.0);
|
||||
bytes.put_u32(resp.req.dbnode);
|
||||
bytes.put_i64(resp.db_size);
|
||||
}
|
||||
Self::Error(resp) => {
|
||||
bytes.put_u8(Tag::Error as u8);
|
||||
bytes.put(resp.message.as_bytes());
|
||||
bytes.put_u8(0); // null terminator
|
||||
}
|
||||
Self::DbSize(resp) => {
|
||||
bytes.put_u8(Tag::DbSize as u8);
|
||||
bytes.put_i64(resp.db_size);
|
||||
}
|
||||
|
||||
Self::GetSlruSegment(resp) => {
|
||||
bytes.put_u8(Tag::GetSlruSegment as u8);
|
||||
bytes.put_u64(resp.req.hdr.reqid);
|
||||
bytes.put_u64(resp.req.hdr.request_lsn.0);
|
||||
bytes.put_u64(resp.req.hdr.not_modified_since.0);
|
||||
bytes.put_u8(resp.req.kind);
|
||||
bytes.put_u32(resp.req.segno);
|
||||
bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
|
||||
bytes.put(&resp.segment[..]);
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
Self::Test(resp) => {
|
||||
bytes.put_u8(Tag::Test as u8);
|
||||
bytes.put_u64(resp.req.hdr.reqid);
|
||||
bytes.put_u64(resp.req.hdr.request_lsn.0);
|
||||
bytes.put_u64(resp.req.hdr.not_modified_since.0);
|
||||
bytes.put_u64(resp.req.batch_key);
|
||||
let message = resp.req.message.as_bytes();
|
||||
bytes.put_u64(message.len() as u64);
|
||||
bytes.put_slice(message);
|
||||
}
|
||||
}
|
||||
Self::GetSlruSegment(resp) => {
|
||||
bytes.put_u8(Tag::GetSlruSegment as u8);
|
||||
bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
|
||||
bytes.put(&resp.segment[..]);
|
||||
}
|
||||
}
|
||||
|
||||
bytes.into()
|
||||
}
|
||||
|
||||
@@ -1900,156 +1710,41 @@ impl PagestreamBeMessage {
|
||||
let ok =
|
||||
match Tag::try_from(msg_tag).map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? {
|
||||
Tag::Exists => {
|
||||
let reqid = buf.read_u64::<BigEndian>()?;
|
||||
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let rel = RelTag {
|
||||
spcnode: buf.read_u32::<BigEndian>()?,
|
||||
dbnode: buf.read_u32::<BigEndian>()?,
|
||||
relnode: buf.read_u32::<BigEndian>()?,
|
||||
forknum: buf.read_u8()?,
|
||||
};
|
||||
let exists = buf.read_u8()? != 0;
|
||||
let exists = buf.read_u8()?;
|
||||
Self::Exists(PagestreamExistsResponse {
|
||||
req: PagestreamExistsRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
rel,
|
||||
},
|
||||
exists,
|
||||
exists: exists != 0,
|
||||
})
|
||||
}
|
||||
Tag::Nblocks => {
|
||||
let reqid = buf.read_u64::<BigEndian>()?;
|
||||
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let rel = RelTag {
|
||||
spcnode: buf.read_u32::<BigEndian>()?,
|
||||
dbnode: buf.read_u32::<BigEndian>()?,
|
||||
relnode: buf.read_u32::<BigEndian>()?,
|
||||
forknum: buf.read_u8()?,
|
||||
};
|
||||
let n_blocks = buf.read_u32::<BigEndian>()?;
|
||||
Self::Nblocks(PagestreamNblocksResponse {
|
||||
req: PagestreamNblocksRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
rel,
|
||||
},
|
||||
n_blocks,
|
||||
})
|
||||
Self::Nblocks(PagestreamNblocksResponse { n_blocks })
|
||||
}
|
||||
Tag::GetPage => {
|
||||
let reqid = buf.read_u64::<BigEndian>()?;
|
||||
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let rel = RelTag {
|
||||
spcnode: buf.read_u32::<BigEndian>()?,
|
||||
dbnode: buf.read_u32::<BigEndian>()?,
|
||||
relnode: buf.read_u32::<BigEndian>()?,
|
||||
forknum: buf.read_u8()?,
|
||||
};
|
||||
let blkno = buf.read_u32::<BigEndian>()?;
|
||||
let mut page = vec![0; 8192]; // TODO: use MaybeUninit
|
||||
buf.read_exact(&mut page)?;
|
||||
Self::GetPage(PagestreamGetPageResponse {
|
||||
req: PagestreamGetPageRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
rel,
|
||||
blkno,
|
||||
},
|
||||
page: page.into(),
|
||||
})
|
||||
PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page: page.into() })
|
||||
}
|
||||
Tag::Error => {
|
||||
let reqid = buf.read_u64::<BigEndian>()?;
|
||||
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let mut msg = Vec::new();
|
||||
buf.read_until(0, &mut msg)?;
|
||||
let cstring = std::ffi::CString::from_vec_with_nul(msg)?;
|
||||
let rust_str = cstring.to_str()?;
|
||||
Self::Error(PagestreamErrorResponse {
|
||||
req: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
PagestreamBeMessage::Error(PagestreamErrorResponse {
|
||||
message: rust_str.to_owned(),
|
||||
})
|
||||
}
|
||||
Tag::DbSize => {
|
||||
let reqid = buf.read_u64::<BigEndian>()?;
|
||||
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let dbnode = buf.read_u32::<BigEndian>()?;
|
||||
let db_size = buf.read_i64::<BigEndian>()?;
|
||||
Self::DbSize(PagestreamDbSizeResponse {
|
||||
req: PagestreamDbSizeRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
dbnode,
|
||||
},
|
||||
db_size,
|
||||
})
|
||||
Self::DbSize(PagestreamDbSizeResponse { db_size })
|
||||
}
|
||||
Tag::GetSlruSegment => {
|
||||
let reqid = buf.read_u64::<BigEndian>()?;
|
||||
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let kind = buf.read_u8()?;
|
||||
let segno = buf.read_u32::<BigEndian>()?;
|
||||
let n_blocks = buf.read_u32::<BigEndian>()?;
|
||||
let mut segment = vec![0; n_blocks as usize * BLCKSZ as usize];
|
||||
buf.read_exact(&mut segment)?;
|
||||
Self::GetSlruSegment(PagestreamGetSlruSegmentResponse {
|
||||
req: PagestreamGetSlruSegmentRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
kind,
|
||||
segno,
|
||||
},
|
||||
segment: segment.into(),
|
||||
})
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
Tag::Test => {
|
||||
let reqid = buf.read_u64::<BigEndian>()?;
|
||||
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
|
||||
let batch_key = buf.read_u64::<BigEndian>()?;
|
||||
let len = buf.read_u64::<BigEndian>()?;
|
||||
let mut msg = vec![0; len as usize];
|
||||
buf.read_exact(&mut msg)?;
|
||||
let message = String::from_utf8(msg)?;
|
||||
Self::Test(PagestreamTestResponse {
|
||||
req: PagestreamTestRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
},
|
||||
batch_key,
|
||||
message,
|
||||
},
|
||||
})
|
||||
}
|
||||
};
|
||||
let remaining = buf.into_inner();
|
||||
if !remaining.is_empty() {
|
||||
@@ -2069,8 +1764,6 @@ impl PagestreamBeMessage {
|
||||
Self::Error(_) => "Error",
|
||||
Self::DbSize(_) => "DbSize",
|
||||
Self::GetSlruSegment(_) => "GetSlruSegment",
|
||||
#[cfg(feature = "testing")]
|
||||
Self::Test(_) => "Test",
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2087,11 +1780,8 @@ mod tests {
|
||||
// Test serialization/deserialization of PagestreamFeMessage
|
||||
let messages = vec![
|
||||
PagestreamFeMessage::Exists(PagestreamExistsRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid: 0,
|
||||
request_lsn: Lsn(4),
|
||||
not_modified_since: Lsn(3),
|
||||
},
|
||||
request_lsn: Lsn(4),
|
||||
not_modified_since: Lsn(3),
|
||||
rel: RelTag {
|
||||
forknum: 1,
|
||||
spcnode: 2,
|
||||
@@ -2100,11 +1790,8 @@ mod tests {
|
||||
},
|
||||
}),
|
||||
PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid: 0,
|
||||
request_lsn: Lsn(4),
|
||||
not_modified_since: Lsn(4),
|
||||
},
|
||||
request_lsn: Lsn(4),
|
||||
not_modified_since: Lsn(4),
|
||||
rel: RelTag {
|
||||
forknum: 1,
|
||||
spcnode: 2,
|
||||
@@ -2113,11 +1800,8 @@ mod tests {
|
||||
},
|
||||
}),
|
||||
PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid: 0,
|
||||
request_lsn: Lsn(4),
|
||||
not_modified_since: Lsn(3),
|
||||
},
|
||||
request_lsn: Lsn(4),
|
||||
not_modified_since: Lsn(3),
|
||||
rel: RelTag {
|
||||
forknum: 1,
|
||||
spcnode: 2,
|
||||
@@ -2127,19 +1811,14 @@ mod tests {
|
||||
blkno: 7,
|
||||
}),
|
||||
PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid: 0,
|
||||
request_lsn: Lsn(4),
|
||||
not_modified_since: Lsn(3),
|
||||
},
|
||||
request_lsn: Lsn(4),
|
||||
not_modified_since: Lsn(3),
|
||||
dbnode: 7,
|
||||
}),
|
||||
];
|
||||
for msg in messages {
|
||||
let bytes = msg.serialize();
|
||||
let reconstructed =
|
||||
PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V3)
|
||||
.unwrap();
|
||||
let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap();
|
||||
assert!(msg == reconstructed);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -44,7 +44,7 @@ pub struct ProtocolVersion(u32);
|
||||
|
||||
impl ProtocolVersion {
|
||||
pub const fn new(major: u16, minor: u16) -> Self {
|
||||
Self(((major as u32) << 16) | minor as u32)
|
||||
Self((major as u32) << 16 | minor as u32)
|
||||
}
|
||||
pub const fn minor(self) -> u16 {
|
||||
self.0 as u16
|
||||
|
||||
@@ -43,17 +43,6 @@ impl RemoteStorageKind {
|
||||
}
|
||||
}
|
||||
|
||||
impl RemoteStorageConfig {
|
||||
/// Helper to fetch the configured concurrency limit.
|
||||
pub fn concurrency_limit(&self) -> Option<usize> {
|
||||
match &self.storage {
|
||||
RemoteStorageKind::LocalFs { .. } => None,
|
||||
RemoteStorageKind::AwsS3(c) => Some(c.concurrency_limit.into()),
|
||||
RemoteStorageKind::AzureContainer(c) => Some(c.concurrency_limit.into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn default_timeout() -> Duration {
|
||||
RemoteStorageConfig::DEFAULT_TIMEOUT
|
||||
}
|
||||
@@ -126,15 +115,13 @@ fn default_max_keys_per_list_response() -> Option<i32> {
|
||||
}
|
||||
|
||||
fn default_azure_conn_pool_size() -> usize {
|
||||
// By default, the Azure SDK does no connection pooling, due to historic reports of hard-to-reproduce issues
|
||||
// Conservative default: no connection pooling. At time of writing this is the Azure
|
||||
// SDK's default as well, due to historic reports of hard-to-reproduce issues
|
||||
// (https://github.com/hyperium/hyper/issues/2312)
|
||||
//
|
||||
// However, using connection pooling is important to avoid exhausting client ports when
|
||||
// doing huge numbers of requests (https://github.com/neondatabase/cloud/issues/20971)
|
||||
//
|
||||
// We therefore enable a modest pool size by default: this may be configured to zero if
|
||||
// issues like the alleged upstream hyper issue appear.
|
||||
8
|
||||
0
|
||||
}
|
||||
|
||||
impl Debug for S3Config {
|
||||
|
||||
@@ -38,6 +38,7 @@ pub mod http;
|
||||
|
||||
use opentelemetry::trace::TracerProvider;
|
||||
use opentelemetry::KeyValue;
|
||||
use opentelemetry_sdk::Resource;
|
||||
use tracing::Subscriber;
|
||||
use tracing_subscriber::registry::LookupSpan;
|
||||
use tracing_subscriber::Layer;
|
||||
@@ -120,10 +121,7 @@ where
|
||||
S: Subscriber + for<'span> LookupSpan<'span>,
|
||||
{
|
||||
// Sets up exporter from the OTEL_EXPORTER_* environment variables.
|
||||
let exporter = opentelemetry_otlp::SpanExporter::builder()
|
||||
.with_http()
|
||||
.build()
|
||||
.expect("could not initialize opentelemetry exporter");
|
||||
let exporter = opentelemetry_otlp::new_exporter().http();
|
||||
|
||||
// TODO: opentelemetry::global::set_error_handler() with custom handler that
|
||||
// bypasses default tracing layers, but logs regular looking log
|
||||
@@ -134,13 +132,17 @@ where
|
||||
opentelemetry_sdk::propagation::TraceContextPropagator::new(),
|
||||
);
|
||||
|
||||
let tracer = opentelemetry_sdk::trace::TracerProvider::builder()
|
||||
.with_batch_exporter(exporter, opentelemetry_sdk::runtime::Tokio)
|
||||
.with_resource(opentelemetry_sdk::Resource::new(vec![KeyValue::new(
|
||||
opentelemetry_semantic_conventions::resource::SERVICE_NAME,
|
||||
service_name,
|
||||
)]))
|
||||
.build()
|
||||
let tracer = opentelemetry_otlp::new_pipeline()
|
||||
.tracing()
|
||||
.with_exporter(exporter)
|
||||
.with_trace_config(opentelemetry_sdk::trace::Config::default().with_resource(
|
||||
Resource::new(vec![KeyValue::new(
|
||||
opentelemetry_semantic_conventions::resource::SERVICE_NAME,
|
||||
service_name,
|
||||
)]),
|
||||
))
|
||||
.install_batch(opentelemetry_sdk::runtime::Tokio)
|
||||
.expect("could not initialize opentelemetry exporter")
|
||||
.tracer("global");
|
||||
|
||||
tracing_opentelemetry::layer().with_tracer(tracer)
|
||||
|
||||
@@ -26,7 +26,6 @@ git-version.workspace = true
|
||||
hex = { workspace = true, features = ["serde"] }
|
||||
humantime.workspace = true
|
||||
hyper0 = { workspace = true, features = ["full"] }
|
||||
inferno.workspace = true
|
||||
itertools.workspace = true
|
||||
fail.workspace = true
|
||||
futures = { workspace = true }
|
||||
|
||||
@@ -112,9 +112,9 @@ impl Serialize for Generation {
|
||||
// We should never be asked to serialize a None. Structures
|
||||
// that include an optional generation should convert None to an
|
||||
// Option<Generation>::None
|
||||
Err(serde::ser::Error::custom(format!(
|
||||
"Tried to serialize invalid generation ({self:?})"
|
||||
)))
|
||||
Err(serde::ser::Error::custom(
|
||||
"Tried to serialize invalid generation ({self})",
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,7 +15,7 @@ use once_cell::sync::Lazy;
|
||||
use regex::Regex;
|
||||
use routerify::ext::RequestExt;
|
||||
use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
|
||||
use tokio::sync::{mpsc, Mutex, Notify};
|
||||
use tokio::sync::{mpsc, Mutex};
|
||||
use tokio_stream::wrappers::ReceiverStream;
|
||||
use tokio_util::io::ReaderStream;
|
||||
use tracing::{debug, info, info_span, warn, Instrument};
|
||||
@@ -350,53 +350,33 @@ pub async fn profile_cpu_handler(req: Request<Body>) -> Result<Response<Body>, A
|
||||
};
|
||||
let seconds = match parse_query_param(&req, "seconds")? {
|
||||
None => 5,
|
||||
Some(seconds @ 1..=60) => seconds,
|
||||
Some(_) => return Err(ApiError::BadRequest(anyhow!("duration must be 1-60 secs"))),
|
||||
Some(seconds @ 1..=30) => seconds,
|
||||
Some(_) => return Err(ApiError::BadRequest(anyhow!("duration must be 1-30 secs"))),
|
||||
};
|
||||
let frequency_hz = match parse_query_param(&req, "frequency")? {
|
||||
None => 99,
|
||||
Some(1001..) => return Err(ApiError::BadRequest(anyhow!("frequency must be <=1000 Hz"))),
|
||||
Some(frequency) => frequency,
|
||||
};
|
||||
let force: bool = parse_query_param(&req, "force")?.unwrap_or_default();
|
||||
|
||||
// Only allow one profiler at a time.
|
||||
static PROFILE_LOCK: Lazy<Mutex<()>> = Lazy::new(|| Mutex::new(()));
|
||||
let _lock = PROFILE_LOCK
|
||||
.try_lock()
|
||||
.map_err(|_| ApiError::Conflict("profiler already running".into()))?;
|
||||
|
||||
// Take the profile.
|
||||
static PROFILE_LOCK: Lazy<Mutex<()>> = Lazy::new(|| Mutex::new(()));
|
||||
static PROFILE_CANCEL: Lazy<Notify> = Lazy::new(Notify::new);
|
||||
|
||||
let report = {
|
||||
// Only allow one profiler at a time. If force is true, cancel a running profile (e.g. a
|
||||
// Grafana continuous profile). We use a try_lock() loop when cancelling instead of waiting
|
||||
// for a lock(), to avoid races where the notify isn't currently awaited.
|
||||
let _lock = loop {
|
||||
match PROFILE_LOCK.try_lock() {
|
||||
Ok(lock) => break lock,
|
||||
Err(_) if force => PROFILE_CANCEL.notify_waiters(),
|
||||
Err(_) => {
|
||||
return Err(ApiError::Conflict(
|
||||
"profiler already running (use ?force=true to cancel it)".into(),
|
||||
))
|
||||
}
|
||||
}
|
||||
tokio::time::sleep(Duration::from_millis(1)).await; // don't busy-wait
|
||||
};
|
||||
|
||||
let report = tokio::task::spawn_blocking(move || {
|
||||
let guard = ProfilerGuardBuilder::default()
|
||||
.frequency(frequency_hz)
|
||||
.blocklist(&["libc", "libgcc", "pthread", "vdso"])
|
||||
.build()
|
||||
.map_err(|err| ApiError::InternalServerError(err.into()))?;
|
||||
|
||||
tokio::select! {
|
||||
_ = tokio::time::sleep(Duration::from_secs(seconds)) => {},
|
||||
_ = PROFILE_CANCEL.notified() => {},
|
||||
};
|
||||
|
||||
guard
|
||||
.report()
|
||||
.build()
|
||||
.map_err(|err| ApiError::InternalServerError(err.into()))?
|
||||
};
|
||||
.build()?;
|
||||
std::thread::sleep(Duration::from_secs(seconds));
|
||||
guard.report().build()
|
||||
})
|
||||
.await
|
||||
.map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
|
||||
.map_err(|pprof_err| ApiError::InternalServerError(pprof_err.into()))?;
|
||||
|
||||
// Return the report in the requested format.
|
||||
match format {
|
||||
@@ -437,7 +417,6 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
|
||||
enum Format {
|
||||
Jemalloc,
|
||||
Pprof,
|
||||
Svg,
|
||||
}
|
||||
|
||||
// Parameters.
|
||||
@@ -445,24 +424,9 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
|
||||
None => Format::Pprof,
|
||||
Some("jemalloc") => Format::Jemalloc,
|
||||
Some("pprof") => Format::Pprof,
|
||||
Some("svg") => Format::Svg,
|
||||
Some(format) => return Err(ApiError::BadRequest(anyhow!("invalid format {format}"))),
|
||||
};
|
||||
|
||||
// Functions and mappings to strip when symbolizing pprof profiles. If true,
|
||||
// also remove child frames.
|
||||
static STRIP_FUNCTIONS: Lazy<Vec<(Regex, bool)>> = Lazy::new(|| {
|
||||
vec![
|
||||
(Regex::new("^__rust").unwrap(), false),
|
||||
(Regex::new("^_start$").unwrap(), false),
|
||||
(Regex::new("^irallocx_prof").unwrap(), true),
|
||||
(Regex::new("^prof_alloc_prep").unwrap(), true),
|
||||
(Regex::new("^std::rt::lang_start").unwrap(), false),
|
||||
(Regex::new("^std::sys::backtrace::__rust").unwrap(), false),
|
||||
]
|
||||
});
|
||||
const STRIP_MAPPINGS: &[&str] = &["libc", "libgcc", "pthread", "vdso"];
|
||||
|
||||
// Obtain profiler handle.
|
||||
let mut prof_ctl = jemalloc_pprof::PROF_CTL
|
||||
.as_ref()
|
||||
@@ -500,9 +464,24 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
|
||||
// Symbolize the profile.
|
||||
// TODO: consider moving this upstream to jemalloc_pprof and avoiding the
|
||||
// serialization roundtrip.
|
||||
static STRIP_FUNCTIONS: Lazy<Vec<(Regex, bool)>> = Lazy::new(|| {
|
||||
// Functions to strip from profiles. If true, also remove child frames.
|
||||
vec![
|
||||
(Regex::new("^__rust").unwrap(), false),
|
||||
(Regex::new("^_start$").unwrap(), false),
|
||||
(Regex::new("^irallocx_prof").unwrap(), true),
|
||||
(Regex::new("^prof_alloc_prep").unwrap(), true),
|
||||
(Regex::new("^std::rt::lang_start").unwrap(), false),
|
||||
(Regex::new("^std::sys::backtrace::__rust").unwrap(), false),
|
||||
]
|
||||
});
|
||||
let profile = pprof::decode(&bytes)?;
|
||||
let profile = pprof::symbolize(profile)?;
|
||||
let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS);
|
||||
let profile = pprof::strip_locations(
|
||||
profile,
|
||||
&["libc", "libgcc", "pthread", "vdso"],
|
||||
&STRIP_FUNCTIONS,
|
||||
);
|
||||
pprof::encode(&profile)
|
||||
})
|
||||
.await
|
||||
@@ -515,27 +494,6 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
|
||||
.body(Body::from(data))
|
||||
.map_err(|err| ApiError::InternalServerError(err.into()))
|
||||
}
|
||||
|
||||
Format::Svg => {
|
||||
let body = tokio::task::spawn_blocking(move || {
|
||||
let bytes = prof_ctl.dump_pprof()?;
|
||||
let profile = pprof::decode(&bytes)?;
|
||||
let profile = pprof::symbolize(profile)?;
|
||||
let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS);
|
||||
let mut opts = inferno::flamegraph::Options::default();
|
||||
opts.title = "Heap inuse".to_string();
|
||||
opts.count_name = "bytes".to_string();
|
||||
pprof::flamegraph(profile, &mut opts)
|
||||
})
|
||||
.await
|
||||
.map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
Response::builder()
|
||||
.status(200)
|
||||
.header(CONTENT_TYPE, "image/svg+xml")
|
||||
.body(Body::from(body))
|
||||
.map_err(|err| ApiError::InternalServerError(err.into()))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -260,7 +260,7 @@ impl FromStr for Lsn {
|
||||
{
|
||||
let left_num = u32::from_str_radix(left, 16).map_err(|_| LsnParseError)?;
|
||||
let right_num = u32::from_str_radix(right, 16).map_err(|_| LsnParseError)?;
|
||||
Ok(Lsn(((left_num as u64) << 32) | right_num as u64))
|
||||
Ok(Lsn((left_num as u64) << 32 | right_num as u64))
|
||||
} else {
|
||||
Err(LsnParseError)
|
||||
}
|
||||
|
||||
@@ -1,9 +1,8 @@
|
||||
use anyhow::bail;
|
||||
use flate2::write::{GzDecoder, GzEncoder};
|
||||
use flate2::Compression;
|
||||
use itertools::Itertools as _;
|
||||
use once_cell::sync::Lazy;
|
||||
use pprof::protos::{Function, Line, Location, Message as _, Profile};
|
||||
use pprof::protos::{Function, Line, Message as _, Profile};
|
||||
use regex::Regex;
|
||||
|
||||
use std::borrow::Cow;
|
||||
@@ -189,59 +188,3 @@ pub fn strip_locations(
|
||||
|
||||
profile
|
||||
}
|
||||
|
||||
/// Generates an SVG flamegraph from a symbolized pprof profile.
|
||||
pub fn flamegraph(
|
||||
profile: Profile,
|
||||
opts: &mut inferno::flamegraph::Options,
|
||||
) -> anyhow::Result<Vec<u8>> {
|
||||
if profile.mapping.iter().any(|m| !m.has_functions) {
|
||||
bail!("profile not symbolized");
|
||||
}
|
||||
|
||||
// Index locations, functions, and strings.
|
||||
let locations: HashMap<u64, Location> =
|
||||
profile.location.into_iter().map(|l| (l.id, l)).collect();
|
||||
let functions: HashMap<u64, Function> =
|
||||
profile.function.into_iter().map(|f| (f.id, f)).collect();
|
||||
let strings = profile.string_table;
|
||||
|
||||
// Resolve stacks as function names, and sum sample values per stack. Also reverse the stack,
|
||||
// since inferno expects it bottom-up.
|
||||
let mut stacks: HashMap<Vec<&str>, i64> = HashMap::new();
|
||||
for sample in profile.sample {
|
||||
let mut stack = Vec::with_capacity(sample.location_id.len());
|
||||
for location in sample.location_id.into_iter().rev() {
|
||||
let Some(location) = locations.get(&location) else {
|
||||
bail!("missing location {location}");
|
||||
};
|
||||
for line in location.line.iter().rev() {
|
||||
let Some(function) = functions.get(&line.function_id) else {
|
||||
bail!("missing function {}", line.function_id);
|
||||
};
|
||||
let Some(name) = strings.get(function.name as usize) else {
|
||||
bail!("missing string {}", function.name);
|
||||
};
|
||||
stack.push(name.as_str());
|
||||
}
|
||||
}
|
||||
let Some(&value) = sample.value.first() else {
|
||||
bail!("missing value");
|
||||
};
|
||||
*stacks.entry(stack).or_default() += value;
|
||||
}
|
||||
|
||||
// Construct stack lines for inferno.
|
||||
let lines = stacks
|
||||
.into_iter()
|
||||
.map(|(stack, value)| (stack.into_iter().join(";"), value))
|
||||
.map(|(stack, value)| format!("{stack} {value}"))
|
||||
.sorted()
|
||||
.collect_vec();
|
||||
|
||||
// Construct the flamegraph.
|
||||
let mut bytes = Vec::new();
|
||||
let lines = lines.iter().map(|line| line.as_str());
|
||||
inferno::flamegraph::from_lines(opts, lines, &mut bytes)?;
|
||||
Ok(bytes)
|
||||
}
|
||||
|
||||
@@ -96,11 +96,7 @@ impl<T: Send> Sender<T> {
|
||||
}
|
||||
}
|
||||
State::SenderWaitsForReceiverToConsume(_data) => {
|
||||
// SAFETY: send is single threaded due to `&mut self` requirement,
|
||||
// therefore register is not concurrent.
|
||||
unsafe {
|
||||
self.state.wake_sender.register(cx.waker());
|
||||
}
|
||||
// Really, we shouldn't be polled until receiver has consumed and wakes us.
|
||||
Poll::Pending
|
||||
}
|
||||
State::ReceiverGone => Poll::Ready(Err(SendError::ReceiverGone)),
|
||||
@@ -453,38 +449,4 @@ mod tests {
|
||||
let err = recv_task.await.unwrap().expect_err("should error");
|
||||
assert!(matches!(err, RecvError::SenderGone));
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn test_receiver_drop_while_waiting_for_receiver_to_consume_unblocks_sender() {
|
||||
let (mut sender, receiver) = channel();
|
||||
|
||||
let state = receiver.state.clone();
|
||||
|
||||
sender.send((), |_, _| unreachable!()).await.unwrap();
|
||||
|
||||
assert!(matches!(&*state.value.lock().unwrap(), &State::HasData(_)));
|
||||
|
||||
let unmergeable = sender.send((), |_, _| Err(()));
|
||||
let mut unmergeable = std::pin::pin!(unmergeable);
|
||||
tokio::select! {
|
||||
_ = tokio::time::sleep(FOREVER) => {},
|
||||
_ = &mut unmergeable => {
|
||||
panic!("unmergeable should not complete");
|
||||
},
|
||||
}
|
||||
|
||||
assert!(matches!(
|
||||
&*state.value.lock().unwrap(),
|
||||
&State::SenderWaitsForReceiverToConsume(_)
|
||||
));
|
||||
|
||||
drop(receiver);
|
||||
|
||||
assert!(matches!(
|
||||
&*state.value.lock().unwrap(),
|
||||
&State::ReceiverGone
|
||||
));
|
||||
|
||||
unmergeable.await.unwrap_err();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -95,14 +95,6 @@ impl InterpretedWalRecord {
|
||||
&& self.metadata_record.is_none()
|
||||
&& matches!(self.flush_uncommitted, FlushUncommittedRecords::No)
|
||||
}
|
||||
|
||||
/// Checks if the WAL record is observed (i.e. contains only metadata
|
||||
/// for observed values)
|
||||
pub fn is_observed(&self) -> bool {
|
||||
self.batch.is_observed()
|
||||
&& self.metadata_record.is_none()
|
||||
&& matches!(self.flush_uncommitted, FlushUncommittedRecords::No)
|
||||
}
|
||||
}
|
||||
|
||||
/// The interpreted part of the Postgres WAL record which requires metadata
|
||||
|
||||
@@ -501,11 +501,6 @@ impl SerializedValueBatch {
|
||||
!self.has_data() && self.metadata.is_empty()
|
||||
}
|
||||
|
||||
/// Checks if the batch contains only observed values
|
||||
pub fn is_observed(&self) -> bool {
|
||||
!self.has_data() && !self.metadata.is_empty()
|
||||
}
|
||||
|
||||
/// Checks if the batch contains data
|
||||
///
|
||||
/// Note that if this returns false, it may still contain observed values or
|
||||
|
||||
@@ -8,7 +8,7 @@ license.workspace = true
|
||||
default = []
|
||||
# Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro,
|
||||
# which adds some runtime cost to run tests on outage conditions
|
||||
testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing", "pageserver_client/testing"]
|
||||
testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing"]
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
@@ -44,7 +44,6 @@ postgres_backend.workspace = true
|
||||
postgres-protocol.workspace = true
|
||||
postgres-types.workspace = true
|
||||
postgres_initdb.workspace = true
|
||||
pprof.workspace = true
|
||||
rand.workspace = true
|
||||
range-set-blaze = { version = "0.1.16", features = ["alloc"] }
|
||||
regex.workspace = true
|
||||
@@ -109,11 +108,3 @@ harness = false
|
||||
[[bench]]
|
||||
name = "bench_ingest"
|
||||
harness = false
|
||||
|
||||
[[bench]]
|
||||
name = "upload_queue"
|
||||
harness = false
|
||||
|
||||
[[bin]]
|
||||
name = "test_helper_slow_client_reads"
|
||||
required-features = [ "testing" ]
|
||||
|
||||
@@ -1,87 +0,0 @@
|
||||
//! Upload queue benchmarks.
|
||||
|
||||
use std::str::FromStr as _;
|
||||
use std::sync::atomic::AtomicU32;
|
||||
use std::sync::Arc;
|
||||
|
||||
use criterion::{criterion_group, criterion_main, Bencher, Criterion};
|
||||
use pageserver::tenant::metadata::TimelineMetadata;
|
||||
use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use pageserver::tenant::storage_layer::LayerName;
|
||||
use pageserver::tenant::upload_queue::{Delete, UploadOp, UploadQueue, UploadTask};
|
||||
use pageserver::tenant::IndexPart;
|
||||
use pprof::criterion::{Output, PProfProfiler};
|
||||
use utils::generation::Generation;
|
||||
use utils::shard::{ShardCount, ShardIndex, ShardNumber};
|
||||
|
||||
// Register benchmarks with Criterion.
|
||||
criterion_group!(
|
||||
name = benches;
|
||||
config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
|
||||
targets = bench_upload_queue_next_ready,
|
||||
);
|
||||
criterion_main!(benches);
|
||||
|
||||
/// Benchmarks the cost of UploadQueue::next_ready() with the given number of in-progress tasks
|
||||
/// (which is equivalent to tasks ahead of it in the queue). This has linear cost, and the upload
|
||||
/// queue as a whole is thus quadratic.
|
||||
///
|
||||
/// UploadOp::UploadLayer requires an entire tenant and timeline to construct, so we just test
|
||||
/// Delete and UploadMetadata instead. This is incidentally the most expensive case.
|
||||
fn bench_upload_queue_next_ready(c: &mut Criterion) {
|
||||
let mut g = c.benchmark_group("upload_queue_next_ready");
|
||||
for inprogress in [0, 1, 10, 100, 1_000, 10_000, 100_000, 1_000_000] {
|
||||
g.bench_function(format!("inprogress={inprogress}"), |b| {
|
||||
run_bench(b, inprogress).unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
fn run_bench(b: &mut Bencher, inprogress: usize) -> anyhow::Result<()> {
|
||||
// Construct two layers. layer0 is in the indexes, layer1 will be deleted.
|
||||
let layer0 = LayerName::from_str("000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51").expect("invalid name");
|
||||
let layer1 = LayerName::from_str("100000000000000000000000000000000001-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51").expect("invalid name");
|
||||
|
||||
let metadata = LayerFileMetadata {
|
||||
shard: ShardIndex::new(ShardNumber(1), ShardCount(2)),
|
||||
generation: Generation::Valid(1),
|
||||
file_size: 0,
|
||||
};
|
||||
|
||||
// Construct the (initial and uploaded) index with layer0.
|
||||
let mut index = IndexPart::empty(TimelineMetadata::example());
|
||||
index.layer_metadata.insert(layer0, metadata.clone());
|
||||
|
||||
// Construct the queue.
|
||||
let mut queue = UploadQueue::Uninitialized;
|
||||
let queue = queue.initialize_with_current_remote_index_part(&index, 0)?;
|
||||
|
||||
// Populate inprogress_tasks with a bunch of layer1 deletions.
|
||||
let delete = UploadOp::Delete(Delete {
|
||||
layers: vec![(layer1, metadata)],
|
||||
});
|
||||
|
||||
for task_id in 0..(inprogress as u64) {
|
||||
queue.inprogress_tasks.insert(
|
||||
task_id,
|
||||
Arc::new(UploadTask {
|
||||
task_id,
|
||||
retries: AtomicU32::new(0),
|
||||
op: delete.clone(),
|
||||
coalesced_ops: Vec::new(),
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
// Benchmark index upload scheduling.
|
||||
let index_upload = UploadOp::UploadMetadata {
|
||||
uploaded: Box::new(index),
|
||||
};
|
||||
|
||||
b.iter(|| {
|
||||
queue.queued_operations.push_front(index_upload.clone());
|
||||
assert!(queue.next_ready().is_some());
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -4,9 +4,6 @@ version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[features]
|
||||
testing = [ "pageserver_api/testing" ]
|
||||
|
||||
[dependencies]
|
||||
pageserver_api.workspace = true
|
||||
thiserror.workspace = true
|
||||
|
||||
@@ -1,9 +1,6 @@
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::pin::Pin;
|
||||
|
||||
use futures::{
|
||||
stream::{SplitSink, SplitStream},
|
||||
SinkExt, StreamExt,
|
||||
};
|
||||
use futures::SinkExt;
|
||||
use pageserver_api::{
|
||||
models::{
|
||||
PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest,
|
||||
@@ -13,6 +10,7 @@ use pageserver_api::{
|
||||
};
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio_postgres::CopyOutStream;
|
||||
use tokio_stream::StreamExt;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
@@ -62,30 +60,17 @@ impl Client {
|
||||
) -> anyhow::Result<PagestreamClient> {
|
||||
let copy_both: tokio_postgres::CopyBothDuplex<bytes::Bytes> = self
|
||||
.client
|
||||
.copy_both_simple(&format!("pagestream_v3 {tenant_id} {timeline_id}"))
|
||||
.copy_both_simple(&format!("pagestream_v2 {tenant_id} {timeline_id}"))
|
||||
.await?;
|
||||
let (sink, stream) = copy_both.split(); // TODO: actually support splitting of the CopyBothDuplex so the lock inside this split adaptor goes away.
|
||||
let Client {
|
||||
cancel_on_client_drop,
|
||||
conn_task,
|
||||
client: _,
|
||||
} = self;
|
||||
let shared = Arc::new(Mutex::new(PagestreamShared::ConnTaskRunning(
|
||||
ConnTaskRunning {
|
||||
cancel_on_client_drop,
|
||||
conn_task,
|
||||
},
|
||||
)));
|
||||
Ok(PagestreamClient {
|
||||
sink: PagestreamSender {
|
||||
shared: shared.clone(),
|
||||
sink,
|
||||
},
|
||||
stream: PagestreamReceiver {
|
||||
shared: shared.clone(),
|
||||
stream,
|
||||
},
|
||||
shared,
|
||||
copy_both: Box::pin(copy_both),
|
||||
conn_task,
|
||||
cancel_on_client_drop,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -112,28 +97,7 @@ impl Client {
|
||||
|
||||
/// Create using [`Client::pagestream`].
|
||||
pub struct PagestreamClient {
|
||||
shared: Arc<Mutex<PagestreamShared>>,
|
||||
sink: PagestreamSender,
|
||||
stream: PagestreamReceiver,
|
||||
}
|
||||
|
||||
pub struct PagestreamSender {
|
||||
#[allow(dead_code)]
|
||||
shared: Arc<Mutex<PagestreamShared>>,
|
||||
sink: SplitSink<tokio_postgres::CopyBothDuplex<bytes::Bytes>, bytes::Bytes>,
|
||||
}
|
||||
|
||||
pub struct PagestreamReceiver {
|
||||
#[allow(dead_code)]
|
||||
shared: Arc<Mutex<PagestreamShared>>,
|
||||
stream: SplitStream<tokio_postgres::CopyBothDuplex<bytes::Bytes>>,
|
||||
}
|
||||
|
||||
enum PagestreamShared {
|
||||
ConnTaskRunning(ConnTaskRunning),
|
||||
ConnTaskCancelledJoinHandleReturnedOrDropped,
|
||||
}
|
||||
struct ConnTaskRunning {
|
||||
copy_both: Pin<Box<tokio_postgres::CopyBothDuplex<bytes::Bytes>>>,
|
||||
cancel_on_client_drop: Option<tokio_util::sync::DropGuard>,
|
||||
conn_task: JoinHandle<()>,
|
||||
}
|
||||
@@ -146,11 +110,11 @@ pub struct RelTagBlockNo {
|
||||
impl PagestreamClient {
|
||||
pub async fn shutdown(self) {
|
||||
let Self {
|
||||
shared,
|
||||
sink,
|
||||
stream,
|
||||
} = { self };
|
||||
// The `copy_both` split into `sink` and `stream` contains internal channel sender, the receiver of which is polled by `conn_task`.
|
||||
copy_both,
|
||||
cancel_on_client_drop: cancel_conn_task,
|
||||
conn_task,
|
||||
} = self;
|
||||
// The `copy_both` contains internal channel sender, the receiver of which is polled by `conn_task`.
|
||||
// When `conn_task` observes the sender has been dropped, it sends a `FeMessage::CopyFail` into the connection.
|
||||
// (see https://github.com/neondatabase/rust-postgres/blob/2005bf79573b8add5cf205b52a2b208e356cc8b0/tokio-postgres/src/copy_both.rs#L56).
|
||||
//
|
||||
@@ -167,77 +131,27 @@ impl PagestreamClient {
|
||||
//
|
||||
// NB: page_service doesn't have a use case to exit the `pagestream` mode currently.
|
||||
// => https://github.com/neondatabase/neon/issues/6390
|
||||
let ConnTaskRunning {
|
||||
cancel_on_client_drop,
|
||||
conn_task,
|
||||
} = {
|
||||
let mut guard = shared.lock().unwrap();
|
||||
match std::mem::replace(
|
||||
&mut *guard,
|
||||
PagestreamShared::ConnTaskCancelledJoinHandleReturnedOrDropped,
|
||||
) {
|
||||
PagestreamShared::ConnTaskRunning(conn_task_running) => conn_task_running,
|
||||
PagestreamShared::ConnTaskCancelledJoinHandleReturnedOrDropped => unreachable!(),
|
||||
}
|
||||
};
|
||||
let _ = cancel_on_client_drop.unwrap();
|
||||
let _ = cancel_conn_task.unwrap();
|
||||
conn_task.await.unwrap();
|
||||
|
||||
// Now drop the split copy_both.
|
||||
drop(sink);
|
||||
drop(stream);
|
||||
}
|
||||
|
||||
pub fn split(self) -> (PagestreamSender, PagestreamReceiver) {
|
||||
let Self {
|
||||
shared: _,
|
||||
sink,
|
||||
stream,
|
||||
} = self;
|
||||
(sink, stream)
|
||||
drop(copy_both);
|
||||
}
|
||||
|
||||
pub async fn getpage(
|
||||
&mut self,
|
||||
req: PagestreamGetPageRequest,
|
||||
) -> anyhow::Result<PagestreamGetPageResponse> {
|
||||
self.getpage_send(req).await?;
|
||||
self.getpage_recv().await
|
||||
}
|
||||
let req = PagestreamFeMessage::GetPage(req);
|
||||
let req: bytes::Bytes = req.serialize();
|
||||
// let mut req = tokio_util::io::ReaderStream::new(&req);
|
||||
let mut req = tokio_stream::once(Ok(req));
|
||||
|
||||
pub async fn getpage_send(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> {
|
||||
self.sink.getpage_send(req).await
|
||||
}
|
||||
self.copy_both.send_all(&mut req).await?;
|
||||
|
||||
pub async fn getpage_recv(&mut self) -> anyhow::Result<PagestreamGetPageResponse> {
|
||||
self.stream.getpage_recv().await
|
||||
}
|
||||
}
|
||||
|
||||
impl PagestreamSender {
|
||||
// TODO: maybe make this impl Sink instead for better composability?
|
||||
pub async fn send(&mut self, msg: PagestreamFeMessage) -> anyhow::Result<()> {
|
||||
let msg = msg.serialize();
|
||||
self.sink.send_all(&mut tokio_stream::once(Ok(msg))).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn getpage_send(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> {
|
||||
self.send(PagestreamFeMessage::GetPage(req)).await
|
||||
}
|
||||
}
|
||||
|
||||
impl PagestreamReceiver {
|
||||
// TODO: maybe make this impl Stream instead for better composability?
|
||||
pub async fn recv(&mut self) -> anyhow::Result<PagestreamBeMessage> {
|
||||
let next: Option<Result<bytes::Bytes, _>> = self.stream.next().await;
|
||||
let next: Option<Result<bytes::Bytes, _>> = self.copy_both.next().await;
|
||||
let next: bytes::Bytes = next.unwrap()?;
|
||||
PagestreamBeMessage::deserialize(next)
|
||||
}
|
||||
|
||||
pub async fn getpage_recv(&mut self) -> anyhow::Result<PagestreamGetPageResponse> {
|
||||
let next: PagestreamBeMessage = self.recv().await?;
|
||||
match next {
|
||||
let msg = PagestreamBeMessage::deserialize(next)?;
|
||||
match msg {
|
||||
PagestreamBeMessage::GetPage(p) => Ok(p),
|
||||
PagestreamBeMessage::Error(e) => anyhow::bail!("Error: {:?}", e),
|
||||
PagestreamBeMessage::Exists(_)
|
||||
@@ -246,14 +160,7 @@ impl PagestreamReceiver {
|
||||
| PagestreamBeMessage::GetSlruSegment(_) => {
|
||||
anyhow::bail!(
|
||||
"unexpected be message kind in response to getpage request: {}",
|
||||
next.kind()
|
||||
)
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
PagestreamBeMessage::Test(_) => {
|
||||
anyhow::bail!(
|
||||
"unexpected be message kind in response to getpage request: {}",
|
||||
next.kind()
|
||||
msg.kind()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@ use anyhow::Context;
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::keyspace::KeySpaceAccum;
|
||||
use pageserver_api::models::{PagestreamGetPageRequest, PagestreamRequest};
|
||||
use pageserver_api::models::PagestreamGetPageRequest;
|
||||
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -322,15 +322,12 @@ async fn main_impl(
|
||||
.to_rel_block()
|
||||
.expect("we filter non-rel-block keys out above");
|
||||
PagestreamGetPageRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid: 0,
|
||||
request_lsn: if rng.gen_bool(args.req_latest_probability) {
|
||||
Lsn::MAX
|
||||
} else {
|
||||
r.timeline_lsn
|
||||
},
|
||||
not_modified_since: r.timeline_lsn,
|
||||
request_lsn: if rng.gen_bool(args.req_latest_probability) {
|
||||
Lsn::MAX
|
||||
} else {
|
||||
r.timeline_lsn
|
||||
},
|
||||
not_modified_since: r.timeline_lsn,
|
||||
rel: rel_tag,
|
||||
blkno: block_no,
|
||||
}
|
||||
|
||||
@@ -53,12 +53,10 @@ project_build_tag!(BUILD_TAG);
|
||||
#[global_allocator]
|
||||
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
|
||||
|
||||
/// Configure jemalloc to profile heap allocations by sampling stack traces every 2 MB (1 << 21).
|
||||
/// This adds roughly 3% overhead for allocations on average, which is acceptable considering
|
||||
/// performance-sensitive code will avoid allocations as far as possible anyway.
|
||||
/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20).
|
||||
#[allow(non_upper_case_globals)]
|
||||
#[export_name = "malloc_conf"]
|
||||
pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0";
|
||||
pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0";
|
||||
|
||||
const PID_FILE_NAME: &str = "pageserver.pid";
|
||||
|
||||
|
||||
@@ -1,65 +0,0 @@
|
||||
use std::{
|
||||
io::{stdin, stdout, Read, Write},
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use clap::Parser;
|
||||
use pageserver_api::models::{PagestreamRequest, PagestreamTestRequest};
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
#[derive(clap::Parser)]
|
||||
struct Args {
|
||||
connstr: String,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let Args {
|
||||
connstr,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
} = Args::parse();
|
||||
let client = pageserver_client::page_service::Client::new(connstr).await?;
|
||||
let client = client.pagestream(tenant_id, timeline_id).await?;
|
||||
let (mut sender, _receiver) = client.split();
|
||||
|
||||
eprintln!("filling the pipe");
|
||||
let mut msg = 0;
|
||||
loop {
|
||||
msg += 1;
|
||||
let fut = sender.send(pageserver_api::models::PagestreamFeMessage::Test(
|
||||
PagestreamTestRequest {
|
||||
hdr: PagestreamRequest {
|
||||
reqid: 0,
|
||||
request_lsn: Lsn(23),
|
||||
not_modified_since: Lsn(23),
|
||||
},
|
||||
batch_key: 42,
|
||||
message: format!("message {}", msg),
|
||||
},
|
||||
));
|
||||
let Ok(res) = tokio::time::timeout(Duration::from_secs(10), fut).await else {
|
||||
eprintln!("pipe seems full");
|
||||
break;
|
||||
};
|
||||
let _: () = res?;
|
||||
}
|
||||
|
||||
let n = stdout().write(b"R")?;
|
||||
assert_eq!(n, 1);
|
||||
stdout().flush()?;
|
||||
|
||||
eprintln!("waiting for signal to tell us to exit");
|
||||
|
||||
let mut buf = [0u8; 1];
|
||||
stdin().read_exact(&mut buf)?;
|
||||
|
||||
eprintln!("termination signal received, exiting");
|
||||
|
||||
anyhow::Ok(())
|
||||
}
|
||||
@@ -97,8 +97,8 @@ use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
|
||||
use crate::DEFAULT_PG_VERSION;
|
||||
use crate::{disk_usage_eviction_task, tenant};
|
||||
use pageserver_api::models::{
|
||||
StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest,
|
||||
TimelineInfo,
|
||||
CompactInfoResponse, StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest,
|
||||
TimelineGcRequest, TimelineInfo,
|
||||
};
|
||||
use utils::{
|
||||
auth::SwappableJwtAuth,
|
||||
@@ -2052,7 +2052,15 @@ async fn timeline_compact_info_handler(
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
let resp = tenant.get_scheduled_compaction_tasks(timeline_id);
|
||||
let res = tenant.get_scheduled_compaction_tasks(timeline_id);
|
||||
let mut resp = Vec::new();
|
||||
for item in res {
|
||||
resp.push(CompactInfoResponse {
|
||||
compact_key_range: item.compact_key_range,
|
||||
compact_lsn_range: item.compact_lsn_range,
|
||||
sub_compaction: item.sub_compaction,
|
||||
});
|
||||
}
|
||||
json_response(StatusCode::OK, resp)
|
||||
}
|
||||
.instrument(info_span!("timeline_compact_info", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
|
||||
|
||||
@@ -91,6 +91,15 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_layers_visited_per_read_global",
|
||||
"Number of layers visited to reconstruct one key",
|
||||
vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_layers_visited_per_vectored_read_global",
|
||||
@@ -1224,189 +1233,117 @@ pub(crate) struct SmgrOpTimerInner {
|
||||
global_flush_in_progress_micros: IntCounter,
|
||||
per_timeline_flush_in_progress_micros: IntCounter,
|
||||
|
||||
throttling: Arc<tenant_throttling::Pagestream>,
|
||||
|
||||
timings: SmgrOpTimerState,
|
||||
}
|
||||
|
||||
/// The stages of request processing are represented by the enum variants.
|
||||
/// Used as part of [`SmgrOpTimerInner::timings`].
|
||||
///
|
||||
/// Request processing calls into the `SmgrOpTimer::observe_*` methods at the
|
||||
/// transition points.
|
||||
/// These methods bump relevant counters and then update [`SmgrOpTimerInner::timings`]
|
||||
/// to the next state.
|
||||
///
|
||||
/// Each request goes through every stage, in all configurations.
|
||||
///
|
||||
#[derive(Debug)]
|
||||
enum SmgrOpTimerState {
|
||||
Received {
|
||||
// In the future, we may want to track the full time the request spent
|
||||
// inside pageserver process (time spent in kernel buffers can't be tracked).
|
||||
// `received_at` would be used for that.
|
||||
#[allow(dead_code)]
|
||||
received_at: Instant,
|
||||
},
|
||||
Throttling {
|
||||
ThrottleDoneExecutionStarting {
|
||||
received_at: Instant,
|
||||
throttle_started_at: Instant,
|
||||
started_execution_at: Instant,
|
||||
},
|
||||
Batching {
|
||||
throttle_done_at: Instant,
|
||||
},
|
||||
Executing {
|
||||
execution_started_at: Instant,
|
||||
},
|
||||
Flushing,
|
||||
// NB: when adding observation points, remember to update the Drop impl.
|
||||
}
|
||||
|
||||
// NB: when adding observation points, remember to update the Drop impl.
|
||||
impl SmgrOpTimer {
|
||||
/// See [`SmgrOpTimerState`] for more context.
|
||||
pub(crate) fn observe_throttle_start(&mut self, at: Instant) {
|
||||
let Some(inner) = self.0.as_mut() else {
|
||||
return;
|
||||
};
|
||||
let SmgrOpTimerState::Received { received_at: _ } = &mut inner.timings else {
|
||||
return;
|
||||
};
|
||||
inner.throttling.count_accounted_start.inc();
|
||||
inner.timings = SmgrOpTimerState::Throttling {
|
||||
throttle_started_at: at,
|
||||
};
|
||||
}
|
||||
|
||||
/// See [`SmgrOpTimerState`] for more context.
|
||||
pub(crate) fn observe_throttle_done(&mut self, throttle: ThrottleResult) {
|
||||
let Some(inner) = self.0.as_mut() else {
|
||||
return;
|
||||
};
|
||||
let SmgrOpTimerState::Throttling {
|
||||
throttle_started_at,
|
||||
} = &inner.timings
|
||||
else {
|
||||
return;
|
||||
};
|
||||
inner.throttling.count_accounted_finish.inc();
|
||||
match throttle {
|
||||
ThrottleResult::NotThrottled { end } => {
|
||||
inner.timings = SmgrOpTimerState::Batching {
|
||||
throttle_done_at: end,
|
||||
};
|
||||
}
|
||||
ThrottleResult::Throttled { end } => {
|
||||
// update metrics
|
||||
inner.throttling.count_throttled.inc();
|
||||
inner
|
||||
.throttling
|
||||
.wait_time
|
||||
.inc_by((end - *throttle_started_at).as_micros().try_into().unwrap());
|
||||
// state transition
|
||||
inner.timings = SmgrOpTimerState::Batching {
|
||||
throttle_done_at: end,
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// See [`SmgrOpTimerState`] for more context.
|
||||
pub(crate) fn observe_execution_start(&mut self, at: Instant) {
|
||||
let Some(inner) = self.0.as_mut() else {
|
||||
return;
|
||||
};
|
||||
let SmgrOpTimerState::Batching { throttle_done_at } = &inner.timings else {
|
||||
return;
|
||||
};
|
||||
// update metrics
|
||||
let batch = at - *throttle_done_at;
|
||||
inner.global_batch_wait_time.observe(batch.as_secs_f64());
|
||||
inner
|
||||
.per_timeline_batch_wait_time
|
||||
.observe(batch.as_secs_f64());
|
||||
// state transition
|
||||
inner.timings = SmgrOpTimerState::Executing {
|
||||
execution_started_at: at,
|
||||
}
|
||||
}
|
||||
|
||||
/// For all but the first caller, this is a no-op.
|
||||
/// The first callers receives Some, subsequent ones None.
|
||||
///
|
||||
/// See [`SmgrOpTimerState`] for more context.
|
||||
pub(crate) fn observe_execution_end_flush_start(
|
||||
&mut self,
|
||||
at: Instant,
|
||||
) -> Option<SmgrOpFlushInProgress> {
|
||||
// NB: unlike the other observe_* methods, this one take()s.
|
||||
#[allow(clippy::question_mark)] // maintain similar code pattern.
|
||||
let Some(mut inner) = self.0.take() else {
|
||||
return None;
|
||||
};
|
||||
let SmgrOpTimerState::Executing {
|
||||
execution_started_at,
|
||||
} = &inner.timings
|
||||
else {
|
||||
return None;
|
||||
};
|
||||
// update metrics
|
||||
let execution = at - *execution_started_at;
|
||||
inner
|
||||
.global_execution_latency_histo
|
||||
.observe(execution.as_secs_f64());
|
||||
if let Some(per_timeline_execution_latency_histo) =
|
||||
&inner.per_timeline_execution_latency_histo
|
||||
{
|
||||
per_timeline_execution_latency_histo.observe(execution.as_secs_f64());
|
||||
}
|
||||
|
||||
// state transition
|
||||
inner.timings = SmgrOpTimerState::Flushing;
|
||||
|
||||
// return the flush in progress object which
|
||||
// will do the remaining metrics updates
|
||||
let SmgrOpTimerInner {
|
||||
global_flush_in_progress_micros,
|
||||
per_timeline_flush_in_progress_micros,
|
||||
..
|
||||
} = inner;
|
||||
Some(SmgrOpFlushInProgress {
|
||||
flush_started_at: at,
|
||||
global_micros: global_flush_in_progress_micros,
|
||||
per_timeline_micros: per_timeline_flush_in_progress_micros,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// The last stage of request processing is serializing and flushing the request
|
||||
/// into the TCP connection. We want to make slow flushes observable
|
||||
/// _while they are occuring_, so this struct provides a wrapper method [`Self::measure`]
|
||||
/// to periodically bump the metric.
|
||||
///
|
||||
/// If in the future we decide that we're not interested in live updates, we can
|
||||
/// add another `observe_*` method to [`SmgrOpTimer`], follow the existing pattern there,
|
||||
/// and remove this struct from the code base.
|
||||
pub(crate) struct SmgrOpFlushInProgress {
|
||||
flush_started_at: Instant,
|
||||
global_micros: IntCounter,
|
||||
per_timeline_micros: IntCounter,
|
||||
}
|
||||
|
||||
impl SmgrOpTimer {
|
||||
pub(crate) fn observe_throttle_done_execution_starting(&mut self, throttle: &ThrottleResult) {
|
||||
let inner = self.0.as_mut().expect("other public methods consume self");
|
||||
match (&mut inner.timings, throttle) {
|
||||
(SmgrOpTimerState::Received { received_at }, throttle) => match throttle {
|
||||
ThrottleResult::NotThrottled { start } => {
|
||||
inner.timings = SmgrOpTimerState::ThrottleDoneExecutionStarting {
|
||||
received_at: *received_at,
|
||||
throttle_started_at: *start,
|
||||
started_execution_at: *start,
|
||||
};
|
||||
}
|
||||
ThrottleResult::Throttled { start, end } => {
|
||||
inner.timings = SmgrOpTimerState::ThrottleDoneExecutionStarting {
|
||||
received_at: *start,
|
||||
throttle_started_at: *start,
|
||||
started_execution_at: *end,
|
||||
};
|
||||
}
|
||||
},
|
||||
(x, _) => panic!("called in unexpected state: {x:?}"),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn observe_smgr_op_completion_and_start_flushing(mut self) -> SmgrOpFlushInProgress {
|
||||
let (flush_start, inner) = self
|
||||
.smgr_op_end()
|
||||
.expect("this method consume self, and the only other caller is drop handler");
|
||||
let SmgrOpTimerInner {
|
||||
global_flush_in_progress_micros,
|
||||
per_timeline_flush_in_progress_micros,
|
||||
..
|
||||
} = inner;
|
||||
SmgrOpFlushInProgress {
|
||||
flush_started_at: flush_start,
|
||||
global_micros: global_flush_in_progress_micros,
|
||||
per_timeline_micros: per_timeline_flush_in_progress_micros,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `None`` if this method has already been called, `Some` otherwise.
|
||||
fn smgr_op_end(&mut self) -> Option<(Instant, SmgrOpTimerInner)> {
|
||||
let inner = self.0.take()?;
|
||||
|
||||
let now = Instant::now();
|
||||
|
||||
let batch;
|
||||
let execution;
|
||||
let throttle;
|
||||
match inner.timings {
|
||||
SmgrOpTimerState::Received { received_at } => {
|
||||
batch = (now - received_at).as_secs_f64();
|
||||
// TODO: use label for dropped requests.
|
||||
// This is quite rare in practice, only during tenant/pageservers shutdown.
|
||||
throttle = Duration::ZERO;
|
||||
execution = Duration::ZERO.as_secs_f64();
|
||||
}
|
||||
SmgrOpTimerState::ThrottleDoneExecutionStarting {
|
||||
received_at,
|
||||
throttle_started_at,
|
||||
started_execution_at,
|
||||
} => {
|
||||
batch = (throttle_started_at - received_at).as_secs_f64();
|
||||
throttle = started_execution_at - throttle_started_at;
|
||||
execution = (now - started_execution_at).as_secs_f64();
|
||||
}
|
||||
}
|
||||
|
||||
// update time spent in batching
|
||||
inner.global_batch_wait_time.observe(batch);
|
||||
inner.per_timeline_batch_wait_time.observe(batch);
|
||||
|
||||
// time spent in throttle metric is updated by throttle impl
|
||||
let _ = throttle;
|
||||
|
||||
// update metrics for execution latency
|
||||
inner.global_execution_latency_histo.observe(execution);
|
||||
if let Some(per_timeline_execution_latency_histo) =
|
||||
&inner.per_timeline_execution_latency_histo
|
||||
{
|
||||
per_timeline_execution_latency_histo.observe(execution);
|
||||
}
|
||||
|
||||
Some((now, inner))
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for SmgrOpTimer {
|
||||
fn drop(&mut self) {
|
||||
// In case of early drop, update any of the remaining metrics with
|
||||
// observations so that (started,finished) counter pairs balance out
|
||||
// and all counters on the latency path have the the same number of
|
||||
// observations.
|
||||
// It's technically lying and it would be better if each metric had
|
||||
// a separate label or similar for cancelled requests.
|
||||
// But we don't have that right now and counter pairs balancing
|
||||
// out is useful when using the metrics in panels and whatnot.
|
||||
let now = Instant::now();
|
||||
self.observe_throttle_start(now);
|
||||
self.observe_throttle_done(ThrottleResult::NotThrottled { end: now });
|
||||
self.observe_execution_start(now);
|
||||
self.observe_execution_end_flush_start(now);
|
||||
self.smgr_op_end();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1417,12 +1354,12 @@ impl SmgrOpFlushInProgress {
|
||||
{
|
||||
let mut fut = std::pin::pin!(fut);
|
||||
|
||||
let now = Instant::now();
|
||||
// Whenever observe_guard gets called, or dropped,
|
||||
// it adds the time elapsed since its last call to metrics.
|
||||
// Last call is tracked in `now`.
|
||||
let mut observe_guard = scopeguard::guard(
|
||||
|| {
|
||||
let now = Instant::now();
|
||||
let elapsed = now - self.flush_started_at;
|
||||
self.global_micros
|
||||
.inc_by(u64::try_from(elapsed.as_micros()).unwrap());
|
||||
@@ -1463,10 +1400,9 @@ pub enum SmgrQueryType {
|
||||
GetPageAtLsn,
|
||||
GetDbSize,
|
||||
GetSlruSegment,
|
||||
#[cfg(feature = "testing")]
|
||||
Test,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct SmgrQueryTimePerTimeline {
|
||||
global_started: [IntCounter; SmgrQueryType::COUNT],
|
||||
global_latency: [Histogram; SmgrQueryType::COUNT],
|
||||
@@ -1478,7 +1414,6 @@ pub(crate) struct SmgrQueryTimePerTimeline {
|
||||
per_timeline_flush_in_progress_micros: IntCounter,
|
||||
global_batch_wait_time: Histogram,
|
||||
per_timeline_batch_wait_time: Histogram,
|
||||
throttling: Arc<tenant_throttling::Pagestream>,
|
||||
}
|
||||
|
||||
static SMGR_QUERY_STARTED_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
@@ -1684,11 +1619,7 @@ static PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL: Lazy<Histogram> = Lazy::new(||
|
||||
});
|
||||
|
||||
impl SmgrQueryTimePerTimeline {
|
||||
pub(crate) fn new(
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline_id: &TimelineId,
|
||||
pagestream_throttle_metrics: Arc<tenant_throttling::Pagestream>,
|
||||
) -> Self {
|
||||
pub(crate) fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
|
||||
let tenant_id = tenant_shard_id.tenant_id.to_string();
|
||||
let shard_slug = format!("{}", tenant_shard_id.shard_slug());
|
||||
let timeline_id = timeline_id.to_string();
|
||||
@@ -1749,7 +1680,6 @@ impl SmgrQueryTimePerTimeline {
|
||||
per_timeline_flush_in_progress_micros,
|
||||
global_batch_wait_time,
|
||||
per_timeline_batch_wait_time,
|
||||
throttling: pagestream_throttle_metrics,
|
||||
}
|
||||
}
|
||||
pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, received_at: Instant) -> SmgrOpTimer {
|
||||
@@ -1765,24 +1695,88 @@ impl SmgrQueryTimePerTimeline {
|
||||
SmgrOpTimer(Some(SmgrOpTimerInner {
|
||||
global_execution_latency_histo: self.global_latency[op as usize].clone(),
|
||||
per_timeline_execution_latency_histo: per_timeline_latency_histo,
|
||||
timings: SmgrOpTimerState::Received { received_at },
|
||||
global_flush_in_progress_micros: self.global_flush_in_progress_micros.clone(),
|
||||
per_timeline_flush_in_progress_micros: self
|
||||
.per_timeline_flush_in_progress_micros
|
||||
.clone(),
|
||||
global_batch_wait_time: self.global_batch_wait_time.clone(),
|
||||
per_timeline_batch_wait_time: self.per_timeline_batch_wait_time.clone(),
|
||||
throttling: self.throttling.clone(),
|
||||
timings: SmgrOpTimerState::Received { received_at },
|
||||
}))
|
||||
}
|
||||
|
||||
/// TODO: do something about this? seems odd, we have a similar call on SmgrOpTimer
|
||||
pub(crate) fn observe_getpage_batch_start(&self, batch_size: usize) {
|
||||
self.global_batch_size.observe(batch_size as f64);
|
||||
self.per_timeline_batch_size.observe(batch_size as f64);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod smgr_query_time_tests {
|
||||
use std::time::Instant;
|
||||
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use strum::IntoEnumIterator;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
// Regression test, we used hard-coded string constants before using an enum.
|
||||
#[test]
|
||||
fn op_label_name() {
|
||||
use super::SmgrQueryType::*;
|
||||
let expect: [(super::SmgrQueryType, &'static str); 5] = [
|
||||
(GetRelExists, "get_rel_exists"),
|
||||
(GetRelSize, "get_rel_size"),
|
||||
(GetPageAtLsn, "get_page_at_lsn"),
|
||||
(GetDbSize, "get_db_size"),
|
||||
(GetSlruSegment, "get_slru_segment"),
|
||||
];
|
||||
for (op, expect) in expect {
|
||||
let actual: &'static str = op.into();
|
||||
assert_eq!(actual, expect);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn basic() {
|
||||
let ops: Vec<_> = super::SmgrQueryType::iter().collect();
|
||||
|
||||
for op in &ops {
|
||||
let tenant_id = TenantId::generate();
|
||||
let timeline_id = TimelineId::generate();
|
||||
let metrics = super::SmgrQueryTimePerTimeline::new(
|
||||
&TenantShardId::unsharded(tenant_id),
|
||||
&timeline_id,
|
||||
);
|
||||
|
||||
let get_counts = || {
|
||||
let global: u64 = ops
|
||||
.iter()
|
||||
.map(|op| metrics.global_latency[*op as usize].get_sample_count())
|
||||
.sum();
|
||||
(
|
||||
global,
|
||||
metrics.per_timeline_getpage_latency.get_sample_count(),
|
||||
)
|
||||
};
|
||||
|
||||
let (pre_global, pre_per_tenant_timeline) = get_counts();
|
||||
assert_eq!(pre_per_tenant_timeline, 0);
|
||||
|
||||
let timer = metrics.start_smgr_op(*op, Instant::now());
|
||||
drop(timer);
|
||||
|
||||
let (post_global, post_per_tenant_timeline) = get_counts();
|
||||
if matches!(op, super::SmgrQueryType::GetPageAtLsn) {
|
||||
// getpage ops are tracked per-timeline, others aren't
|
||||
assert_eq!(post_per_tenant_timeline, 1);
|
||||
} else {
|
||||
assert_eq!(post_per_tenant_timeline, 0);
|
||||
}
|
||||
assert!(post_global > pre_global);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// keep in sync with control plane Go code so that we can validate
|
||||
// compute's basebackup_ms metric with our perspective in the context of SLI/SLO.
|
||||
static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
|
||||
@@ -1860,7 +1854,6 @@ pub(crate) static LIVE_CONNECTIONS: Lazy<IntCounterPairVec> = Lazy::new(|| {
|
||||
|
||||
#[derive(Clone, Copy, enum_map::Enum, IntoStaticStr)]
|
||||
pub(crate) enum ComputeCommandKind {
|
||||
PageStreamV3,
|
||||
PageStreamV2,
|
||||
Basebackup,
|
||||
Fullbackup,
|
||||
@@ -2344,15 +2337,13 @@ macro_rules! redo_bytes_histogram_count_buckets {
|
||||
pub(crate) struct WalIngestMetrics {
|
||||
pub(crate) bytes_received: IntCounter,
|
||||
pub(crate) records_received: IntCounter,
|
||||
pub(crate) records_observed: IntCounter,
|
||||
pub(crate) records_committed: IntCounter,
|
||||
pub(crate) records_filtered: IntCounter,
|
||||
pub(crate) gap_blocks_zeroed_on_rel_extend: IntCounter,
|
||||
pub(crate) clear_vm_bits_unknown: IntCounterVec,
|
||||
}
|
||||
|
||||
pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| {
|
||||
WalIngestMetrics {
|
||||
pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
|
||||
bytes_received: register_int_counter!(
|
||||
"pageserver_wal_ingest_bytes_received",
|
||||
"Bytes of WAL ingested from safekeepers",
|
||||
@@ -2363,11 +2354,6 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| {
|
||||
"Number of WAL records received from safekeepers"
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
records_observed: register_int_counter!(
|
||||
"pageserver_wal_ingest_records_observed",
|
||||
"Number of WAL records observed from safekeepers. These are metadata only records for shard 0."
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
records_committed: register_int_counter!(
|
||||
"pageserver_wal_ingest_records_committed",
|
||||
"Number of WAL records which resulted in writes to pageserver storage"
|
||||
@@ -2389,7 +2375,6 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| {
|
||||
&["entity"],
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
}
|
||||
});
|
||||
|
||||
pub(crate) static PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
@@ -3578,7 +3563,9 @@ pub(crate) mod tenant_throttling {
|
||||
use once_cell::sync::Lazy;
|
||||
use utils::shard::TenantShardId;
|
||||
|
||||
pub(crate) struct GlobalAndPerTenantIntCounter {
|
||||
use crate::tenant::{self};
|
||||
|
||||
struct GlobalAndPerTenantIntCounter {
|
||||
global: IntCounter,
|
||||
per_tenant: IntCounter,
|
||||
}
|
||||
@@ -3596,10 +3583,10 @@ pub(crate) mod tenant_throttling {
|
||||
}
|
||||
|
||||
pub(crate) struct Metrics<const KIND: usize> {
|
||||
pub(super) count_accounted_start: GlobalAndPerTenantIntCounter,
|
||||
pub(super) count_accounted_finish: GlobalAndPerTenantIntCounter,
|
||||
pub(super) wait_time: GlobalAndPerTenantIntCounter,
|
||||
pub(super) count_throttled: GlobalAndPerTenantIntCounter,
|
||||
count_accounted_start: GlobalAndPerTenantIntCounter,
|
||||
count_accounted_finish: GlobalAndPerTenantIntCounter,
|
||||
wait_time: GlobalAndPerTenantIntCounter,
|
||||
count_throttled: GlobalAndPerTenantIntCounter,
|
||||
}
|
||||
|
||||
static COUNT_ACCOUNTED_START: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
|
||||
@@ -3734,6 +3721,26 @@ pub(crate) mod tenant_throttling {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<const KIND: usize> tenant::throttle::Metric for Metrics<KIND> {
|
||||
#[inline(always)]
|
||||
fn accounting_start(&self) {
|
||||
self.count_accounted_start.inc();
|
||||
}
|
||||
#[inline(always)]
|
||||
fn accounting_finish(&self) {
|
||||
self.count_accounted_finish.inc();
|
||||
}
|
||||
#[inline(always)]
|
||||
fn observe_throttling(
|
||||
&self,
|
||||
tenant::throttle::Observation { wait_time }: &tenant::throttle::Observation,
|
||||
) {
|
||||
let val = u64::try_from(wait_time.as_micros()).unwrap();
|
||||
self.wait_time.inc_by(val);
|
||||
self.count_throttled.inc();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) mod disk_usage_based_eviction {
|
||||
@@ -3878,6 +3885,7 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) {
|
||||
|
||||
// histograms
|
||||
[
|
||||
&READ_NUM_LAYERS_VISITED,
|
||||
&VEC_READ_NUM_LAYERS_VISITED,
|
||||
&WAIT_LSN_TIME,
|
||||
&WAL_REDO_TIME,
|
||||
|
||||
@@ -17,7 +17,7 @@ use pageserver_api::models::{
|
||||
PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
|
||||
PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetSlruSegmentRequest,
|
||||
PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest, PagestreamNblocksResponse,
|
||||
PagestreamProtocolVersion, PagestreamRequest,
|
||||
PagestreamProtocolVersion,
|
||||
};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use postgres_backend::{
|
||||
@@ -67,7 +67,7 @@ use crate::tenant::PageReconstructError;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::{basebackup, timed_after_cancellation};
|
||||
use pageserver_api::key::rel_block_to_key;
|
||||
use pageserver_api::reltag::SlruKind;
|
||||
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
|
||||
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
|
||||
use postgres_ffi::BLCKSZ;
|
||||
|
||||
@@ -537,29 +537,6 @@ impl From<WaitLsnError> for QueryError {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
struct BatchedPageStreamError {
|
||||
req: PagestreamRequest,
|
||||
err: PageStreamError,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for BatchedPageStreamError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
self.err.fmt(f)
|
||||
}
|
||||
}
|
||||
|
||||
struct BatchedGetPageRequest {
|
||||
req: PagestreamGetPageRequest,
|
||||
timer: SmgrOpTimer,
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
struct BatchedTestRequest {
|
||||
req: models::PagestreamTestRequest,
|
||||
timer: SmgrOpTimer,
|
||||
}
|
||||
|
||||
enum BatchedFeMessage {
|
||||
Exists {
|
||||
span: Span,
|
||||
@@ -577,7 +554,7 @@ enum BatchedFeMessage {
|
||||
span: Span,
|
||||
shard: timeline::handle::Handle<TenantManagerTypes>,
|
||||
effective_request_lsn: Lsn,
|
||||
pages: smallvec::SmallVec<[BatchedGetPageRequest; 1]>,
|
||||
pages: smallvec::SmallVec<[(RelTag, BlockNumber, SmgrOpTimer); 1]>,
|
||||
},
|
||||
DbSize {
|
||||
span: Span,
|
||||
@@ -591,40 +568,47 @@ enum BatchedFeMessage {
|
||||
shard: timeline::handle::Handle<TenantManagerTypes>,
|
||||
req: models::PagestreamGetSlruSegmentRequest,
|
||||
},
|
||||
#[cfg(feature = "testing")]
|
||||
Test {
|
||||
span: Span,
|
||||
shard: timeline::handle::Handle<TenantManagerTypes>,
|
||||
requests: Vec<BatchedTestRequest>,
|
||||
},
|
||||
RespondError {
|
||||
span: Span,
|
||||
error: BatchedPageStreamError,
|
||||
error: PageStreamError,
|
||||
},
|
||||
}
|
||||
|
||||
impl BatchedFeMessage {
|
||||
fn observe_execution_start(&mut self, at: Instant) {
|
||||
match self {
|
||||
BatchedFeMessage::Exists { timer, .. }
|
||||
| BatchedFeMessage::Nblocks { timer, .. }
|
||||
| BatchedFeMessage::DbSize { timer, .. }
|
||||
| BatchedFeMessage::GetSlruSegment { timer, .. } => {
|
||||
timer.observe_execution_start(at);
|
||||
async fn throttle_and_record_start_processing(
|
||||
&mut self,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), QueryError> {
|
||||
let (shard, tokens, timers) = match self {
|
||||
BatchedFeMessage::Exists { shard, timer, .. }
|
||||
| BatchedFeMessage::Nblocks { shard, timer, .. }
|
||||
| BatchedFeMessage::DbSize { shard, timer, .. }
|
||||
| BatchedFeMessage::GetSlruSegment { shard, timer, .. } => {
|
||||
(
|
||||
shard,
|
||||
// 1 token is probably under-estimating because these
|
||||
// request handlers typically do several Timeline::get calls.
|
||||
1,
|
||||
itertools::Either::Left(std::iter::once(timer)),
|
||||
)
|
||||
}
|
||||
BatchedFeMessage::GetPage { pages, .. } => {
|
||||
for page in pages {
|
||||
page.timer.observe_execution_start(at);
|
||||
}
|
||||
BatchedFeMessage::GetPage { shard, pages, .. } => (
|
||||
shard,
|
||||
pages.len(),
|
||||
itertools::Either::Right(pages.iter_mut().map(|(_, _, timer)| timer)),
|
||||
),
|
||||
BatchedFeMessage::RespondError { .. } => return Ok(()),
|
||||
};
|
||||
let throttled = tokio::select! {
|
||||
throttled = shard.pagestream_throttle.throttle(tokens) => { throttled }
|
||||
_ = cancel.cancelled() => {
|
||||
return Err(QueryError::Shutdown);
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
BatchedFeMessage::Test { requests, .. } => {
|
||||
for req in requests {
|
||||
req.timer.observe_execution_start(at);
|
||||
}
|
||||
}
|
||||
BatchedFeMessage::RespondError { .. } => {}
|
||||
};
|
||||
for timer in timers {
|
||||
timer.observe_throttle_done_execution_starting(&throttled);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -670,7 +654,6 @@ impl PageServerHandler {
|
||||
)
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn pagestream_read_message<IO>(
|
||||
pgb: &mut PostgresBackendReader<IO>,
|
||||
tenant_id: TenantId,
|
||||
@@ -678,7 +661,6 @@ impl PageServerHandler {
|
||||
timeline_handles: &mut TimelineHandles,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
protocol_version: PagestreamProtocolVersion,
|
||||
parent_span: Span,
|
||||
) -> Result<Option<BatchedFeMessage>, QueryError>
|
||||
where
|
||||
@@ -713,42 +695,18 @@ impl PageServerHandler {
|
||||
fail::fail_point!("ps::handle-pagerequest-message");
|
||||
|
||||
// parse request
|
||||
let neon_fe_msg =
|
||||
PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?;
|
||||
|
||||
// TODO: turn in to async closure once available to avoid repeating received_at
|
||||
async fn record_op_start_and_throttle(
|
||||
shard: &timeline::handle::Handle<TenantManagerTypes>,
|
||||
op: metrics::SmgrQueryType,
|
||||
received_at: Instant,
|
||||
) -> Result<SmgrOpTimer, QueryError> {
|
||||
// It's important to start the smgr op metric recorder as early as possible
|
||||
// so that the _started counters are incremented before we do
|
||||
// any serious waiting, e.g., for throttle, batching, or actual request handling.
|
||||
let mut timer = shard.query_metrics.start_smgr_op(op, received_at);
|
||||
let now = Instant::now();
|
||||
timer.observe_throttle_start(now);
|
||||
let throttled = tokio::select! {
|
||||
res = shard.pagestream_throttle.throttle(1, now) => res,
|
||||
_ = shard.cancel.cancelled() => return Err(QueryError::Shutdown),
|
||||
};
|
||||
timer.observe_throttle_done(throttled);
|
||||
Ok(timer)
|
||||
}
|
||||
let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;
|
||||
|
||||
let batched_msg = match neon_fe_msg {
|
||||
PagestreamFeMessage::Exists(req) => {
|
||||
let span = tracing::info_span!(parent: parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn);
|
||||
let span = tracing::info_span!(parent: parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn);
|
||||
let shard = timeline_handles
|
||||
.get(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.instrument(span.clone()) // sets `shard_id` field
|
||||
.await?;
|
||||
let timer = record_op_start_and_throttle(
|
||||
&shard,
|
||||
metrics::SmgrQueryType::GetRelExists,
|
||||
received_at,
|
||||
)
|
||||
.await?;
|
||||
let timer = shard
|
||||
.query_metrics
|
||||
.start_smgr_op(metrics::SmgrQueryType::GetRelExists, received_at);
|
||||
BatchedFeMessage::Exists {
|
||||
span,
|
||||
timer,
|
||||
@@ -757,17 +715,14 @@ impl PageServerHandler {
|
||||
}
|
||||
}
|
||||
PagestreamFeMessage::Nblocks(req) => {
|
||||
let span = tracing::info_span!(parent: parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn);
|
||||
let span = tracing::info_span!(parent: parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn);
|
||||
let shard = timeline_handles
|
||||
.get(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.instrument(span.clone()) // sets `shard_id` field
|
||||
.await?;
|
||||
let timer = record_op_start_and_throttle(
|
||||
&shard,
|
||||
metrics::SmgrQueryType::GetRelSize,
|
||||
received_at,
|
||||
)
|
||||
.await?;
|
||||
let timer = shard
|
||||
.query_metrics
|
||||
.start_smgr_op(metrics::SmgrQueryType::GetRelSize, received_at);
|
||||
BatchedFeMessage::Nblocks {
|
||||
span,
|
||||
timer,
|
||||
@@ -776,17 +731,14 @@ impl PageServerHandler {
|
||||
}
|
||||
}
|
||||
PagestreamFeMessage::DbSize(req) => {
|
||||
let span = tracing::info_span!(parent: parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.hdr.request_lsn);
|
||||
let span = tracing::info_span!(parent: parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn);
|
||||
let shard = timeline_handles
|
||||
.get(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.instrument(span.clone()) // sets `shard_id` field
|
||||
.await?;
|
||||
let timer = record_op_start_and_throttle(
|
||||
&shard,
|
||||
metrics::SmgrQueryType::GetDbSize,
|
||||
received_at,
|
||||
)
|
||||
.await?;
|
||||
let timer = shard
|
||||
.query_metrics
|
||||
.start_smgr_op(metrics::SmgrQueryType::GetDbSize, received_at);
|
||||
BatchedFeMessage::DbSize {
|
||||
span,
|
||||
timer,
|
||||
@@ -795,17 +747,14 @@ impl PageServerHandler {
|
||||
}
|
||||
}
|
||||
PagestreamFeMessage::GetSlruSegment(req) => {
|
||||
let span = tracing::info_span!(parent: parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.hdr.request_lsn);
|
||||
let span = tracing::info_span!(parent: parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn);
|
||||
let shard = timeline_handles
|
||||
.get(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.instrument(span.clone()) // sets `shard_id` field
|
||||
.await?;
|
||||
let timer = record_op_start_and_throttle(
|
||||
&shard,
|
||||
metrics::SmgrQueryType::GetSlruSegment,
|
||||
received_at,
|
||||
)
|
||||
.await?;
|
||||
let timer = shard
|
||||
.query_metrics
|
||||
.start_smgr_op(metrics::SmgrQueryType::GetSlruSegment, received_at);
|
||||
BatchedFeMessage::GetSlruSegment {
|
||||
span,
|
||||
timer,
|
||||
@@ -813,23 +762,25 @@ impl PageServerHandler {
|
||||
req,
|
||||
}
|
||||
}
|
||||
PagestreamFeMessage::GetPage(req) => {
|
||||
let span = tracing::info_span!(parent: parent_span, "handle_get_page_at_lsn_request_batched", req_lsn = %req.hdr.request_lsn);
|
||||
PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
rel,
|
||||
blkno,
|
||||
}) => {
|
||||
let span = tracing::info_span!(parent: parent_span, "handle_get_page_at_lsn_request_batched", req_lsn = %request_lsn);
|
||||
|
||||
macro_rules! respond_error {
|
||||
($error:expr) => {{
|
||||
let error = BatchedFeMessage::RespondError {
|
||||
span,
|
||||
error: BatchedPageStreamError {
|
||||
req: req.hdr,
|
||||
err: $error,
|
||||
},
|
||||
error: $error,
|
||||
};
|
||||
Ok(Some(error))
|
||||
}};
|
||||
}
|
||||
|
||||
let key = rel_block_to_key(req.rel, req.blkno);
|
||||
let key = rel_block_to_key(rel, blkno);
|
||||
let shard = match timeline_handles
|
||||
.get(tenant_id, timeline_id, ShardSelector::Page(key))
|
||||
.instrument(span.clone()) // sets `shard_id` field
|
||||
@@ -854,17 +805,17 @@ impl PageServerHandler {
|
||||
}
|
||||
};
|
||||
|
||||
let timer = record_op_start_and_throttle(
|
||||
&shard,
|
||||
metrics::SmgrQueryType::GetPageAtLsn,
|
||||
received_at,
|
||||
)
|
||||
.await?;
|
||||
// It's important to start the timer before waiting for the LSN
|
||||
// so that the _started counters are incremented before we do
|
||||
// any serious waiting, e.g., for LSNs.
|
||||
let timer = shard
|
||||
.query_metrics
|
||||
.start_smgr_op(metrics::SmgrQueryType::GetPageAtLsn, received_at);
|
||||
|
||||
let effective_request_lsn = match Self::wait_or_get_last_lsn(
|
||||
&shard,
|
||||
req.hdr.request_lsn,
|
||||
req.hdr.not_modified_since,
|
||||
request_lsn,
|
||||
not_modified_since,
|
||||
&shard.get_latest_gc_cutoff_lsn(),
|
||||
ctx,
|
||||
)
|
||||
@@ -880,23 +831,7 @@ impl PageServerHandler {
|
||||
span,
|
||||
shard,
|
||||
effective_request_lsn,
|
||||
pages: smallvec::smallvec![BatchedGetPageRequest { req, timer }],
|
||||
}
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
PagestreamFeMessage::Test(req) => {
|
||||
let span = tracing::info_span!(parent: parent_span, "handle_test_request");
|
||||
let shard = timeline_handles
|
||||
.get(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.instrument(span.clone()) // sets `shard_id` field
|
||||
.await?;
|
||||
let timer =
|
||||
record_op_start_and_throttle(&shard, metrics::SmgrQueryType::Test, received_at)
|
||||
.await?;
|
||||
BatchedFeMessage::Test {
|
||||
span,
|
||||
shard,
|
||||
requests: vec![BatchedTestRequest { req, timer }],
|
||||
pages: smallvec::smallvec![(rel, blkno, timer)],
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -961,46 +896,6 @@ impl PageServerHandler {
|
||||
accum_pages.extend(this_pages);
|
||||
Ok(())
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
(
|
||||
Ok(BatchedFeMessage::Test {
|
||||
shard: accum_shard,
|
||||
requests: accum_requests,
|
||||
..
|
||||
}),
|
||||
BatchedFeMessage::Test {
|
||||
shard: this_shard,
|
||||
requests: this_requests,
|
||||
..
|
||||
},
|
||||
) if (|| {
|
||||
assert!(this_requests.len() == 1);
|
||||
if accum_requests.len() >= max_batch_size.get() {
|
||||
trace!(%max_batch_size, "stopping batching because of batch size");
|
||||
assert_eq!(accum_requests.len(), max_batch_size.get());
|
||||
return false;
|
||||
}
|
||||
if (accum_shard.tenant_shard_id, accum_shard.timeline_id)
|
||||
!= (this_shard.tenant_shard_id, this_shard.timeline_id)
|
||||
{
|
||||
trace!("stopping batching because timeline object mismatch");
|
||||
// TODO: we _could_ batch & execute each shard seperately (and in parallel).
|
||||
// But the current logic for keeping responses in order does not support that.
|
||||
return false;
|
||||
}
|
||||
let this_batch_key = this_requests[0].req.batch_key;
|
||||
let accum_batch_key = accum_requests[0].req.batch_key;
|
||||
if this_requests[0].req.batch_key != accum_requests[0].req.batch_key {
|
||||
trace!(%accum_batch_key, %this_batch_key, "stopping batching because batch key changed");
|
||||
return false;
|
||||
}
|
||||
true
|
||||
})() =>
|
||||
{
|
||||
// ok to batch
|
||||
accum_requests.extend(this_requests);
|
||||
Ok(())
|
||||
}
|
||||
// something batched already but this message is unbatchable
|
||||
(_, this_msg) => {
|
||||
// by default, don't continue batching
|
||||
@@ -1015,22 +910,14 @@ impl PageServerHandler {
|
||||
pgb_writer: &mut PostgresBackend<IO>,
|
||||
batch: BatchedFeMessage,
|
||||
cancel: &CancellationToken,
|
||||
protocol_version: PagestreamProtocolVersion,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), QueryError>
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
let started_at = Instant::now();
|
||||
let batch = {
|
||||
let mut batch = batch;
|
||||
batch.observe_execution_start(started_at);
|
||||
batch
|
||||
};
|
||||
|
||||
// invoke handler function
|
||||
let (handler_results, span): (
|
||||
Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>>,
|
||||
Vec<Result<(PagestreamBeMessage, SmgrOpTimer), PageStreamError>>,
|
||||
_,
|
||||
) = match batch {
|
||||
BatchedFeMessage::Exists {
|
||||
@@ -1045,8 +932,7 @@ impl PageServerHandler {
|
||||
.handle_get_rel_exists_request(&shard, &req, ctx)
|
||||
.instrument(span.clone())
|
||||
.await
|
||||
.map(|msg| (msg, timer))
|
||||
.map_err(|err| BatchedPageStreamError { err, req: req.hdr })],
|
||||
.map(|msg| (msg, timer))],
|
||||
span,
|
||||
)
|
||||
}
|
||||
@@ -1062,8 +948,7 @@ impl PageServerHandler {
|
||||
.handle_get_nblocks_request(&shard, &req, ctx)
|
||||
.instrument(span.clone())
|
||||
.await
|
||||
.map(|msg| (msg, timer))
|
||||
.map_err(|err| BatchedPageStreamError { err, req: req.hdr })],
|
||||
.map(|msg| (msg, timer))],
|
||||
span,
|
||||
)
|
||||
}
|
||||
@@ -1105,8 +990,7 @@ impl PageServerHandler {
|
||||
.handle_db_size_request(&shard, &req, ctx)
|
||||
.instrument(span.clone())
|
||||
.await
|
||||
.map(|msg| (msg, timer))
|
||||
.map_err(|err| BatchedPageStreamError { err, req: req.hdr })],
|
||||
.map(|msg| (msg, timer))],
|
||||
span,
|
||||
)
|
||||
}
|
||||
@@ -1122,29 +1006,7 @@ impl PageServerHandler {
|
||||
.handle_get_slru_segment_request(&shard, &req, ctx)
|
||||
.instrument(span.clone())
|
||||
.await
|
||||
.map(|msg| (msg, timer))
|
||||
.map_err(|err| BatchedPageStreamError { err, req: req.hdr })],
|
||||
span,
|
||||
)
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
BatchedFeMessage::Test {
|
||||
span,
|
||||
shard,
|
||||
requests,
|
||||
} => {
|
||||
fail::fail_point!("ps::handle-pagerequest-message::test");
|
||||
(
|
||||
{
|
||||
let npages = requests.len();
|
||||
trace!(npages, "handling getpage request");
|
||||
let res = self
|
||||
.handle_test_request_batch(&shard, requests, ctx)
|
||||
.instrument(span.clone())
|
||||
.await;
|
||||
assert_eq!(res.len(), npages);
|
||||
res
|
||||
},
|
||||
.map(|msg| (msg, timer))],
|
||||
span,
|
||||
)
|
||||
}
|
||||
@@ -1160,7 +1022,7 @@ impl PageServerHandler {
|
||||
// Other handler errors are sent back as an error message and we stay in pagestream protocol.
|
||||
for handler_result in handler_results {
|
||||
let (response_msg, timer) = match handler_result {
|
||||
Err(e) => match &e.err {
|
||||
Err(e) => match &e {
|
||||
PageStreamError::Shutdown => {
|
||||
// If we fail to fulfil a request during shutdown, which may be _because_ of
|
||||
// shutdown, then do not send the error to the client. Instead just drop the
|
||||
@@ -1179,14 +1041,13 @@ impl PageServerHandler {
|
||||
// print the all details to the log with {:#}, but for the client the
|
||||
// error message is enough. Do not log if shutting down, as the anyhow::Error
|
||||
// here includes cancellation which is not an error.
|
||||
let full = utils::error::report_compact_sources(&e.err);
|
||||
let full = utils::error::report_compact_sources(&e);
|
||||
span.in_scope(|| {
|
||||
error!("error reading relation or page version: {full:#}")
|
||||
});
|
||||
(
|
||||
PagestreamBeMessage::Error(PagestreamErrorResponse {
|
||||
req: e.req,
|
||||
message: e.err.to_string(),
|
||||
message: e.to_string(),
|
||||
}),
|
||||
None, // TODO: measure errors
|
||||
)
|
||||
@@ -1199,9 +1060,7 @@ impl PageServerHandler {
|
||||
// marshal & transmit response message
|
||||
//
|
||||
|
||||
pgb_writer.write_message_noflush(&BeMessage::CopyData(
|
||||
&response_msg.serialize(protocol_version),
|
||||
))?;
|
||||
pgb_writer.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?;
|
||||
|
||||
// We purposefully don't count flush time into the timer.
|
||||
//
|
||||
@@ -1215,11 +1074,8 @@ impl PageServerHandler {
|
||||
// The timer's underlying metric is used for a storage-internal latency SLO and
|
||||
// we don't want to include latency in it that we can't control.
|
||||
// And as pointed out above, in this case, we don't control the time that flush will take.
|
||||
let flushing_timer = timer.map(|mut timer| {
|
||||
timer
|
||||
.observe_execution_end_flush_start(Instant::now())
|
||||
.expect("we are the first caller")
|
||||
});
|
||||
let flushing_timer =
|
||||
timer.map(|timer| timer.observe_smgr_op_completion_and_start_flushing());
|
||||
|
||||
// what we want to do
|
||||
let flush_fut = pgb_writer.flush();
|
||||
@@ -1267,7 +1123,7 @@ impl PageServerHandler {
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
protocol_version: PagestreamProtocolVersion,
|
||||
_protocol_version: PagestreamProtocolVersion,
|
||||
ctx: RequestContext,
|
||||
) -> Result<(), QueryError>
|
||||
where
|
||||
@@ -1307,7 +1163,6 @@ impl PageServerHandler {
|
||||
timeline_handles,
|
||||
request_span,
|
||||
pipelining_config,
|
||||
protocol_version,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
@@ -1320,7 +1175,6 @@ impl PageServerHandler {
|
||||
timeline_id,
|
||||
timeline_handles,
|
||||
request_span,
|
||||
protocol_version,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
@@ -1347,7 +1201,6 @@ impl PageServerHandler {
|
||||
timeline_id: TimelineId,
|
||||
mut timeline_handles: TimelineHandles,
|
||||
request_span: Span,
|
||||
protocol_version: PagestreamProtocolVersion,
|
||||
ctx: &RequestContext,
|
||||
) -> (
|
||||
(PostgresBackendReader<IO>, TimelineHandles),
|
||||
@@ -1365,7 +1218,6 @@ impl PageServerHandler {
|
||||
&mut timeline_handles,
|
||||
&cancel,
|
||||
ctx,
|
||||
protocol_version,
|
||||
request_span.clone(),
|
||||
)
|
||||
.await;
|
||||
@@ -1373,7 +1225,7 @@ impl PageServerHandler {
|
||||
Ok(msg) => msg,
|
||||
Err(e) => break e,
|
||||
};
|
||||
let msg = match msg {
|
||||
let mut msg = match msg {
|
||||
Some(msg) => msg,
|
||||
None => {
|
||||
debug!("pagestream subprotocol end observed");
|
||||
@@ -1381,8 +1233,12 @@ impl PageServerHandler {
|
||||
}
|
||||
};
|
||||
|
||||
if let Err(cancelled) = msg.throttle_and_record_start_processing(&self.cancel).await {
|
||||
break cancelled;
|
||||
}
|
||||
|
||||
let err = self
|
||||
.pagesteam_handle_batched_message(pgb_writer, msg, &cancel, protocol_version, ctx)
|
||||
.pagesteam_handle_batched_message(pgb_writer, msg, &cancel, ctx)
|
||||
.await;
|
||||
match err {
|
||||
Ok(()) => {}
|
||||
@@ -1405,7 +1261,6 @@ impl PageServerHandler {
|
||||
mut timeline_handles: TimelineHandles,
|
||||
request_span: Span,
|
||||
pipelining_config: PageServicePipeliningConfigPipelined,
|
||||
protocol_version: PagestreamProtocolVersion,
|
||||
ctx: &RequestContext,
|
||||
) -> (
|
||||
(PostgresBackendReader<IO>, TimelineHandles),
|
||||
@@ -1503,7 +1358,6 @@ impl PageServerHandler {
|
||||
&mut timeline_handles,
|
||||
&cancel_batcher,
|
||||
&ctx,
|
||||
protocol_version,
|
||||
request_span.clone(),
|
||||
)
|
||||
.await;
|
||||
@@ -1540,20 +1394,17 @@ impl PageServerHandler {
|
||||
return Ok(());
|
||||
}
|
||||
};
|
||||
let batch = match batch {
|
||||
let mut batch = match batch {
|
||||
Ok(batch) => batch,
|
||||
Err(e) => {
|
||||
return Err(e);
|
||||
}
|
||||
};
|
||||
self.pagesteam_handle_batched_message(
|
||||
pgb_writer,
|
||||
batch,
|
||||
&cancel,
|
||||
protocol_version,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
batch
|
||||
.throttle_and_record_start_processing(&self.cancel)
|
||||
.await?;
|
||||
self.pagesteam_handle_batched_message(pgb_writer, batch, &cancel, &ctx)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
});
|
||||
@@ -1727,8 +1578,8 @@ impl PageServerHandler {
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(
|
||||
timeline,
|
||||
req.hdr.request_lsn,
|
||||
req.hdr.not_modified_since,
|
||||
req.request_lsn,
|
||||
req.not_modified_since,
|
||||
&latest_gc_cutoff_lsn,
|
||||
ctx,
|
||||
)
|
||||
@@ -1739,7 +1590,6 @@ impl PageServerHandler {
|
||||
.await?;
|
||||
|
||||
Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
|
||||
req: *req,
|
||||
exists,
|
||||
}))
|
||||
}
|
||||
@@ -1754,8 +1604,8 @@ impl PageServerHandler {
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(
|
||||
timeline,
|
||||
req.hdr.request_lsn,
|
||||
req.hdr.not_modified_since,
|
||||
req.request_lsn,
|
||||
req.not_modified_since,
|
||||
&latest_gc_cutoff_lsn,
|
||||
ctx,
|
||||
)
|
||||
@@ -1766,7 +1616,6 @@ impl PageServerHandler {
|
||||
.await?;
|
||||
|
||||
Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
|
||||
req: *req,
|
||||
n_blocks,
|
||||
}))
|
||||
}
|
||||
@@ -1781,8 +1630,8 @@ impl PageServerHandler {
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(
|
||||
timeline,
|
||||
req.hdr.request_lsn,
|
||||
req.hdr.not_modified_since,
|
||||
req.request_lsn,
|
||||
req.not_modified_since,
|
||||
&latest_gc_cutoff_lsn,
|
||||
ctx,
|
||||
)
|
||||
@@ -1794,7 +1643,6 @@ impl PageServerHandler {
|
||||
let db_size = total_blocks as i64 * BLCKSZ as i64;
|
||||
|
||||
Ok(PagestreamBeMessage::DbSize(PagestreamDbSizeResponse {
|
||||
req: *req,
|
||||
db_size,
|
||||
}))
|
||||
}
|
||||
@@ -1804,9 +1652,9 @@ impl PageServerHandler {
|
||||
&mut self,
|
||||
timeline: &Timeline,
|
||||
effective_lsn: Lsn,
|
||||
requests: smallvec::SmallVec<[BatchedGetPageRequest; 1]>,
|
||||
requests: smallvec::SmallVec<[(RelTag, BlockNumber, SmgrOpTimer); 1]>,
|
||||
ctx: &RequestContext,
|
||||
) -> Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>> {
|
||||
) -> Vec<Result<(PagestreamBeMessage, SmgrOpTimer), PageStreamError>> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
timeline
|
||||
@@ -1815,7 +1663,7 @@ impl PageServerHandler {
|
||||
|
||||
let results = timeline
|
||||
.get_rel_page_at_lsn_batched(
|
||||
requests.iter().map(|p| (&p.req.rel, &p.req.blkno)),
|
||||
requests.iter().map(|(reltag, blkno, _)| (reltag, blkno)),
|
||||
effective_lsn,
|
||||
ctx,
|
||||
)
|
||||
@@ -1827,20 +1675,16 @@ impl PageServerHandler {
|
||||
requests
|
||||
.into_iter()
|
||||
.zip(results.into_iter())
|
||||
.map(|(req, res)| {
|
||||
.map(|((_, _, timer), res)| {
|
||||
res.map(|page| {
|
||||
(
|
||||
PagestreamBeMessage::GetPage(models::PagestreamGetPageResponse {
|
||||
req: req.req,
|
||||
page,
|
||||
}),
|
||||
req.timer,
|
||||
timer,
|
||||
)
|
||||
})
|
||||
.map_err(|e| BatchedPageStreamError {
|
||||
err: PageStreamError::from(e),
|
||||
req: req.req.hdr,
|
||||
})
|
||||
.map_err(PageStreamError::from)
|
||||
}),
|
||||
)
|
||||
}
|
||||
@@ -1855,8 +1699,8 @@ impl PageServerHandler {
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(
|
||||
timeline,
|
||||
req.hdr.request_lsn,
|
||||
req.hdr.not_modified_since,
|
||||
req.request_lsn,
|
||||
req.not_modified_since,
|
||||
&latest_gc_cutoff_lsn,
|
||||
ctx,
|
||||
)
|
||||
@@ -1867,55 +1711,10 @@ impl PageServerHandler {
|
||||
let segment = timeline.get_slru_segment(kind, req.segno, lsn, ctx).await?;
|
||||
|
||||
Ok(PagestreamBeMessage::GetSlruSegment(
|
||||
PagestreamGetSlruSegmentResponse { req: *req, segment },
|
||||
PagestreamGetSlruSegmentResponse { segment },
|
||||
))
|
||||
}
|
||||
|
||||
// NB: this impl mimics what we do for batched getpage requests.
|
||||
#[cfg(feature = "testing")]
|
||||
#[instrument(skip_all, fields(shard_id))]
|
||||
async fn handle_test_request_batch(
|
||||
&mut self,
|
||||
timeline: &Timeline,
|
||||
requests: Vec<BatchedTestRequest>,
|
||||
_ctx: &RequestContext,
|
||||
) -> Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>> {
|
||||
// real requests would do something with the timeline
|
||||
let mut results = Vec::with_capacity(requests.len());
|
||||
for _req in requests.iter() {
|
||||
tokio::task::yield_now().await;
|
||||
|
||||
results.push({
|
||||
if timeline.cancel.is_cancelled() {
|
||||
Err(PageReconstructError::Cancelled)
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// TODO: avoid creating the new Vec here
|
||||
Vec::from_iter(
|
||||
requests
|
||||
.into_iter()
|
||||
.zip(results.into_iter())
|
||||
.map(|(req, res)| {
|
||||
res.map(|()| {
|
||||
(
|
||||
PagestreamBeMessage::Test(models::PagestreamTestResponse {
|
||||
req: req.req.clone(),
|
||||
}),
|
||||
req.timer,
|
||||
)
|
||||
})
|
||||
.map_err(|e| BatchedPageStreamError {
|
||||
err: PageStreamError::from(e),
|
||||
req: req.req.hdr,
|
||||
})
|
||||
}),
|
||||
)
|
||||
}
|
||||
|
||||
/// Note on "fullbackup":
|
||||
/// Full basebackups should only be used for debugging purposes.
|
||||
/// Originally, it was introduced to enable breaking storage format changes,
|
||||
@@ -2107,7 +1906,6 @@ struct FullBackupCmd {
|
||||
struct PageStreamCmd {
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
protocol_version: PagestreamProtocolVersion,
|
||||
}
|
||||
|
||||
/// `lease lsn tenant timeline lsn`
|
||||
@@ -2128,7 +1926,7 @@ enum PageServiceCmd {
|
||||
}
|
||||
|
||||
impl PageStreamCmd {
|
||||
fn parse(query: &str, protocol_version: PagestreamProtocolVersion) -> anyhow::Result<Self> {
|
||||
fn parse(query: &str) -> anyhow::Result<Self> {
|
||||
let parameters = query.split_whitespace().collect_vec();
|
||||
if parameters.len() != 2 {
|
||||
bail!(
|
||||
@@ -2143,7 +1941,6 @@ impl PageStreamCmd {
|
||||
Ok(Self {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
protocol_version,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -2281,14 +2078,7 @@ impl PageServiceCmd {
|
||||
bail!("cannot parse query: {query}")
|
||||
};
|
||||
match cmd.to_ascii_lowercase().as_str() {
|
||||
"pagestream_v2" => Ok(Self::PageStream(PageStreamCmd::parse(
|
||||
other,
|
||||
PagestreamProtocolVersion::V2,
|
||||
)?)),
|
||||
"pagestream_v3" => Ok(Self::PageStream(PageStreamCmd::parse(
|
||||
other,
|
||||
PagestreamProtocolVersion::V3,
|
||||
)?)),
|
||||
"pagestream_v2" => Ok(Self::PageStream(PageStreamCmd::parse(other)?)),
|
||||
"basebackup" => Ok(Self::BaseBackup(BaseBackupCmd::parse(other)?)),
|
||||
"fullbackup" => Ok(Self::FullBackup(FullBackupCmd::parse(other)?)),
|
||||
"lease" => {
|
||||
@@ -2370,21 +2160,25 @@ where
|
||||
PageServiceCmd::PageStream(PageStreamCmd {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
protocol_version,
|
||||
}) => {
|
||||
tracing::Span::current()
|
||||
.record("tenant_id", field::display(tenant_id))
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
let command_kind = match protocol_version {
|
||||
PagestreamProtocolVersion::V2 => ComputeCommandKind::PageStreamV2,
|
||||
PagestreamProtocolVersion::V3 => ComputeCommandKind::PageStreamV3,
|
||||
};
|
||||
COMPUTE_COMMANDS_COUNTERS.for_command(command_kind).inc();
|
||||
|
||||
self.handle_pagerequests(pgb, tenant_id, timeline_id, protocol_version, ctx)
|
||||
.await?;
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::PageStreamV2)
|
||||
.inc();
|
||||
|
||||
self.handle_pagerequests(
|
||||
pgb,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
PagestreamProtocolVersion::V2,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
PageServiceCmd::BaseBackup(BaseBackupCmd {
|
||||
tenant_id,
|
||||
@@ -2563,8 +2357,7 @@ mod tests {
|
||||
cmd,
|
||||
PageServiceCmd::PageStream(PageStreamCmd {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
protocol_version: PagestreamProtocolVersion::V2,
|
||||
timeline_id
|
||||
})
|
||||
);
|
||||
let cmd = PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id}")).unwrap();
|
||||
|
||||
@@ -627,7 +627,7 @@ impl Timeline {
|
||||
// cannot overflow, high and low are both smaller than u64::MAX / 2
|
||||
let mid = (high + low) / 2;
|
||||
|
||||
let cmp = match self
|
||||
let cmp = self
|
||||
.is_latest_commit_timestamp_ge_than(
|
||||
search_timestamp,
|
||||
Lsn(mid * 8),
|
||||
@@ -635,16 +635,7 @@ impl Timeline {
|
||||
&mut found_larger,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(res) => res,
|
||||
Err(PageReconstructError::MissingKey(e)) => {
|
||||
warn!("Missing key while find_lsn_for_timestamp. Either we might have already garbage-collected that data or the key is really missing. Last error: {:#}", e);
|
||||
// Return that we didn't find any requests smaller than the LSN, and logging the error.
|
||||
return Ok(LsnForTimestamp::Past(min_lsn));
|
||||
}
|
||||
Err(e) => return Err(e),
|
||||
};
|
||||
.await?;
|
||||
|
||||
if cmp {
|
||||
high = mid;
|
||||
@@ -652,7 +643,6 @@ impl Timeline {
|
||||
low = mid + 1;
|
||||
}
|
||||
}
|
||||
|
||||
// If `found_smaller == true`, `low = t + 1` where `t` is the target LSN,
|
||||
// so the LSN of the last commit record before or at `search_timestamp`.
|
||||
// Remove one from `low` to get `t`.
|
||||
|
||||
@@ -21,7 +21,6 @@ use enumset::EnumSet;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use futures::StreamExt;
|
||||
use pageserver_api::models;
|
||||
use pageserver_api::models::CompactInfoResponse;
|
||||
use pageserver_api::models::LsnLease;
|
||||
use pageserver_api::models::TimelineArchivalState;
|
||||
use pageserver_api::models::TimelineState;
|
||||
@@ -38,17 +37,20 @@ use remote_timeline_client::manifest::{
|
||||
};
|
||||
use remote_timeline_client::UploadQueueNotReadyError;
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::VecDeque;
|
||||
use std::fmt;
|
||||
use std::future::Future;
|
||||
use std::sync::atomic::AtomicBool;
|
||||
use std::sync::Weak;
|
||||
use std::time::SystemTime;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use timeline::compaction::GcCompactionQueue;
|
||||
use timeline::compaction::GcCompactJob;
|
||||
use timeline::compaction::ScheduledCompactionTask;
|
||||
use timeline::import_pgdata;
|
||||
use timeline::offload::offload_timeline;
|
||||
use timeline::offload::OffloadError;
|
||||
use timeline::CompactFlags;
|
||||
use timeline::CompactOptions;
|
||||
use timeline::CompactionError;
|
||||
use timeline::ShutdownMode;
|
||||
use tokio::io::BufReader;
|
||||
use tokio::sync::watch;
|
||||
@@ -344,8 +346,10 @@ pub struct Tenant {
|
||||
/// Overhead of mutex is acceptable because compaction is done with a multi-second period.
|
||||
compaction_circuit_breaker: std::sync::Mutex<CircuitBreaker>,
|
||||
|
||||
/// Scheduled gc-compaction tasks.
|
||||
scheduled_compaction_tasks: std::sync::Mutex<HashMap<TimelineId, Arc<GcCompactionQueue>>>,
|
||||
/// Scheduled compaction tasks. Currently, this can only be populated by triggering
|
||||
/// a manual gc-compaction from the manual compaction API.
|
||||
scheduled_compaction_tasks:
|
||||
std::sync::Mutex<HashMap<TimelineId, VecDeque<ScheduledCompactionTask>>>,
|
||||
|
||||
/// If the tenant is in Activating state, notify this to encourage it
|
||||
/// to proceed to Active as soon as possible, rather than waiting for lazy
|
||||
@@ -365,9 +369,8 @@ pub struct Tenant {
|
||||
|
||||
/// Throttle applied at the top of [`Timeline::get`].
|
||||
/// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance.
|
||||
pub(crate) pagestream_throttle: Arc<throttle::Throttle>,
|
||||
|
||||
pub(crate) pagestream_throttle_metrics: Arc<crate::metrics::tenant_throttling::Pagestream>,
|
||||
pub(crate) pagestream_throttle:
|
||||
Arc<throttle::Throttle<crate::metrics::tenant_throttling::Pagestream>>,
|
||||
|
||||
/// An ongoing timeline detach concurrency limiter.
|
||||
///
|
||||
@@ -1688,7 +1691,6 @@ impl Tenant {
|
||||
TimelineResources {
|
||||
remote_client,
|
||||
pagestream_throttle: self.pagestream_throttle.clone(),
|
||||
pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(),
|
||||
l0_flush_global_state: self.l0_flush_global_state.clone(),
|
||||
},
|
||||
LoadTimelineCause::Attach,
|
||||
@@ -2037,7 +2039,7 @@ impl Tenant {
|
||||
) -> Result<Arc<Timeline>, TimelineArchivalError> {
|
||||
info!("unoffloading timeline");
|
||||
|
||||
// We activate the timeline below manually, so this must be called on an active tenant.
|
||||
// We activate the timeline below manually, so this must be called on an active timeline.
|
||||
// We expect callers of this function to ensure this.
|
||||
match self.current_state() {
|
||||
TenantState::Activating { .. }
|
||||
@@ -2994,35 +2996,113 @@ impl Tenant {
|
||||
if has_pending_l0_compaction_task {
|
||||
Some(true)
|
||||
} else {
|
||||
let queue = {
|
||||
let guard = self.scheduled_compaction_tasks.lock().unwrap();
|
||||
guard.get(timeline_id).cloned()
|
||||
let mut has_pending_scheduled_compaction_task;
|
||||
let next_scheduled_compaction_task = {
|
||||
let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
|
||||
if let Some(tline_pending_tasks) = guard.get_mut(timeline_id) {
|
||||
if !tline_pending_tasks.is_empty() {
|
||||
info!(
|
||||
"{} tasks left in the compaction schedule queue",
|
||||
tline_pending_tasks.len()
|
||||
);
|
||||
}
|
||||
let next_task = tline_pending_tasks.pop_front();
|
||||
has_pending_scheduled_compaction_task = !tline_pending_tasks.is_empty();
|
||||
next_task
|
||||
} else {
|
||||
has_pending_scheduled_compaction_task = false;
|
||||
None
|
||||
}
|
||||
};
|
||||
if let Some(queue) = queue {
|
||||
let has_pending_tasks = queue
|
||||
.iteration(cancel, ctx, &self.gc_block, timeline)
|
||||
.await?;
|
||||
Some(has_pending_tasks)
|
||||
} else {
|
||||
Some(false)
|
||||
if let Some(mut next_scheduled_compaction_task) = next_scheduled_compaction_task
|
||||
{
|
||||
if !next_scheduled_compaction_task
|
||||
.options
|
||||
.flags
|
||||
.contains(CompactFlags::EnhancedGcBottomMostCompaction)
|
||||
{
|
||||
warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", next_scheduled_compaction_task.options);
|
||||
} else if next_scheduled_compaction_task.options.sub_compaction {
|
||||
info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs");
|
||||
let jobs: Vec<GcCompactJob> = timeline
|
||||
.gc_compaction_split_jobs(
|
||||
GcCompactJob::from_compact_options(
|
||||
next_scheduled_compaction_task.options.clone(),
|
||||
),
|
||||
next_scheduled_compaction_task
|
||||
.options
|
||||
.sub_compaction_max_job_size_mb,
|
||||
)
|
||||
.await
|
||||
.map_err(CompactionError::Other)?;
|
||||
if jobs.is_empty() {
|
||||
info!("no jobs to run, skipping scheduled compaction task");
|
||||
} else {
|
||||
has_pending_scheduled_compaction_task = true;
|
||||
let jobs_len = jobs.len();
|
||||
let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
|
||||
let tline_pending_tasks = guard.entry(*timeline_id).or_default();
|
||||
for (idx, job) in jobs.into_iter().enumerate() {
|
||||
// Unfortunately we need to convert the `GcCompactJob` back to `CompactionOptions`
|
||||
// until we do further refactors to allow directly call `compact_with_gc`.
|
||||
let mut flags: EnumSet<CompactFlags> = EnumSet::default();
|
||||
flags |= CompactFlags::EnhancedGcBottomMostCompaction;
|
||||
if job.dry_run {
|
||||
flags |= CompactFlags::DryRun;
|
||||
}
|
||||
let options = CompactOptions {
|
||||
flags,
|
||||
sub_compaction: false,
|
||||
compact_key_range: Some(job.compact_key_range.into()),
|
||||
compact_lsn_range: Some(job.compact_lsn_range.into()),
|
||||
sub_compaction_max_job_size_mb: None,
|
||||
};
|
||||
tline_pending_tasks.push_back(if idx == jobs_len - 1 {
|
||||
ScheduledCompactionTask {
|
||||
options,
|
||||
// The last job in the queue sends the signal and releases the gc guard
|
||||
result_tx: next_scheduled_compaction_task
|
||||
.result_tx
|
||||
.take(),
|
||||
gc_block: next_scheduled_compaction_task
|
||||
.gc_block
|
||||
.take(),
|
||||
}
|
||||
} else {
|
||||
ScheduledCompactionTask {
|
||||
options,
|
||||
result_tx: None,
|
||||
gc_block: None,
|
||||
}
|
||||
});
|
||||
}
|
||||
info!("scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len);
|
||||
}
|
||||
} else {
|
||||
let _ = timeline
|
||||
.compact_with_options(
|
||||
cancel,
|
||||
next_scheduled_compaction_task.options,
|
||||
ctx,
|
||||
)
|
||||
.instrument(info_span!("scheduled_compact_timeline", %timeline_id))
|
||||
.await?;
|
||||
if let Some(tx) = next_scheduled_compaction_task.result_tx.take() {
|
||||
// TODO: we can send compaction statistics in the future
|
||||
tx.send(()).ok();
|
||||
}
|
||||
}
|
||||
}
|
||||
Some(has_pending_scheduled_compaction_task)
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
has_pending_task |= pending_task_left.unwrap_or(false);
|
||||
if pending_task_left == Some(false) && *can_offload {
|
||||
pausable_failpoint!("before-timeline-auto-offload");
|
||||
match offload_timeline(self, timeline)
|
||||
offload_timeline(self, timeline)
|
||||
.instrument(info_span!("offload_timeline", %timeline_id))
|
||||
.await
|
||||
{
|
||||
Err(OffloadError::NotArchived) => {
|
||||
// Ignore this, we likely raced with unarchival
|
||||
Ok(())
|
||||
}
|
||||
other => other,
|
||||
}?;
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3035,32 +3115,34 @@ impl Tenant {
|
||||
}
|
||||
|
||||
/// Cancel scheduled compaction tasks
|
||||
pub(crate) fn cancel_scheduled_compaction(&self, timeline_id: TimelineId) {
|
||||
pub(crate) fn cancel_scheduled_compaction(
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
) -> Vec<ScheduledCompactionTask> {
|
||||
let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
|
||||
if let Some(q) = guard.get_mut(&timeline_id) {
|
||||
q.cancel_scheduled();
|
||||
if let Some(tline_pending_tasks) = guard.get_mut(&timeline_id) {
|
||||
let current_tline_pending_tasks = std::mem::take(tline_pending_tasks);
|
||||
current_tline_pending_tasks.into_iter().collect()
|
||||
} else {
|
||||
Vec::new()
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_scheduled_compaction_tasks(
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
) -> Vec<CompactInfoResponse> {
|
||||
let res = {
|
||||
let guard = self.scheduled_compaction_tasks.lock().unwrap();
|
||||
guard.get(&timeline_id).map(|q| q.remaining_jobs())
|
||||
};
|
||||
let Some((running, remaining)) = res else {
|
||||
return Vec::new();
|
||||
};
|
||||
let mut result = Vec::new();
|
||||
if let Some((id, running)) = running {
|
||||
result.extend(running.into_compact_info_resp(id, true));
|
||||
}
|
||||
for (id, job) in remaining {
|
||||
result.extend(job.into_compact_info_resp(id, false));
|
||||
}
|
||||
result
|
||||
) -> Vec<CompactOptions> {
|
||||
use itertools::Itertools;
|
||||
let guard = self.scheduled_compaction_tasks.lock().unwrap();
|
||||
guard
|
||||
.get(&timeline_id)
|
||||
.map(|tline_pending_tasks| {
|
||||
tline_pending_tasks
|
||||
.iter()
|
||||
.map(|x| x.options.clone())
|
||||
.collect_vec()
|
||||
})
|
||||
.unwrap_or_default()
|
||||
}
|
||||
|
||||
/// Schedule a compaction task for a timeline.
|
||||
@@ -3069,12 +3151,20 @@ impl Tenant {
|
||||
timeline_id: TimelineId,
|
||||
options: CompactOptions,
|
||||
) -> anyhow::Result<tokio::sync::oneshot::Receiver<()>> {
|
||||
let gc_guard = match self.gc_block.start().await {
|
||||
Ok(guard) => guard,
|
||||
Err(e) => {
|
||||
bail!("cannot run gc-compaction because gc is blocked: {}", e);
|
||||
}
|
||||
};
|
||||
let (tx, rx) = tokio::sync::oneshot::channel();
|
||||
let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
|
||||
let q = guard
|
||||
.entry(timeline_id)
|
||||
.or_insert_with(|| Arc::new(GcCompactionQueue::new()));
|
||||
q.schedule_manual_compaction(options, Some(tx));
|
||||
let tline_pending_tasks = guard.entry(timeline_id).or_default();
|
||||
tline_pending_tasks.push_back(ScheduledCompactionTask {
|
||||
options,
|
||||
result_tx: Some(tx),
|
||||
gc_block: Some(gc_guard),
|
||||
});
|
||||
Ok(rx)
|
||||
}
|
||||
|
||||
@@ -3994,9 +4084,6 @@ impl Tenant {
|
||||
Ok(timeline)
|
||||
}
|
||||
|
||||
/// [`Tenant::shutdown`] must be called before dropping the returned [`Tenant`] object
|
||||
/// to ensure proper cleanup of background tasks and metrics.
|
||||
//
|
||||
// Allow too_many_arguments because a constructor's argument list naturally grows with the
|
||||
// number of attributes in the struct: breaking these out into a builder wouldn't be helpful.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
@@ -4105,10 +4192,8 @@ impl Tenant {
|
||||
gate: Gate::default(),
|
||||
pagestream_throttle: Arc::new(throttle::Throttle::new(
|
||||
Tenant::get_pagestream_throttle_config(conf, &attached_conf.tenant_conf),
|
||||
crate::metrics::tenant_throttling::Metrics::new(&tenant_shard_id),
|
||||
)),
|
||||
pagestream_throttle_metrics: Arc::new(
|
||||
crate::metrics::tenant_throttling::Pagestream::new(&tenant_shard_id),
|
||||
),
|
||||
tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
|
||||
ongoing_timeline_detach: std::sync::Mutex::default(),
|
||||
gc_block: Default::default(),
|
||||
@@ -5015,7 +5100,6 @@ impl Tenant {
|
||||
TimelineResources {
|
||||
remote_client: self.build_timeline_remote_client(timeline_id),
|
||||
pagestream_throttle: self.pagestream_throttle.clone(),
|
||||
pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(),
|
||||
l0_flush_global_state: self.l0_flush_global_state.clone(),
|
||||
}
|
||||
}
|
||||
@@ -5690,7 +5774,7 @@ mod tests {
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use hex_literal::hex;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::{Key, AUX_KEY_PREFIX, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX};
|
||||
use pageserver_api::key::{Key, AUX_KEY_PREFIX, NON_INHERITED_RANGE};
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
|
||||
use pageserver_api::value::Value;
|
||||
@@ -7749,18 +7833,7 @@ mod tests {
|
||||
let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap();
|
||||
let base_key_child = Key::from_hex("620000000033333333444444445500000001").unwrap();
|
||||
let base_key_nonexist = Key::from_hex("620000000033333333444444445500000002").unwrap();
|
||||
let base_key_overwrite = Key::from_hex("620000000033333333444444445500000003").unwrap();
|
||||
|
||||
let base_inherited_key = Key::from_hex("610000000033333333444444445500000000").unwrap();
|
||||
let base_inherited_key_child =
|
||||
Key::from_hex("610000000033333333444444445500000001").unwrap();
|
||||
let base_inherited_key_nonexist =
|
||||
Key::from_hex("610000000033333333444444445500000002").unwrap();
|
||||
let base_inherited_key_overwrite =
|
||||
Key::from_hex("610000000033333333444444445500000003").unwrap();
|
||||
|
||||
assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case someone accidentally changed the prefix...
|
||||
assert_eq!(base_inherited_key.field1, RELATION_SIZE_PREFIX);
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline_with_layers(
|
||||
@@ -7769,18 +7842,7 @@ mod tests {
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
Vec::new(), // delta layers
|
||||
vec![(
|
||||
Lsn(0x20),
|
||||
vec![
|
||||
(base_inherited_key, test_img("metadata inherited key 1")),
|
||||
(
|
||||
base_inherited_key_overwrite,
|
||||
test_img("metadata key overwrite 1a"),
|
||||
),
|
||||
(base_key, test_img("metadata key 1")),
|
||||
(base_key_overwrite, test_img("metadata key overwrite 1b")),
|
||||
],
|
||||
)], // image layers
|
||||
vec![(Lsn(0x20), vec![(base_key, test_img("metadata key 1"))])], // image layers
|
||||
Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN
|
||||
)
|
||||
.await?;
|
||||
@@ -7794,18 +7856,7 @@ mod tests {
|
||||
Vec::new(), // delta layers
|
||||
vec![(
|
||||
Lsn(0x30),
|
||||
vec![
|
||||
(
|
||||
base_inherited_key_child,
|
||||
test_img("metadata inherited key 2"),
|
||||
),
|
||||
(
|
||||
base_inherited_key_overwrite,
|
||||
test_img("metadata key overwrite 2a"),
|
||||
),
|
||||
(base_key_child, test_img("metadata key 2")),
|
||||
(base_key_overwrite, test_img("metadata key overwrite 2b")),
|
||||
],
|
||||
vec![(base_key_child, test_img("metadata key 2"))],
|
||||
)], // image layers
|
||||
Lsn(0x30),
|
||||
)
|
||||
@@ -7827,26 +7878,6 @@ mod tests {
|
||||
get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx).await?,
|
||||
None
|
||||
);
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&tline, base_key_overwrite, lsn, &ctx).await?,
|
||||
Some(test_img("metadata key overwrite 1b"))
|
||||
);
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&tline, base_inherited_key, lsn, &ctx).await?,
|
||||
Some(test_img("metadata inherited key 1"))
|
||||
);
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&tline, base_inherited_key_child, lsn, &ctx).await?,
|
||||
None
|
||||
);
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&tline, base_inherited_key_nonexist, lsn, &ctx).await?,
|
||||
None
|
||||
);
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&tline, base_inherited_key_overwrite, lsn, &ctx).await?,
|
||||
Some(test_img("metadata key overwrite 1a"))
|
||||
);
|
||||
|
||||
// test vectored get on child timeline
|
||||
assert_eq!(
|
||||
@@ -7861,82 +7892,6 @@ mod tests {
|
||||
get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx).await?,
|
||||
None
|
||||
);
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&child, base_inherited_key, lsn, &ctx).await?,
|
||||
Some(test_img("metadata inherited key 1"))
|
||||
);
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&child, base_inherited_key_child, lsn, &ctx).await?,
|
||||
Some(test_img("metadata inherited key 2"))
|
||||
);
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&child, base_inherited_key_nonexist, lsn, &ctx).await?,
|
||||
None
|
||||
);
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&child, base_key_overwrite, lsn, &ctx).await?,
|
||||
Some(test_img("metadata key overwrite 2b"))
|
||||
);
|
||||
assert_eq!(
|
||||
get_vectored_impl_wrapper(&child, base_inherited_key_overwrite, lsn, &ctx).await?,
|
||||
Some(test_img("metadata key overwrite 2a"))
|
||||
);
|
||||
|
||||
// test vectored scan on parent timeline
|
||||
let mut reconstruct_state = ValuesReconstructState::new();
|
||||
let res = tline
|
||||
.get_vectored_impl(
|
||||
KeySpace::single(Key::metadata_key_range()),
|
||||
lsn,
|
||||
&mut reconstruct_state,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
assert_eq!(
|
||||
res.into_iter()
|
||||
.map(|(k, v)| (k, v.unwrap()))
|
||||
.collect::<Vec<_>>(),
|
||||
vec![
|
||||
(base_inherited_key, test_img("metadata inherited key 1")),
|
||||
(
|
||||
base_inherited_key_overwrite,
|
||||
test_img("metadata key overwrite 1a")
|
||||
),
|
||||
(base_key, test_img("metadata key 1")),
|
||||
(base_key_overwrite, test_img("metadata key overwrite 1b")),
|
||||
]
|
||||
);
|
||||
|
||||
// test vectored scan on child timeline
|
||||
let mut reconstruct_state = ValuesReconstructState::new();
|
||||
let res = child
|
||||
.get_vectored_impl(
|
||||
KeySpace::single(Key::metadata_key_range()),
|
||||
lsn,
|
||||
&mut reconstruct_state,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
assert_eq!(
|
||||
res.into_iter()
|
||||
.map(|(k, v)| (k, v.unwrap()))
|
||||
.collect::<Vec<_>>(),
|
||||
vec![
|
||||
(base_inherited_key, test_img("metadata inherited key 1")),
|
||||
(
|
||||
base_inherited_key_child,
|
||||
test_img("metadata inherited key 2")
|
||||
),
|
||||
(
|
||||
base_inherited_key_overwrite,
|
||||
test_img("metadata key overwrite 2a")
|
||||
),
|
||||
(base_key_child, test_img("metadata key 2")),
|
||||
(base_key_overwrite, test_img("metadata key overwrite 2b")),
|
||||
]
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf;
|
||||
use pageserver_api::models::CompactionAlgorithmSettings;
|
||||
use pageserver_api::models::EvictionPolicy;
|
||||
use pageserver_api::models::{self, TenantConfigPatch};
|
||||
use pageserver_api::models::{self, TenantConfigPatch, ThrottleConfig};
|
||||
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
|
||||
use serde::de::IntoDeserializer;
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -597,7 +597,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
|
||||
.map(humantime),
|
||||
heatmap_period: value.heatmap_period.map(humantime),
|
||||
lazy_slru_download: value.lazy_slru_download,
|
||||
timeline_get_throttle: value.timeline_get_throttle,
|
||||
timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
|
||||
image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
|
||||
lsn_lease_length: value.lsn_lease_length.map(humantime),
|
||||
lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime),
|
||||
|
||||
@@ -84,17 +84,17 @@ impl Value {
|
||||
|
||||
fn to_u64(self) -> u64 {
|
||||
let b = &self.0;
|
||||
((b[0] as u64) << 32)
|
||||
| ((b[1] as u64) << 24)
|
||||
| ((b[2] as u64) << 16)
|
||||
| ((b[3] as u64) << 8)
|
||||
(b[0] as u64) << 32
|
||||
| (b[1] as u64) << 24
|
||||
| (b[2] as u64) << 16
|
||||
| (b[3] as u64) << 8
|
||||
| b[4] as u64
|
||||
}
|
||||
|
||||
fn to_blknum(self) -> u32 {
|
||||
let b = &self.0;
|
||||
assert!(b[0] == 0x80);
|
||||
((b[1] as u32) << 24) | ((b[2] as u32) << 16) | ((b[3] as u32) << 8) | b[4] as u32
|
||||
(b[1] as u32) << 24 | (b[2] as u32) << 16 | (b[3] as u32) << 8 | b[4] as u32
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -320,6 +320,7 @@ impl TimelineMetadata {
|
||||
|
||||
// Checksums make it awkward to build a valid instance by hand. This helper
|
||||
// provides a TimelineMetadata with a valid checksum in its header.
|
||||
#[cfg(test)]
|
||||
pub fn example() -> Self {
|
||||
let instance = Self::new(
|
||||
"0/16960E8".parse::<Lsn>().unwrap(),
|
||||
|
||||
@@ -63,18 +63,22 @@
|
||||
//! The contract between client and its user is that the user is responsible of
|
||||
//! scheduling operations in an order that keeps the remote consistent as
|
||||
//! described above.
|
||||
//!
|
||||
//! From the user's perspective, the operations are executed sequentially.
|
||||
//! Internally, the client knows which operations can be performed in parallel,
|
||||
//! and which operations act like a "barrier" that require preceding operations
|
||||
//! to finish. The calling code just needs to call the schedule-functions in the
|
||||
//! correct order, and the client will parallelize the operations in a way that
|
||||
//! is safe. For more details, see `UploadOp::can_bypass`.
|
||||
//! is safe.
|
||||
//!
|
||||
//! The caller should be careful with deletion, though. They should not delete
|
||||
//! local files that have been scheduled for upload but not yet finished uploading.
|
||||
//! Otherwise the upload will fail. To wait for an upload to finish, use
|
||||
//! the 'wait_completion' function (more on that later.)
|
||||
//!
|
||||
//! All of this relies on the following invariants:
|
||||
//!
|
||||
//! - We rely on read-after write consistency in the remote storage.
|
||||
//! - Layer files are immutable.
|
||||
//! - Layer files are immutable
|
||||
//!
|
||||
//! NB: Pageserver assumes that it has exclusive write access to the tenant in remote
|
||||
//! storage. Different tenants can be attached to different pageservers, but if the
|
||||
@@ -300,15 +304,6 @@ pub enum WaitCompletionError {
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
#[error("Upload queue either in unexpected state or hasn't downloaded manifest yet")]
|
||||
pub struct UploadQueueNotReadyError;
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum ShutdownIfArchivedError {
|
||||
#[error(transparent)]
|
||||
NotInitialized(NotInitialized),
|
||||
#[error("timeline is not archived")]
|
||||
NotArchived,
|
||||
}
|
||||
|
||||
/// Behavioral modes that enable seamless live migration.
|
||||
///
|
||||
/// See docs/rfcs/028-pageserver-migration.md to understand how these fit in.
|
||||
@@ -425,16 +420,8 @@ impl RemoteTimelineClient {
|
||||
/// an index file upload, i.e., it's not empty.
|
||||
/// The given `index_part` must be the one on the remote.
|
||||
pub fn init_upload_queue(&self, index_part: &IndexPart) -> anyhow::Result<()> {
|
||||
// Set the maximum number of inprogress tasks to the remote storage concurrency. There's
|
||||
// certainly no point in starting more upload tasks than this.
|
||||
let inprogress_limit = self
|
||||
.conf
|
||||
.remote_storage_config
|
||||
.as_ref()
|
||||
.and_then(|r| r.concurrency_limit())
|
||||
.unwrap_or(0);
|
||||
let mut upload_queue = self.upload_queue.lock().unwrap();
|
||||
upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?;
|
||||
upload_queue.initialize_with_current_remote_index_part(index_part)?;
|
||||
self.update_remote_physical_size_gauge(Some(index_part));
|
||||
info!(
|
||||
"initialized upload queue from remote index with {} layer files",
|
||||
@@ -449,16 +436,8 @@ impl RemoteTimelineClient {
|
||||
&self,
|
||||
local_metadata: &TimelineMetadata,
|
||||
) -> anyhow::Result<()> {
|
||||
// Set the maximum number of inprogress tasks to the remote storage concurrency. There's
|
||||
// certainly no point in starting more upload tasks than this.
|
||||
let inprogress_limit = self
|
||||
.conf
|
||||
.remote_storage_config
|
||||
.as_ref()
|
||||
.and_then(|r| r.concurrency_limit())
|
||||
.unwrap_or(0);
|
||||
let mut upload_queue = self.upload_queue.lock().unwrap();
|
||||
upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?;
|
||||
upload_queue.initialize_empty_remote(local_metadata)?;
|
||||
self.update_remote_physical_size_gauge(None);
|
||||
info!("initialized upload queue as empty");
|
||||
Ok(())
|
||||
@@ -474,15 +453,9 @@ impl RemoteTimelineClient {
|
||||
let deleted_at = index_part.deleted_at.ok_or(anyhow::anyhow!(
|
||||
"bug: it is responsibility of the caller to provide index part from MaybeDeletedIndexPart::Deleted"
|
||||
))?;
|
||||
let inprogress_limit = self
|
||||
.conf
|
||||
.remote_storage_config
|
||||
.as_ref()
|
||||
.and_then(|r| r.concurrency_limit())
|
||||
.unwrap_or(0);
|
||||
|
||||
let mut upload_queue = self.upload_queue.lock().unwrap();
|
||||
upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?;
|
||||
upload_queue.initialize_with_current_remote_index_part(index_part)?;
|
||||
self.update_remote_physical_size_gauge(Some(index_part));
|
||||
self.stop_impl(&mut upload_queue);
|
||||
|
||||
@@ -843,55 +816,6 @@ impl RemoteTimelineClient {
|
||||
Ok(need_wait)
|
||||
}
|
||||
|
||||
/// Shuts the timeline client down, but only if the timeline is archived.
|
||||
///
|
||||
/// This function and [`Self::schedule_index_upload_for_timeline_archival_state`] use the
|
||||
/// same lock to prevent races between unarchival and offloading: unarchival requires the
|
||||
/// upload queue to be initialized, and leaves behind an upload queue where either dirty
|
||||
/// or clean has archived_at of `None`. offloading leaves behind an uninitialized upload
|
||||
/// queue.
|
||||
pub(crate) async fn shutdown_if_archived(
|
||||
self: &Arc<Self>,
|
||||
) -> Result<(), ShutdownIfArchivedError> {
|
||||
{
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard
|
||||
.initialized_mut()
|
||||
.map_err(ShutdownIfArchivedError::NotInitialized)?;
|
||||
|
||||
match (
|
||||
upload_queue.dirty.archived_at.is_none(),
|
||||
upload_queue.clean.0.archived_at.is_none(),
|
||||
) {
|
||||
// The expected case: the timeline is archived and we don't want to unarchive
|
||||
(false, false) => {}
|
||||
(true, false) => {
|
||||
tracing::info!("can't shut down timeline: timeline slated for unarchival");
|
||||
return Err(ShutdownIfArchivedError::NotArchived);
|
||||
}
|
||||
(dirty_archived, true) => {
|
||||
tracing::info!(%dirty_archived, "can't shut down timeline: timeline not archived in remote storage");
|
||||
return Err(ShutdownIfArchivedError::NotArchived);
|
||||
}
|
||||
}
|
||||
|
||||
// Set the shutting_down flag while the guard from the archival check is held.
|
||||
// This prevents a race with unarchival, as initialized_mut will not return
|
||||
// an upload queue from this point.
|
||||
// Also launch the queued tasks like shutdown() does.
|
||||
if !upload_queue.shutting_down {
|
||||
upload_queue.shutting_down = true;
|
||||
upload_queue.queued_operations.push_back(UploadOp::Shutdown);
|
||||
// this operation is not counted similar to Barrier
|
||||
self.launch_queued_tasks(upload_queue);
|
||||
}
|
||||
}
|
||||
|
||||
self.shutdown().await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Launch an index-file upload operation in the background, setting `import_pgdata` field.
|
||||
pub(crate) fn schedule_index_upload_for_import_pgdata_state_update(
|
||||
self: &Arc<Self>,
|
||||
@@ -1873,17 +1797,57 @@ impl RemoteTimelineClient {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Pick next tasks from the queue, and start as many of them as possible without violating
|
||||
/// the ordering constraints.
|
||||
///
|
||||
/// TODO: consider limiting the number of in-progress tasks, beyond what remote_storage does.
|
||||
/// This can launch an unbounded number of queued tasks. `UploadQueue::next_ready()` also has
|
||||
/// worst-case quadratic cost in the number of tasks, and may struggle beyond 10,000 tasks.
|
||||
/// The caller needs to already hold the `upload_queue` lock.
|
||||
fn launch_queued_tasks(self: &Arc<Self>, upload_queue: &mut UploadQueueInitialized) {
|
||||
while let Some((mut next_op, coalesced_ops)) = upload_queue.next_ready() {
|
||||
debug!("starting op: {next_op}");
|
||||
while let Some(next_op) = upload_queue.queued_operations.front() {
|
||||
// Can we run this task now?
|
||||
let can_run_now = match next_op {
|
||||
UploadOp::UploadLayer(..) => {
|
||||
// Can always be scheduled.
|
||||
true
|
||||
}
|
||||
UploadOp::UploadMetadata { .. } => {
|
||||
// These can only be performed after all the preceding operations
|
||||
// have finished.
|
||||
upload_queue.inprogress_tasks.is_empty()
|
||||
}
|
||||
UploadOp::Delete(..) => {
|
||||
// Wait for preceding uploads to finish. Concurrent deletions are OK, though.
|
||||
upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len()
|
||||
}
|
||||
|
||||
// Prepare upload.
|
||||
UploadOp::Barrier(_) | UploadOp::Shutdown => {
|
||||
upload_queue.inprogress_tasks.is_empty()
|
||||
}
|
||||
};
|
||||
|
||||
// If we cannot launch this task, don't look any further.
|
||||
//
|
||||
// In some cases, we could let some non-frontmost tasks to "jump the queue" and launch
|
||||
// them now, but we don't try to do that currently. For example, if the frontmost task
|
||||
// is an index-file upload that cannot proceed until preceding uploads have finished, we
|
||||
// could still start layer uploads that were scheduled later.
|
||||
if !can_run_now {
|
||||
break;
|
||||
}
|
||||
|
||||
if let UploadOp::Shutdown = next_op {
|
||||
// leave the op in the queue but do not start more tasks; it will be dropped when
|
||||
// the stop is called.
|
||||
upload_queue.shutdown_ready.close();
|
||||
break;
|
||||
}
|
||||
|
||||
// We can launch this task. Remove it from the queue first.
|
||||
let mut next_op = upload_queue.queued_operations.pop_front().unwrap();
|
||||
|
||||
debug!("starting op: {}", next_op);
|
||||
|
||||
// Update the counters and prepare
|
||||
match &mut next_op {
|
||||
UploadOp::UploadLayer(layer, meta, mode) => {
|
||||
if upload_queue
|
||||
@@ -1894,14 +1858,18 @@ impl RemoteTimelineClient {
|
||||
} else {
|
||||
*mode = Some(OpType::MayReorder)
|
||||
}
|
||||
upload_queue.num_inprogress_layer_uploads += 1;
|
||||
}
|
||||
UploadOp::UploadMetadata { .. } => {
|
||||
upload_queue.num_inprogress_metadata_uploads += 1;
|
||||
}
|
||||
UploadOp::UploadMetadata { .. } => {}
|
||||
UploadOp::Delete(Delete { layers }) => {
|
||||
for (name, meta) in layers {
|
||||
upload_queue
|
||||
.recently_deleted
|
||||
.insert((name.clone(), meta.generation));
|
||||
}
|
||||
upload_queue.num_inprogress_deletions += 1;
|
||||
}
|
||||
UploadOp::Barrier(sender) => {
|
||||
sender.send_replace(());
|
||||
@@ -1918,7 +1886,6 @@ impl RemoteTimelineClient {
|
||||
let task = Arc::new(UploadTask {
|
||||
task_id: upload_task_id,
|
||||
op: next_op,
|
||||
coalesced_ops,
|
||||
retries: AtomicU32::new(0),
|
||||
});
|
||||
upload_queue
|
||||
@@ -2002,8 +1969,6 @@ impl RemoteTimelineClient {
|
||||
|
||||
let upload_result: anyhow::Result<()> = match &task.op {
|
||||
UploadOp::UploadLayer(ref layer, ref layer_metadata, mode) => {
|
||||
// TODO: check if this mechanism can be removed now that can_bypass() performs
|
||||
// conflict checks during scheduling.
|
||||
if let Some(OpType::FlushDeletion) = mode {
|
||||
if self.config.read().unwrap().block_deletions {
|
||||
// Of course, this is not efficient... but usually the queue should be empty.
|
||||
@@ -2226,8 +2191,13 @@ impl RemoteTimelineClient {
|
||||
upload_queue.inprogress_tasks.remove(&task.task_id);
|
||||
|
||||
let lsn_update = match task.op {
|
||||
UploadOp::UploadLayer(_, _, _) => None,
|
||||
UploadOp::UploadLayer(_, _, _) => {
|
||||
upload_queue.num_inprogress_layer_uploads -= 1;
|
||||
None
|
||||
}
|
||||
UploadOp::UploadMetadata { ref uploaded } => {
|
||||
upload_queue.num_inprogress_metadata_uploads -= 1;
|
||||
|
||||
// the task id is reused as a monotonicity check for storing the "clean"
|
||||
// IndexPart.
|
||||
let last_updater = upload_queue.clean.1;
|
||||
@@ -2261,7 +2231,10 @@ impl RemoteTimelineClient {
|
||||
None
|
||||
}
|
||||
}
|
||||
UploadOp::Delete(_) => None,
|
||||
UploadOp::Delete(_) => {
|
||||
upload_queue.num_inprogress_deletions -= 1;
|
||||
None
|
||||
}
|
||||
UploadOp::Barrier(..) | UploadOp::Shutdown => unreachable!(),
|
||||
};
|
||||
|
||||
@@ -2286,9 +2259,6 @@ impl RemoteTimelineClient {
|
||||
}
|
||||
|
||||
self.metric_end(&task.op);
|
||||
for coalesced_op in &task.coalesced_ops {
|
||||
self.metric_end(coalesced_op);
|
||||
}
|
||||
}
|
||||
|
||||
fn metric_impl(
|
||||
@@ -2381,7 +2351,6 @@ impl RemoteTimelineClient {
|
||||
// but for this use case it doesnt really makes sense to bring unsafe code only for this usage point.
|
||||
// Deletion is not really perf sensitive so there shouldnt be any problems with cloning a fraction of it.
|
||||
let upload_queue_for_deletion = UploadQueueInitialized {
|
||||
inprogress_limit: initialized.inprogress_limit,
|
||||
task_counter: 0,
|
||||
dirty: initialized.dirty.clone(),
|
||||
clean: initialized.clean.clone(),
|
||||
@@ -2389,6 +2358,9 @@ impl RemoteTimelineClient {
|
||||
visible_remote_consistent_lsn: initialized
|
||||
.visible_remote_consistent_lsn
|
||||
.clone(),
|
||||
num_inprogress_layer_uploads: 0,
|
||||
num_inprogress_metadata_uploads: 0,
|
||||
num_inprogress_deletions: 0,
|
||||
inprogress_tasks: HashMap::default(),
|
||||
queued_operations: VecDeque::default(),
|
||||
#[cfg(feature = "testing")]
|
||||
@@ -2415,6 +2387,14 @@ impl RemoteTimelineClient {
|
||||
}
|
||||
};
|
||||
|
||||
// consistency check
|
||||
assert_eq!(
|
||||
qi.num_inprogress_layer_uploads
|
||||
+ qi.num_inprogress_metadata_uploads
|
||||
+ qi.num_inprogress_deletions,
|
||||
qi.inprogress_tasks.len()
|
||||
);
|
||||
|
||||
// We don't need to do anything here for in-progress tasks. They will finish
|
||||
// on their own, decrement the unfinished-task counter themselves, and observe
|
||||
// that the queue is Stopped.
|
||||
@@ -2861,8 +2841,8 @@ mod tests {
|
||||
let mut guard = client.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut().unwrap();
|
||||
assert!(upload_queue.queued_operations.is_empty());
|
||||
assert_eq!(upload_queue.inprogress_tasks.len(), 2);
|
||||
assert_eq!(upload_queue.num_inprogress_layer_uploads(), 2);
|
||||
assert!(upload_queue.inprogress_tasks.len() == 2);
|
||||
assert!(upload_queue.num_inprogress_layer_uploads == 2);
|
||||
|
||||
// also check that `latest_file_changes` was updated
|
||||
assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 2);
|
||||
@@ -2932,8 +2912,8 @@ mod tests {
|
||||
// Deletion schedules upload of the index file, and the file deletion itself
|
||||
assert_eq!(upload_queue.queued_operations.len(), 2);
|
||||
assert_eq!(upload_queue.inprogress_tasks.len(), 1);
|
||||
assert_eq!(upload_queue.num_inprogress_layer_uploads(), 1);
|
||||
assert_eq!(upload_queue.num_inprogress_deletions(), 0);
|
||||
assert_eq!(upload_queue.num_inprogress_layer_uploads, 1);
|
||||
assert_eq!(upload_queue.num_inprogress_deletions, 0);
|
||||
assert_eq!(
|
||||
upload_queue.latest_files_changes_since_metadata_upload_scheduled,
|
||||
0
|
||||
|
||||
@@ -104,7 +104,7 @@ impl IndexPart {
|
||||
|
||||
pub const FILE_NAME: &'static str = "index_part.json";
|
||||
|
||||
pub fn empty(metadata: TimelineMetadata) -> Self {
|
||||
pub(crate) fn empty(metadata: TimelineMetadata) -> Self {
|
||||
IndexPart {
|
||||
version: Self::LATEST_VERSION,
|
||||
layer_metadata: Default::default(),
|
||||
|
||||
@@ -12,7 +12,7 @@ pub mod merge_iterator;
|
||||
|
||||
use crate::context::{AccessStatsBehavior, RequestContext};
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::key::{Key, NON_INHERITED_SPARSE_RANGE};
|
||||
use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::value::Value;
|
||||
@@ -209,7 +209,7 @@ impl ValuesReconstructState {
|
||||
.keys
|
||||
.entry(*key)
|
||||
.or_insert(Ok(VectoredValueReconstructState::default()));
|
||||
let is_sparse_key = key.is_sparse();
|
||||
let is_sparse_key = NON_INHERITED_SPARSE_RANGE.contains(key);
|
||||
if let Ok(state) = state {
|
||||
let key_done = match state.situation {
|
||||
ValueReconstructSituation::Complete => {
|
||||
|
||||
@@ -112,8 +112,8 @@ const MAX_SUPPORTED_BLOB_LEN_BITS: usize = {
|
||||
///
|
||||
/// Layout:
|
||||
/// - 1 bit: `will_init`
|
||||
/// - [`MAX_SUPPORTED_BLOB_LEN_BITS`][]: `len`
|
||||
/// - [`MAX_SUPPORTED_POS_BITS`](IndexEntry::MAX_SUPPORTED_POS_BITS): `pos`
|
||||
/// - [`MAX_SUPPORTED_BLOB_LEN_BITS`]: `len`
|
||||
/// - [`MAX_SUPPORTED_POS_BITS`]: `pos`
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub struct IndexEntry(u64);
|
||||
|
||||
|
||||
@@ -1812,7 +1812,7 @@ enum LayerKind {
|
||||
|
||||
/// Guard for forcing a layer be resident while it exists.
|
||||
#[derive(Clone)]
|
||||
pub struct ResidentLayer {
|
||||
pub(crate) struct ResidentLayer {
|
||||
owner: Layer,
|
||||
downloaded: Arc<DownloadedLayer>,
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::{
|
||||
atomic::{AtomicU64, Ordering},
|
||||
Arc,
|
||||
},
|
||||
time::Instant,
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
|
||||
use arc_swap::ArcSwap;
|
||||
@@ -16,8 +16,9 @@ use utils::leaky_bucket::{LeakyBucketConfig, RateLimiter};
|
||||
/// To share a throttle among multiple entities, wrap it in an [`Arc`].
|
||||
///
|
||||
/// The intial use case for this is tenant-wide throttling of getpage@lsn requests.
|
||||
pub struct Throttle {
|
||||
pub struct Throttle<M: Metric> {
|
||||
inner: ArcSwap<Inner>,
|
||||
metric: M,
|
||||
/// will be turned into [`Stats::count_accounted_start`]
|
||||
count_accounted_start: AtomicU64,
|
||||
/// will be turned into [`Stats::count_accounted_finish`]
|
||||
@@ -35,6 +36,15 @@ pub struct Inner {
|
||||
|
||||
pub type Config = pageserver_api::models::ThrottleConfig;
|
||||
|
||||
pub struct Observation {
|
||||
pub wait_time: Duration,
|
||||
}
|
||||
pub trait Metric {
|
||||
fn accounting_start(&self);
|
||||
fn accounting_finish(&self);
|
||||
fn observe_throttling(&self, observation: &Observation);
|
||||
}
|
||||
|
||||
/// See [`Throttle::reset_stats`].
|
||||
pub struct Stats {
|
||||
/// Number of requests that started [`Throttle::throttle`] calls.
|
||||
@@ -49,14 +59,18 @@ pub struct Stats {
|
||||
}
|
||||
|
||||
pub enum ThrottleResult {
|
||||
NotThrottled { end: Instant },
|
||||
Throttled { end: Instant },
|
||||
NotThrottled { start: Instant },
|
||||
Throttled { start: Instant, end: Instant },
|
||||
}
|
||||
|
||||
impl Throttle {
|
||||
pub fn new(config: Config) -> Self {
|
||||
impl<M> Throttle<M>
|
||||
where
|
||||
M: Metric,
|
||||
{
|
||||
pub fn new(config: Config, metric: M) -> Self {
|
||||
Self {
|
||||
inner: ArcSwap::new(Arc::new(Self::new_inner(config))),
|
||||
metric,
|
||||
count_accounted_start: AtomicU64::new(0),
|
||||
count_accounted_finish: AtomicU64::new(0),
|
||||
count_throttled: AtomicU64::new(0),
|
||||
@@ -113,27 +127,32 @@ impl Throttle {
|
||||
self.inner.load().rate_limiter.steady_rps()
|
||||
}
|
||||
|
||||
/// `start` must be [`Instant::now`] or earlier.
|
||||
pub async fn throttle(&self, key_count: usize, start: Instant) -> ThrottleResult {
|
||||
pub async fn throttle(&self, key_count: usize) -> ThrottleResult {
|
||||
let inner = self.inner.load_full(); // clones the `Inner` Arc
|
||||
|
||||
let start = std::time::Instant::now();
|
||||
|
||||
if !inner.enabled {
|
||||
return ThrottleResult::NotThrottled { end: start };
|
||||
return ThrottleResult::NotThrottled { start };
|
||||
}
|
||||
|
||||
self.metric.accounting_start();
|
||||
self.count_accounted_start.fetch_add(1, Ordering::Relaxed);
|
||||
let did_throttle = inner.rate_limiter.acquire(key_count).await;
|
||||
self.count_accounted_finish.fetch_add(1, Ordering::Relaxed);
|
||||
self.metric.accounting_finish();
|
||||
|
||||
if did_throttle {
|
||||
self.count_throttled.fetch_add(1, Ordering::Relaxed);
|
||||
let end = Instant::now();
|
||||
let wait_time = end - start;
|
||||
let now = Instant::now();
|
||||
let wait_time = now - start;
|
||||
self.sum_throttled_usecs
|
||||
.fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed);
|
||||
ThrottleResult::Throttled { end }
|
||||
let observation = Observation { wait_time };
|
||||
self.metric.observe_throttling(&observation);
|
||||
ThrottleResult::Throttled { start, end: now }
|
||||
} else {
|
||||
ThrottleResult::NotThrottled { end: start }
|
||||
ThrottleResult::NotThrottled { start }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -27,7 +27,7 @@ use pageserver_api::{
|
||||
config::tenant_conf_defaults::DEFAULT_COMPACTION_THRESHOLD,
|
||||
key::{
|
||||
KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE,
|
||||
SPARSE_RANGE,
|
||||
NON_INHERITED_SPARSE_RANGE,
|
||||
},
|
||||
keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning},
|
||||
models::{
|
||||
@@ -208,8 +208,8 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
|
||||
/// The outward-facing resources required to build a Timeline
|
||||
pub struct TimelineResources {
|
||||
pub remote_client: RemoteTimelineClient,
|
||||
pub pagestream_throttle: Arc<crate::tenant::throttle::Throttle>,
|
||||
pub pagestream_throttle_metrics: Arc<crate::metrics::tenant_throttling::Pagestream>,
|
||||
pub pagestream_throttle:
|
||||
Arc<crate::tenant::throttle::Throttle<crate::metrics::tenant_throttling::Pagestream>>,
|
||||
pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
|
||||
}
|
||||
|
||||
@@ -412,7 +412,8 @@ pub struct Timeline {
|
||||
gc_lock: tokio::sync::Mutex<()>,
|
||||
|
||||
/// Cloned from [`super::Tenant::pagestream_throttle`] on construction.
|
||||
pub(crate) pagestream_throttle: Arc<crate::tenant::throttle::Throttle>,
|
||||
pub(crate) pagestream_throttle:
|
||||
Arc<crate::tenant::throttle::Throttle<crate::metrics::tenant_throttling::Pagestream>>,
|
||||
|
||||
/// Size estimator for aux file v2
|
||||
pub(crate) aux_file_size_estimator: AuxFileSizeEstimator,
|
||||
@@ -2309,7 +2310,6 @@ impl Timeline {
|
||||
query_metrics: crate::metrics::SmgrQueryTimePerTimeline::new(
|
||||
&tenant_shard_id,
|
||||
&timeline_id,
|
||||
resources.pagestream_throttle_metrics,
|
||||
),
|
||||
|
||||
directory_metrics: array::from_fn(|_| AtomicU64::new(0)),
|
||||
@@ -3221,7 +3221,7 @@ impl Timeline {
|
||||
// We don't return a blanket [`GetVectoredError::MissingKey`] to avoid
|
||||
// stalling compaction.
|
||||
keyspace.remove_overlapping_with(&KeySpace {
|
||||
ranges: vec![NON_INHERITED_RANGE, Key::sparse_non_inherited_keyspace()],
|
||||
ranges: vec![NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE],
|
||||
});
|
||||
|
||||
// Keyspace is fully retrieved
|
||||
@@ -3242,11 +3242,7 @@ impl Timeline {
|
||||
// keys from `keyspace`, we expect there to be no overlap between it and the image covered key
|
||||
// space. If that's not the case, we had at least one key encounter a gap in the image layer
|
||||
// and stop the search as a result of that.
|
||||
let mut removed = keyspace.remove_overlapping_with(&image_covered_keyspace);
|
||||
// Do not fire missing key error for sparse keys.
|
||||
removed.remove_overlapping_with(&KeySpace {
|
||||
ranges: vec![SPARSE_RANGE],
|
||||
});
|
||||
let removed = keyspace.remove_overlapping_with(&image_covered_keyspace);
|
||||
if !removed.is_empty() {
|
||||
break Some(removed);
|
||||
}
|
||||
@@ -3261,21 +3257,6 @@ impl Timeline {
|
||||
timeline = &*timeline_owned;
|
||||
};
|
||||
|
||||
// Remove sparse keys from the keyspace so that it doesn't fire errors.
|
||||
let missing_keyspace = if let Some(missing_keyspace) = missing_keyspace {
|
||||
let mut missing_keyspace = missing_keyspace;
|
||||
missing_keyspace.remove_overlapping_with(&KeySpace {
|
||||
ranges: vec![SPARSE_RANGE],
|
||||
});
|
||||
if missing_keyspace.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(missing_keyspace)
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
if let Some(missing_keyspace) = missing_keyspace {
|
||||
return Err(GetVectoredError::MissingKey(MissingKeyError {
|
||||
key: missing_keyspace.start().unwrap(), /* better if we can store the full keyspace */
|
||||
@@ -3781,34 +3762,35 @@ impl Timeline {
|
||||
return Err(FlushLayerError::Cancelled);
|
||||
}
|
||||
|
||||
// Ensure that we have a single call to `create_image_layers` with a combined dense keyspace.
|
||||
// So that the key ranges don't overlap.
|
||||
let mut partitions = KeyPartitioning::default();
|
||||
partitions.parts.extend(rel_partition.parts);
|
||||
if !metadata_partition.parts.is_empty() {
|
||||
assert_eq!(
|
||||
metadata_partition.parts.len(),
|
||||
1,
|
||||
"currently sparse keyspace should only contain a single metadata keyspace"
|
||||
);
|
||||
// Safety: create_image_layers treat sparse keyspaces differently that it does not scan
|
||||
// every single key within the keyspace, and therefore, it's safe to force converting it
|
||||
// into a dense keyspace before calling this function.
|
||||
partitions
|
||||
.parts
|
||||
.extend(metadata_partition.into_dense().parts);
|
||||
}
|
||||
|
||||
let mut layers_to_upload = Vec::new();
|
||||
layers_to_upload.extend(
|
||||
self.create_image_layers(
|
||||
&partitions,
|
||||
&rel_partition,
|
||||
self.initdb_lsn,
|
||||
ImageLayerCreationMode::Initial,
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
);
|
||||
if !metadata_partition.parts.is_empty() {
|
||||
assert_eq!(
|
||||
metadata_partition.parts.len(),
|
||||
1,
|
||||
"currently sparse keyspace should only contain a single metadata keyspace"
|
||||
);
|
||||
layers_to_upload.extend(
|
||||
self.create_image_layers(
|
||||
// Safety: create_image_layers treat sparse keyspaces differently that it does not scan
|
||||
// every single key within the keyspace, and therefore, it's safe to force converting it
|
||||
// into a dense keyspace before calling this function.
|
||||
&metadata_partition.into_dense(),
|
||||
self.initdb_lsn,
|
||||
ImageLayerCreationMode::Initial,
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
);
|
||||
}
|
||||
|
||||
(layers_to_upload, None)
|
||||
} else {
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
//!
|
||||
//! The old legacy algorithm is implemented directly in `timeline.rs`.
|
||||
|
||||
use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque};
|
||||
use std::collections::{BinaryHeap, HashMap, HashSet};
|
||||
use std::ops::{Deref, Range};
|
||||
use std::sync::Arc;
|
||||
|
||||
@@ -16,12 +16,10 @@ use super::{
|
||||
|
||||
use anyhow::{anyhow, bail, Context};
|
||||
use bytes::Bytes;
|
||||
use enumset::EnumSet;
|
||||
use fail::fail_point;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::KEY_SIZE;
|
||||
use pageserver_api::keyspace::ShardedRange;
|
||||
use pageserver_api::models::CompactInfoResponse;
|
||||
use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
|
||||
use serde::Serialize;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -32,7 +30,6 @@ use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}
|
||||
use crate::page_cache;
|
||||
use crate::statvfs::Statvfs;
|
||||
use crate::tenant::checks::check_valid_layermap;
|
||||
use crate::tenant::gc_block::GcBlock;
|
||||
use crate::tenant::remote_timeline_client::WaitCompletionError;
|
||||
use crate::tenant::storage_layer::batch_split_writer::{
|
||||
BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter,
|
||||
@@ -66,284 +63,16 @@ use super::CompactionError;
|
||||
/// Maximum number of deltas before generating an image layer in bottom-most compaction.
|
||||
const COMPACTION_DELTA_THRESHOLD: usize = 5;
|
||||
|
||||
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
|
||||
pub struct GcCompactionJobId(pub usize);
|
||||
|
||||
impl std::fmt::Display for GcCompactionJobId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum GcCompactionQueueItem {
|
||||
Manual(CompactOptions),
|
||||
SubCompactionJob(CompactOptions),
|
||||
#[allow(dead_code)]
|
||||
UpdateL2Lsn(Lsn),
|
||||
Notify(GcCompactionJobId),
|
||||
}
|
||||
|
||||
impl GcCompactionQueueItem {
|
||||
pub fn into_compact_info_resp(
|
||||
self,
|
||||
id: GcCompactionJobId,
|
||||
running: bool,
|
||||
) -> Option<CompactInfoResponse> {
|
||||
match self {
|
||||
GcCompactionQueueItem::Manual(options) => Some(CompactInfoResponse {
|
||||
compact_key_range: options.compact_key_range,
|
||||
compact_lsn_range: options.compact_lsn_range,
|
||||
sub_compaction: options.sub_compaction,
|
||||
running,
|
||||
job_id: id.0,
|
||||
}),
|
||||
GcCompactionQueueItem::SubCompactionJob(options) => Some(CompactInfoResponse {
|
||||
compact_key_range: options.compact_key_range,
|
||||
compact_lsn_range: options.compact_lsn_range,
|
||||
sub_compaction: options.sub_compaction,
|
||||
running,
|
||||
job_id: id.0,
|
||||
}),
|
||||
GcCompactionQueueItem::UpdateL2Lsn(_) => None,
|
||||
GcCompactionQueueItem::Notify(_) => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct GcCompactionQueueInner {
|
||||
running: Option<(GcCompactionJobId, GcCompactionQueueItem)>,
|
||||
queued: VecDeque<(GcCompactionJobId, GcCompactionQueueItem)>,
|
||||
notify: HashMap<GcCompactionJobId, tokio::sync::oneshot::Sender<()>>,
|
||||
gc_guards: HashMap<GcCompactionJobId, gc_block::Guard>,
|
||||
last_id: GcCompactionJobId,
|
||||
}
|
||||
|
||||
impl GcCompactionQueueInner {
|
||||
fn next_id(&mut self) -> GcCompactionJobId {
|
||||
let id = self.last_id;
|
||||
self.last_id = GcCompactionJobId(id.0 + 1);
|
||||
id
|
||||
}
|
||||
}
|
||||
|
||||
/// A structure to store gc_compaction jobs.
|
||||
pub struct GcCompactionQueue {
|
||||
/// All items in the queue, and the currently-running job.
|
||||
inner: std::sync::Mutex<GcCompactionQueueInner>,
|
||||
/// Ensure only one thread is consuming the queue.
|
||||
consumer_lock: tokio::sync::Mutex<()>,
|
||||
}
|
||||
|
||||
impl GcCompactionQueue {
|
||||
pub fn new() -> Self {
|
||||
GcCompactionQueue {
|
||||
inner: std::sync::Mutex::new(GcCompactionQueueInner {
|
||||
running: None,
|
||||
queued: VecDeque::new(),
|
||||
notify: HashMap::new(),
|
||||
gc_guards: HashMap::new(),
|
||||
last_id: GcCompactionJobId(0),
|
||||
}),
|
||||
consumer_lock: tokio::sync::Mutex::new(()),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn cancel_scheduled(&self) {
|
||||
let mut guard = self.inner.lock().unwrap();
|
||||
guard.queued.clear();
|
||||
guard.notify.clear();
|
||||
guard.gc_guards.clear();
|
||||
}
|
||||
|
||||
/// Schedule a manual compaction job.
|
||||
pub fn schedule_manual_compaction(
|
||||
&self,
|
||||
options: CompactOptions,
|
||||
notify: Option<tokio::sync::oneshot::Sender<()>>,
|
||||
) -> GcCompactionJobId {
|
||||
let mut guard = self.inner.lock().unwrap();
|
||||
let id = guard.next_id();
|
||||
guard
|
||||
.queued
|
||||
.push_back((id, GcCompactionQueueItem::Manual(options)));
|
||||
if let Some(notify) = notify {
|
||||
guard.notify.insert(id, notify);
|
||||
}
|
||||
info!("scheduled compaction job id={}", id);
|
||||
id
|
||||
}
|
||||
|
||||
/// Trigger an auto compaction.
|
||||
#[allow(dead_code)]
|
||||
pub fn trigger_auto_compaction(&self, _: &Arc<Timeline>) {}
|
||||
|
||||
/// Notify the caller the job has finished and unblock GC.
|
||||
fn notify_and_unblock(&self, id: GcCompactionJobId) {
|
||||
info!("compaction job id={} finished", id);
|
||||
let mut guard = self.inner.lock().unwrap();
|
||||
if let Some(blocking) = guard.gc_guards.remove(&id) {
|
||||
drop(blocking)
|
||||
}
|
||||
if let Some(tx) = guard.notify.remove(&id) {
|
||||
let _ = tx.send(());
|
||||
}
|
||||
}
|
||||
|
||||
async fn handle_sub_compaction(
|
||||
&self,
|
||||
id: GcCompactionJobId,
|
||||
options: CompactOptions,
|
||||
timeline: &Arc<Timeline>,
|
||||
gc_block: &GcBlock,
|
||||
) -> Result<(), CompactionError> {
|
||||
info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs");
|
||||
let jobs: Vec<GcCompactJob> = timeline
|
||||
.gc_compaction_split_jobs(
|
||||
GcCompactJob::from_compact_options(options.clone()),
|
||||
options.sub_compaction_max_job_size_mb,
|
||||
)
|
||||
.await
|
||||
.map_err(CompactionError::Other)?;
|
||||
if jobs.is_empty() {
|
||||
info!("no jobs to run, skipping scheduled compaction task");
|
||||
self.notify_and_unblock(id);
|
||||
} else {
|
||||
let gc_guard = match gc_block.start().await {
|
||||
Ok(guard) => guard,
|
||||
Err(e) => {
|
||||
return Err(CompactionError::Other(anyhow!(
|
||||
"cannot run gc-compaction because gc is blocked: {}",
|
||||
e
|
||||
)));
|
||||
}
|
||||
};
|
||||
|
||||
let jobs_len = jobs.len();
|
||||
let mut pending_tasks = Vec::new();
|
||||
for job in jobs {
|
||||
// Unfortunately we need to convert the `GcCompactJob` back to `CompactionOptions`
|
||||
// until we do further refactors to allow directly call `compact_with_gc`.
|
||||
let mut flags: EnumSet<CompactFlags> = EnumSet::default();
|
||||
flags |= CompactFlags::EnhancedGcBottomMostCompaction;
|
||||
if job.dry_run {
|
||||
flags |= CompactFlags::DryRun;
|
||||
}
|
||||
let options = CompactOptions {
|
||||
flags,
|
||||
sub_compaction: false,
|
||||
compact_key_range: Some(job.compact_key_range.into()),
|
||||
compact_lsn_range: Some(job.compact_lsn_range.into()),
|
||||
sub_compaction_max_job_size_mb: None,
|
||||
};
|
||||
pending_tasks.push(GcCompactionQueueItem::SubCompactionJob(options));
|
||||
}
|
||||
pending_tasks.push(GcCompactionQueueItem::Notify(id));
|
||||
{
|
||||
let mut guard = self.inner.lock().unwrap();
|
||||
guard.gc_guards.insert(id, gc_guard);
|
||||
let mut tasks = Vec::new();
|
||||
for task in pending_tasks {
|
||||
let id = guard.next_id();
|
||||
tasks.push((id, task));
|
||||
}
|
||||
tasks.reverse();
|
||||
for item in tasks {
|
||||
guard.queued.push_front(item);
|
||||
}
|
||||
}
|
||||
info!("scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Take a job from the queue and process it. Returns if there are still pending tasks.
|
||||
pub async fn iteration(
|
||||
&self,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
gc_block: &GcBlock,
|
||||
timeline: &Arc<Timeline>,
|
||||
) -> Result<bool, CompactionError> {
|
||||
let _one_op_at_a_time_guard = self.consumer_lock.lock().await;
|
||||
let has_pending_tasks;
|
||||
let (id, item) = {
|
||||
let mut guard = self.inner.lock().unwrap();
|
||||
let Some((id, item)) = guard.queued.pop_front() else {
|
||||
return Ok(false);
|
||||
};
|
||||
guard.running = Some((id, item.clone()));
|
||||
has_pending_tasks = !guard.queued.is_empty();
|
||||
(id, item)
|
||||
};
|
||||
|
||||
match item {
|
||||
GcCompactionQueueItem::Manual(options) => {
|
||||
if !options
|
||||
.flags
|
||||
.contains(CompactFlags::EnhancedGcBottomMostCompaction)
|
||||
{
|
||||
warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", options);
|
||||
} else if options.sub_compaction {
|
||||
self.handle_sub_compaction(id, options, timeline, gc_block)
|
||||
.await?;
|
||||
} else {
|
||||
let gc_guard = match gc_block.start().await {
|
||||
Ok(guard) => guard,
|
||||
Err(e) => {
|
||||
return Err(CompactionError::Other(anyhow!(
|
||||
"cannot run gc-compaction because gc is blocked: {}",
|
||||
e
|
||||
)));
|
||||
}
|
||||
};
|
||||
{
|
||||
let mut guard = self.inner.lock().unwrap();
|
||||
guard.gc_guards.insert(id, gc_guard);
|
||||
}
|
||||
let _ = timeline
|
||||
.compact_with_options(cancel, options, ctx)
|
||||
.instrument(info_span!("scheduled_compact_timeline", %timeline.timeline_id))
|
||||
.await?;
|
||||
self.notify_and_unblock(id);
|
||||
}
|
||||
}
|
||||
GcCompactionQueueItem::SubCompactionJob(options) => {
|
||||
let _ = timeline
|
||||
.compact_with_options(cancel, options, ctx)
|
||||
.instrument(info_span!("scheduled_compact_timeline", %timeline.timeline_id))
|
||||
.await?;
|
||||
}
|
||||
GcCompactionQueueItem::Notify(id) => {
|
||||
self.notify_and_unblock(id);
|
||||
}
|
||||
GcCompactionQueueItem::UpdateL2Lsn(_) => {
|
||||
unreachable!()
|
||||
}
|
||||
}
|
||||
{
|
||||
let mut guard = self.inner.lock().unwrap();
|
||||
guard.running = None;
|
||||
}
|
||||
Ok(has_pending_tasks)
|
||||
}
|
||||
|
||||
#[allow(clippy::type_complexity)]
|
||||
pub fn remaining_jobs(
|
||||
&self,
|
||||
) -> (
|
||||
Option<(GcCompactionJobId, GcCompactionQueueItem)>,
|
||||
VecDeque<(GcCompactionJobId, GcCompactionQueueItem)>,
|
||||
) {
|
||||
let guard = self.inner.lock().unwrap();
|
||||
(guard.running.clone(), guard.queued.clone())
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub fn remaining_jobs_num(&self) -> usize {
|
||||
let guard = self.inner.lock().unwrap();
|
||||
guard.queued.len() + if guard.running.is_some() { 1 } else { 0 }
|
||||
}
|
||||
/// A scheduled compaction task.
|
||||
pub(crate) struct ScheduledCompactionTask {
|
||||
/// It's unfortunate that we need to store a compact options struct here because the only outer
|
||||
/// API we can call here is `compact_with_options` which does a few setup calls before starting the
|
||||
/// actual compaction job... We should refactor this to store `GcCompactionJob` in the future.
|
||||
pub options: CompactOptions,
|
||||
/// The channel to send the compaction result. If this is a subcompaction, the last compaction job holds the sender.
|
||||
pub result_tx: Option<tokio::sync::oneshot::Sender<()>>,
|
||||
/// Hold the GC block. If this is a subcompaction, the last compaction job holds the gc block guard.
|
||||
pub gc_block: Option<gc_block::Guard>,
|
||||
}
|
||||
|
||||
/// A job description for the gc-compaction job. This structure describes the rectangle range that the job will
|
||||
|
||||
@@ -194,9 +194,7 @@ impl DeleteTimelineFlow {
|
||||
super::debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
let allow_offloaded_children = false;
|
||||
let set_stopping = true;
|
||||
let (timeline, mut guard) =
|
||||
Self::prepare(tenant, timeline_id, allow_offloaded_children, set_stopping)?;
|
||||
let (timeline, mut guard) = Self::prepare(tenant, timeline_id, allow_offloaded_children)?;
|
||||
|
||||
guard.mark_in_progress()?;
|
||||
|
||||
@@ -301,7 +299,6 @@ impl DeleteTimelineFlow {
|
||||
TimelineResources {
|
||||
remote_client,
|
||||
pagestream_throttle: tenant.pagestream_throttle.clone(),
|
||||
pagestream_throttle_metrics: tenant.pagestream_throttle_metrics.clone(),
|
||||
l0_flush_global_state: tenant.l0_flush_global_state.clone(),
|
||||
},
|
||||
// Important. We dont pass ancestor above because it can be missing.
|
||||
@@ -337,7 +334,6 @@ impl DeleteTimelineFlow {
|
||||
tenant: &Tenant,
|
||||
timeline_id: TimelineId,
|
||||
allow_offloaded_children: bool,
|
||||
set_stopping: bool,
|
||||
) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> {
|
||||
// Note the interaction between this guard and deletion guard.
|
||||
// Here we attempt to lock deletion guard when we're holding a lock on timelines.
|
||||
@@ -393,10 +389,8 @@ impl DeleteTimelineFlow {
|
||||
}
|
||||
};
|
||||
|
||||
if set_stopping {
|
||||
if let TimelineOrOffloaded::Timeline(timeline) = &timeline {
|
||||
timeline.set_state(TimelineState::Stopping);
|
||||
}
|
||||
if let TimelineOrOffloaded::Timeline(timeline) = &timeline {
|
||||
timeline.set_state(TimelineState::Stopping);
|
||||
}
|
||||
|
||||
Ok((timeline, delete_lock_guard))
|
||||
|
||||
@@ -1,11 +1,10 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use pageserver_api::models::{TenantState, TimelineState};
|
||||
use pageserver_api::models::TenantState;
|
||||
|
||||
use super::delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard};
|
||||
use super::Timeline;
|
||||
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::remote_timeline_client::ShutdownIfArchivedError;
|
||||
use crate::tenant::{OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded};
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
@@ -37,29 +36,28 @@ pub(crate) async fn offload_timeline(
|
||||
tracing::info!("offloading archived timeline");
|
||||
|
||||
let allow_offloaded_children = true;
|
||||
let set_stopping = false;
|
||||
let (timeline, guard) = DeleteTimelineFlow::prepare(
|
||||
tenant,
|
||||
timeline.timeline_id,
|
||||
allow_offloaded_children,
|
||||
set_stopping,
|
||||
)
|
||||
.map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?;
|
||||
let (timeline, guard) =
|
||||
DeleteTimelineFlow::prepare(tenant, timeline.timeline_id, allow_offloaded_children)
|
||||
.map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?;
|
||||
|
||||
let TimelineOrOffloaded::Timeline(timeline) = timeline else {
|
||||
tracing::error!("timeline already offloaded, but given timeline object");
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
match timeline.remote_client.shutdown_if_archived().await {
|
||||
Ok(()) => {}
|
||||
Err(ShutdownIfArchivedError::NotInitialized(_)) => {
|
||||
// Either the timeline is being deleted, the operation is being retried, or we are shutting down.
|
||||
// Don't return cancelled here to keep it idempotent.
|
||||
let is_archived = timeline.is_archived();
|
||||
match is_archived {
|
||||
Some(true) => (),
|
||||
Some(false) => {
|
||||
tracing::warn!("tried offloading a non-archived timeline");
|
||||
return Err(OffloadError::NotArchived);
|
||||
}
|
||||
None => {
|
||||
// This is legal: calls to this function can race with the timeline shutting down
|
||||
tracing::info!("tried offloading a timeline whose remote storage is not initialized");
|
||||
return Err(OffloadError::Cancelled);
|
||||
}
|
||||
Err(ShutdownIfArchivedError::NotArchived) => return Err(OffloadError::NotArchived),
|
||||
}
|
||||
timeline.set_state(TimelineState::Stopping);
|
||||
|
||||
// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
|
||||
timeline.shutdown(super::ShutdownMode::Reload).await;
|
||||
|
||||
@@ -319,11 +319,27 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
async fn commit(
|
||||
modification: &mut DatadirModification<'_>,
|
||||
uncommitted: &mut u64,
|
||||
filtered: &mut u64,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
WAL_INGEST
|
||||
.records_committed
|
||||
.inc_by(*uncommitted - *filtered);
|
||||
modification.commit(ctx).await?;
|
||||
*uncommitted = 0;
|
||||
*filtered = 0;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
let status_update = match replication_message {
|
||||
ReplicationMessage::RawInterpretedWalRecords(raw) => {
|
||||
WAL_INGEST.bytes_received.inc_by(raw.data().len() as u64);
|
||||
|
||||
let mut uncommitted_records = 0;
|
||||
let mut filtered_records = 0;
|
||||
|
||||
// This is the end LSN of the raw WAL from which the records
|
||||
// were interpreted.
|
||||
@@ -364,23 +380,31 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes)
|
||||
&& uncommitted_records > 0
|
||||
{
|
||||
modification.commit(&ctx).await?;
|
||||
uncommitted_records = 0;
|
||||
commit(
|
||||
&mut modification,
|
||||
&mut uncommitted_records,
|
||||
&mut filtered_records,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
|
||||
let local_next_record_lsn = interpreted.next_record_lsn;
|
||||
|
||||
if interpreted.is_observed() {
|
||||
WAL_INGEST.records_observed.inc();
|
||||
}
|
||||
|
||||
walingest
|
||||
let ingested = walingest
|
||||
.ingest_record(interpreted, &mut modification, &ctx)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("could not ingest record at {local_next_record_lsn}")
|
||||
})?;
|
||||
|
||||
if !ingested {
|
||||
tracing::debug!(
|
||||
"ingest: filtered out record @ LSN {local_next_record_lsn}"
|
||||
);
|
||||
WAL_INGEST.records_filtered.inc();
|
||||
filtered_records += 1;
|
||||
}
|
||||
|
||||
uncommitted_records += 1;
|
||||
|
||||
// FIXME: this cannot be made pausable_failpoint without fixing the
|
||||
@@ -394,8 +418,13 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
|| modification.approx_pending_bytes()
|
||||
> DatadirModification::MAX_PENDING_BYTES
|
||||
{
|
||||
modification.commit(&ctx).await?;
|
||||
uncommitted_records = 0;
|
||||
commit(
|
||||
&mut modification,
|
||||
&mut uncommitted_records,
|
||||
&mut filtered_records,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -403,7 +432,7 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
// need to advance last record LSN on all shards. If we've not ingested the latest
|
||||
// record, then set the LSN of the modification past it. This way all shards
|
||||
// advance their last record LSN at the same time.
|
||||
let needs_last_record_lsn_advance = match next_record_lsn {
|
||||
let needs_last_record_lsn_advance = match next_record_lsn.map(Lsn::from) {
|
||||
Some(lsn) if lsn > modification.get_lsn() => {
|
||||
modification.set_lsn(lsn).unwrap();
|
||||
true
|
||||
@@ -413,7 +442,13 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
|
||||
if uncommitted_records > 0 || needs_last_record_lsn_advance {
|
||||
// Commit any uncommitted records
|
||||
modification.commit(&ctx).await?;
|
||||
commit(
|
||||
&mut modification,
|
||||
&mut uncommitted_records,
|
||||
&mut filtered_records,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
|
||||
if !caught_up && streaming_lsn >= end_of_wal {
|
||||
@@ -434,21 +469,6 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
}
|
||||
|
||||
ReplicationMessage::XLogData(xlog_data) => {
|
||||
async fn commit(
|
||||
modification: &mut DatadirModification<'_>,
|
||||
uncommitted: &mut u64,
|
||||
filtered: &mut u64,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
WAL_INGEST
|
||||
.records_committed
|
||||
.inc_by(*uncommitted - *filtered);
|
||||
modification.commit(ctx).await?;
|
||||
*uncommitted = 0;
|
||||
*filtered = 0;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Pass the WAL data to the decoder, and see if we can decode
|
||||
// more records as a result.
|
||||
let data = xlog_data.data();
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -308,7 +308,7 @@ impl WalIngest {
|
||||
epoch -= 1;
|
||||
}
|
||||
|
||||
Ok(((epoch as u64) << 32) | xid as u64)
|
||||
Ok((epoch as u64) << 32 | xid as u64)
|
||||
}
|
||||
|
||||
async fn ingest_clear_vm_bits(
|
||||
|
||||
26
pgxn/hnsw/Makefile
Normal file
26
pgxn/hnsw/Makefile
Normal file
@@ -0,0 +1,26 @@
|
||||
EXTENSION = hnsw
|
||||
EXTVERSION = 0.1.0
|
||||
|
||||
MODULE_big = hnsw
|
||||
DATA = $(wildcard *--*.sql)
|
||||
OBJS = hnsw.o hnswalg.o
|
||||
|
||||
TESTS = $(wildcard test/sql/*.sql)
|
||||
REGRESS = $(patsubst test/sql/%.sql,%,$(TESTS))
|
||||
REGRESS_OPTS = --inputdir=test --load-extension=hnsw
|
||||
|
||||
# For auto-vectorization:
|
||||
# - GCC (needs -ftree-vectorize OR -O3) - https://gcc.gnu.org/projects/tree-ssa/vectorization.html
|
||||
PG_CFLAGS += -O3
|
||||
PG_CXXFLAGS += -O3 -std=c++11
|
||||
PG_LDFLAGS += -lstdc++
|
||||
|
||||
all: $(EXTENSION)--$(EXTVERSION).sql
|
||||
|
||||
PG_CONFIG ?= pg_config
|
||||
PGXS := $(shell $(PG_CONFIG) --pgxs)
|
||||
include $(PGXS)
|
||||
|
||||
dist:
|
||||
mkdir -p dist
|
||||
git archive --format zip --prefix=$(EXTENSION)-$(EXTVERSION)/ --output dist/$(EXTENSION)-$(EXTVERSION).zip master
|
||||
25
pgxn/hnsw/README.md
Normal file
25
pgxn/hnsw/README.md
Normal file
@@ -0,0 +1,25 @@
|
||||
# Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors
|
||||
|
||||
This ANN extension of Postgres is based
|
||||
on [ivf-hnsw](https://github.com/dbaranchuk/ivf-hnsw.git) implementation of [HNSW](https://www.pinecone.io/learn/hnsw),
|
||||
the code for the current state-of-the-art billion-scale nearest neighbor search system presented in the paper:
|
||||
|
||||
[Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors](http://openaccess.thecvf.com/content_ECCV_2018/html/Dmitry_Baranchuk_Revisiting_the_Inverted_ECCV_2018_paper.html),
|
||||
<br>
|
||||
Dmitry Baranchuk, Artem Babenko, Yury Malkov
|
||||
|
||||
# Postgres extension
|
||||
|
||||
HNSW index is hold in memory (built on demand) and it's maxial size is limited
|
||||
by `maxelements` index parameter. Another required parameter is nubmer of dimensions (if it is not specified in column type).
|
||||
Optional parameter `ef` specifies number of neighbors which are considered during index construction and search (corresponds `efConstruction` and `efSearch` parameters
|
||||
described in the article).
|
||||
|
||||
# Example of usage:
|
||||
|
||||
```
|
||||
create extension hnsw;
|
||||
create table embeddings(id integer primary key, payload real[]);
|
||||
create index on embeddings using hnsw(payload) with (maxelements=1000000, dims=100, m=32);
|
||||
select id from embeddings order by payload <-> array[1.0, 2.0,...] limit 100;
|
||||
```
|
||||
29
pgxn/hnsw/hnsw--0.1.0.sql
Normal file
29
pgxn/hnsw/hnsw--0.1.0.sql
Normal file
@@ -0,0 +1,29 @@
|
||||
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
|
||||
\echo Use "CREATE EXTENSION hnsw" to load this file. \quit
|
||||
|
||||
-- functions
|
||||
|
||||
CREATE FUNCTION l2_distance(real[], real[]) RETURNS real
|
||||
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
|
||||
|
||||
-- operators
|
||||
|
||||
CREATE OPERATOR <-> (
|
||||
LEFTARG = real[], RIGHTARG = real[], PROCEDURE = l2_distance,
|
||||
COMMUTATOR = '<->'
|
||||
);
|
||||
|
||||
-- access method
|
||||
|
||||
CREATE FUNCTION hnsw_handler(internal) RETURNS index_am_handler
|
||||
AS 'MODULE_PATHNAME' LANGUAGE C;
|
||||
|
||||
CREATE ACCESS METHOD hnsw TYPE INDEX HANDLER hnsw_handler;
|
||||
|
||||
COMMENT ON ACCESS METHOD hnsw IS 'hnsw index access method';
|
||||
|
||||
-- opclasses
|
||||
|
||||
CREATE OPERATOR CLASS knn_ops
|
||||
DEFAULT FOR TYPE real[] USING hnsw AS
|
||||
OPERATOR 1 <-> (real[], real[]) FOR ORDER BY float_ops;
|
||||
590
pgxn/hnsw/hnsw.c
Normal file
590
pgxn/hnsw/hnsw.c
Normal file
@@ -0,0 +1,590 @@
|
||||
#include "postgres.h"
|
||||
|
||||
#include "access/amapi.h"
|
||||
#include "access/generic_xlog.h"
|
||||
#include "access/relation.h"
|
||||
#include "access/reloptions.h"
|
||||
#include "access/tableam.h"
|
||||
#include "catalog/index.h"
|
||||
#include "commands/vacuum.h"
|
||||
#include "nodes/execnodes.h"
|
||||
#include "storage/bufmgr.h"
|
||||
#include "utils/guc.h"
|
||||
#include "utils/selfuncs.h"
|
||||
|
||||
#include <math.h>
|
||||
#include <float.h>
|
||||
|
||||
#include "hnsw.h"
|
||||
|
||||
PG_MODULE_MAGIC;
|
||||
|
||||
typedef struct {
|
||||
int32 vl_len_; /* varlena header (do not touch directly!) */
|
||||
int dims;
|
||||
int maxelements;
|
||||
int efConstruction;
|
||||
int efSearch;
|
||||
int M;
|
||||
} HnswOptions;
|
||||
|
||||
static relopt_kind hnsw_relopt_kind;
|
||||
|
||||
typedef struct {
|
||||
HierarchicalNSW* hnsw;
|
||||
size_t curr;
|
||||
size_t n_results;
|
||||
ItemPointer results;
|
||||
} HnswScanOpaqueData;
|
||||
|
||||
typedef HnswScanOpaqueData* HnswScanOpaque;
|
||||
|
||||
typedef struct {
|
||||
Oid relid;
|
||||
uint32 status;
|
||||
HierarchicalNSW* hnsw;
|
||||
} HnswHashEntry;
|
||||
|
||||
|
||||
#define SH_PREFIX hnsw_index
|
||||
#define SH_ELEMENT_TYPE HnswHashEntry
|
||||
#define SH_KEY_TYPE Oid
|
||||
#define SH_KEY relid
|
||||
#define SH_STORE_HASH
|
||||
#define SH_GET_HASH(tb, a) ((a)->relid)
|
||||
#define SH_HASH_KEY(tb, key) (key)
|
||||
#define SH_EQUAL(tb, a, b) ((a) == (b))
|
||||
#define SH_SCOPE static inline
|
||||
#define SH_DEFINE
|
||||
#define SH_DECLARE
|
||||
#include "lib/simplehash.h"
|
||||
|
||||
#define INDEX_HASH_SIZE 11
|
||||
|
||||
#define DEFAULT_EF_SEARCH 64
|
||||
|
||||
PGDLLEXPORT void _PG_init(void);
|
||||
|
||||
static hnsw_index_hash *hnsw_indexes;
|
||||
|
||||
/*
|
||||
* Initialize index options and variables
|
||||
*/
|
||||
void
|
||||
_PG_init(void)
|
||||
{
|
||||
hnsw_relopt_kind = add_reloption_kind();
|
||||
add_int_reloption(hnsw_relopt_kind, "dims", "Number of dimensions",
|
||||
0, 0, INT_MAX, AccessExclusiveLock);
|
||||
add_int_reloption(hnsw_relopt_kind, "maxelements", "Maximal number of elements",
|
||||
0, 0, INT_MAX, AccessExclusiveLock);
|
||||
add_int_reloption(hnsw_relopt_kind, "m", "Number of neighbors of each vertex",
|
||||
100, 0, INT_MAX, AccessExclusiveLock);
|
||||
add_int_reloption(hnsw_relopt_kind, "efconstruction", "Number of inspected neighbors during index construction",
|
||||
16, 1, INT_MAX, AccessExclusiveLock);
|
||||
add_int_reloption(hnsw_relopt_kind, "efsearch", "Number of inspected neighbors during index search",
|
||||
64, 1, INT_MAX, AccessExclusiveLock);
|
||||
hnsw_indexes = hnsw_index_create(TopMemoryContext, INDEX_HASH_SIZE, NULL);
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
hnsw_build_callback(Relation index, ItemPointer tid, Datum *values,
|
||||
bool *isnull, bool tupleIsAlive, void *state)
|
||||
{
|
||||
HierarchicalNSW* hnsw = (HierarchicalNSW*) state;
|
||||
ArrayType* array;
|
||||
int n_items;
|
||||
label_t label = 0;
|
||||
|
||||
/* Skip nulls */
|
||||
if (isnull[0])
|
||||
return;
|
||||
|
||||
array = DatumGetArrayTypeP(values[0]);
|
||||
n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
|
||||
if (n_items != hnsw_dimensions(hnsw))
|
||||
{
|
||||
elog(ERROR, "Wrong number of dimensions: %d instead of %d expected",
|
||||
n_items, hnsw_dimensions(hnsw));
|
||||
}
|
||||
|
||||
memcpy(&label, tid, sizeof(*tid));
|
||||
hnsw_add_point(hnsw, (coord_t*)ARR_DATA_PTR(array), label);
|
||||
}
|
||||
|
||||
static void
|
||||
hnsw_populate(HierarchicalNSW* hnsw, Relation indexRel, Relation heapRel)
|
||||
{
|
||||
IndexInfo* indexInfo = BuildIndexInfo(indexRel);
|
||||
Assert(indexInfo->ii_NumIndexAttrs == 1);
|
||||
table_index_build_scan(heapRel, indexRel, indexInfo,
|
||||
true, true, hnsw_build_callback, (void *) hnsw, NULL);
|
||||
}
|
||||
|
||||
#ifdef __APPLE__
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <sys/sysctl.h>
|
||||
|
||||
static void
|
||||
hnsw_check_available_memory(Size requested)
|
||||
{
|
||||
size_t total;
|
||||
if (sysctlbyname("hw.memsize", NULL, &total, NULL, 0) < 0)
|
||||
elog(ERROR, "Failed to get amount of RAM: %m");
|
||||
|
||||
if ((Size)NBuffers*BLCKSZ + requested >= total)
|
||||
elog(ERROR, "HNSW index requeries %ld bytes while only %ld are available",
|
||||
requested, total - (Size)NBuffers*BLCKSZ);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#include <sys/sysinfo.h>
|
||||
|
||||
static void
|
||||
hnsw_check_available_memory(Size requested)
|
||||
{
|
||||
struct sysinfo si;
|
||||
Size total;
|
||||
if (sysinfo(&si) < 0)
|
||||
elog(ERROR, "Failed to get amount of RAM: %m");
|
||||
|
||||
total = si.totalram*si.mem_unit;
|
||||
if ((Size)NBuffers*BLCKSZ + requested >= total)
|
||||
elog(ERROR, "HNSW index requeries %ld bytes while only %ld are available",
|
||||
requested, total - (Size)NBuffers*BLCKSZ);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
static HierarchicalNSW*
|
||||
hnsw_get_index(Relation indexRel, Relation heapRel)
|
||||
{
|
||||
HierarchicalNSW* hnsw;
|
||||
Oid indexoid = RelationGetRelid(indexRel);
|
||||
HnswHashEntry* entry = hnsw_index_lookup(hnsw_indexes, indexoid);
|
||||
if (entry == NULL)
|
||||
{
|
||||
size_t dims, maxelements;
|
||||
size_t M;
|
||||
size_t maxM;
|
||||
size_t size_links_level0;
|
||||
size_t size_data_per_element;
|
||||
size_t data_size;
|
||||
dsm_handle handle = indexoid << 1; /* make it even */
|
||||
void* impl_private = NULL;
|
||||
void* mapped_address = NULL;
|
||||
Size mapped_size = 0;
|
||||
Size shmem_size;
|
||||
bool exists = true;
|
||||
bool found;
|
||||
HnswOptions *opts = (HnswOptions *) indexRel->rd_options;
|
||||
if (opts == NULL || opts->maxelements == 0 || opts->dims == 0) {
|
||||
elog(ERROR, "HNSW index requires 'maxelements' and 'dims' to be specified");
|
||||
}
|
||||
dims = opts->dims;
|
||||
maxelements = opts->maxelements;
|
||||
M = opts->M;
|
||||
maxM = M * 2;
|
||||
data_size = dims * sizeof(coord_t);
|
||||
size_links_level0 = (maxM + 1) * sizeof(idx_t);
|
||||
size_data_per_element = size_links_level0 + data_size + sizeof(label_t);
|
||||
shmem_size = hnsw_sizeof() + maxelements * size_data_per_element;
|
||||
|
||||
hnsw_check_available_memory(shmem_size);
|
||||
|
||||
/* first try to attach to existed index */
|
||||
if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private,
|
||||
&mapped_address, &mapped_size, DEBUG1))
|
||||
{
|
||||
/* index doesn't exists: try to create it */
|
||||
if (!dsm_impl_op(DSM_OP_CREATE, handle, shmem_size, &impl_private,
|
||||
&mapped_address, &mapped_size, DEBUG1))
|
||||
{
|
||||
/* We can do it under shared lock, so some other backend may
|
||||
* try to initialize index. If create is failed because index already
|
||||
* created by somebody else, then try to attach to it once again
|
||||
*/
|
||||
if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private,
|
||||
&mapped_address, &mapped_size, ERROR))
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
exists = false;
|
||||
}
|
||||
}
|
||||
Assert(mapped_size == shmem_size);
|
||||
hnsw = (HierarchicalNSW*)mapped_address;
|
||||
|
||||
if (!exists)
|
||||
{
|
||||
hnsw_init(hnsw, dims, maxelements, M, maxM, opts->efConstruction);
|
||||
hnsw_populate(hnsw, indexRel, heapRel);
|
||||
}
|
||||
entry = hnsw_index_insert(hnsw_indexes, indexoid, &found);
|
||||
Assert(!found);
|
||||
entry->hnsw = hnsw;
|
||||
}
|
||||
else
|
||||
{
|
||||
hnsw = entry->hnsw;
|
||||
}
|
||||
return hnsw;
|
||||
}
|
||||
|
||||
/*
|
||||
* Start or restart an index scan
|
||||
*/
|
||||
static IndexScanDesc
|
||||
hnsw_beginscan(Relation index, int nkeys, int norderbys)
|
||||
{
|
||||
IndexScanDesc scan = RelationGetIndexScan(index, nkeys, norderbys);
|
||||
HnswScanOpaque so = (HnswScanOpaque) palloc(sizeof(HnswScanOpaqueData));
|
||||
Relation heap = relation_open(index->rd_index->indrelid, NoLock);
|
||||
so->hnsw = hnsw_get_index(index, heap);
|
||||
relation_close(heap, NoLock);
|
||||
so->curr = 0;
|
||||
so->n_results = 0;
|
||||
so->results = NULL;
|
||||
scan->opaque = so;
|
||||
return scan;
|
||||
}
|
||||
|
||||
/*
|
||||
* Start or restart an index scan
|
||||
*/
|
||||
static void
|
||||
hnsw_rescan(IndexScanDesc scan, ScanKey keys, int nkeys, ScanKey orderbys, int norderbys)
|
||||
{
|
||||
HnswScanOpaque so = (HnswScanOpaque) scan->opaque;
|
||||
if (so->results)
|
||||
{
|
||||
pfree(so->results);
|
||||
so->results = NULL;
|
||||
}
|
||||
so->curr = 0;
|
||||
if (orderbys && scan->numberOfOrderBys > 0)
|
||||
memmove(scan->orderByData, orderbys, scan->numberOfOrderBys * sizeof(ScanKeyData));
|
||||
}
|
||||
|
||||
/*
|
||||
* Fetch the next tuple in the given scan
|
||||
*/
|
||||
static bool
|
||||
hnsw_gettuple(IndexScanDesc scan, ScanDirection dir)
|
||||
{
|
||||
HnswScanOpaque so = (HnswScanOpaque) scan->opaque;
|
||||
|
||||
/*
|
||||
* Index can be used to scan backward, but Postgres doesn't support
|
||||
* backward scan on operators
|
||||
*/
|
||||
Assert(ScanDirectionIsForward(dir));
|
||||
|
||||
if (so->curr == 0)
|
||||
{
|
||||
Datum value;
|
||||
ArrayType* array;
|
||||
int n_items;
|
||||
size_t n_results;
|
||||
label_t* results;
|
||||
HnswOptions *opts = (HnswOptions *) scan->indexRelation->rd_options;
|
||||
size_t efSearch = opts ? opts->efSearch : DEFAULT_EF_SEARCH;
|
||||
|
||||
/* Safety check */
|
||||
if (scan->orderByData == NULL)
|
||||
elog(ERROR, "cannot scan HNSW index without order");
|
||||
|
||||
/* No items will match if null */
|
||||
if (scan->orderByData->sk_flags & SK_ISNULL)
|
||||
return false;
|
||||
|
||||
value = scan->orderByData->sk_argument;
|
||||
array = DatumGetArrayTypeP(value);
|
||||
n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
|
||||
if (n_items != hnsw_dimensions(so->hnsw))
|
||||
{
|
||||
elog(ERROR, "Wrong number of dimensions: %d instead of %d expected",
|
||||
n_items, hnsw_dimensions(so->hnsw));
|
||||
}
|
||||
|
||||
if (!hnsw_search(so->hnsw, (coord_t*)ARR_DATA_PTR(array), efSearch, &n_results, &results))
|
||||
elog(ERROR, "HNSW index search failed");
|
||||
so->results = (ItemPointer)palloc(n_results*sizeof(ItemPointerData));
|
||||
so->n_results = n_results;
|
||||
for (size_t i = 0; i < n_results; i++)
|
||||
{
|
||||
memcpy(&so->results[i], &results[i], sizeof(so->results[i]));
|
||||
}
|
||||
free(results);
|
||||
}
|
||||
if (so->curr >= so->n_results)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
else
|
||||
{
|
||||
scan->xs_heaptid = so->results[so->curr++];
|
||||
scan->xs_recheckorderby = false;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* End a scan and release resources
|
||||
*/
|
||||
static void
|
||||
hnsw_endscan(IndexScanDesc scan)
|
||||
{
|
||||
HnswScanOpaque so = (HnswScanOpaque) scan->opaque;
|
||||
if (so->results)
|
||||
pfree(so->results);
|
||||
pfree(so);
|
||||
scan->opaque = NULL;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Estimate the cost of an index scan
|
||||
*/
|
||||
static void
|
||||
hnsw_costestimate(PlannerInfo *root, IndexPath *path, double loop_count,
|
||||
Cost *indexStartupCost, Cost *indexTotalCost,
|
||||
Selectivity *indexSelectivity, double *indexCorrelation
|
||||
,double *indexPages
|
||||
)
|
||||
{
|
||||
GenericCosts costs;
|
||||
|
||||
/* Never use index without order */
|
||||
if (path->indexorderbys == NULL)
|
||||
{
|
||||
*indexStartupCost = DBL_MAX;
|
||||
*indexTotalCost = DBL_MAX;
|
||||
*indexSelectivity = 0;
|
||||
*indexCorrelation = 0;
|
||||
*indexPages = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
MemSet(&costs, 0, sizeof(costs));
|
||||
|
||||
genericcostestimate(root, path, loop_count, &costs);
|
||||
|
||||
/* Startup cost and total cost are same */
|
||||
*indexStartupCost = costs.indexTotalCost;
|
||||
*indexTotalCost = costs.indexTotalCost;
|
||||
*indexSelectivity = costs.indexSelectivity;
|
||||
*indexCorrelation = costs.indexCorrelation;
|
||||
*indexPages = costs.numIndexPages;
|
||||
}
|
||||
|
||||
/*
|
||||
* Parse and validate the reloptions
|
||||
*/
|
||||
static bytea *
|
||||
hnsw_options(Datum reloptions, bool validate)
|
||||
{
|
||||
static const relopt_parse_elt tab[] = {
|
||||
{"dims", RELOPT_TYPE_INT, offsetof(HnswOptions, dims)},
|
||||
{"maxelements", RELOPT_TYPE_INT, offsetof(HnswOptions, maxelements)},
|
||||
{"efconstruction", RELOPT_TYPE_INT, offsetof(HnswOptions, efConstruction)},
|
||||
{"efsearch", RELOPT_TYPE_INT, offsetof(HnswOptions, efSearch)},
|
||||
{"m", RELOPT_TYPE_INT, offsetof(HnswOptions, M)}
|
||||
};
|
||||
|
||||
return (bytea *) build_reloptions(reloptions, validate,
|
||||
hnsw_relopt_kind,
|
||||
sizeof(HnswOptions),
|
||||
tab, lengthof(tab));
|
||||
}
|
||||
|
||||
/*
|
||||
* Validate catalog entries for the specified operator class
|
||||
*/
|
||||
static bool
|
||||
hnsw_validate(Oid opclassoid)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Build the index for a logged table
|
||||
*/
|
||||
static IndexBuildResult *
|
||||
hnsw_build(Relation heap, Relation index, IndexInfo *indexInfo)
|
||||
{
|
||||
HierarchicalNSW* hnsw = hnsw_get_index(index, heap);
|
||||
IndexBuildResult* result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
|
||||
result->heap_tuples = result->index_tuples = hnsw_count(hnsw);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/*
|
||||
* Insert a tuple into the index
|
||||
*/
|
||||
static bool
|
||||
hnsw_insert(Relation index, Datum *values, bool *isnull, ItemPointer heap_tid,
|
||||
Relation heap, IndexUniqueCheck checkUnique,
|
||||
bool indexUnchanged,
|
||||
IndexInfo *indexInfo)
|
||||
{
|
||||
HierarchicalNSW* hnsw = hnsw_get_index(index, heap);
|
||||
Datum value;
|
||||
ArrayType* array;
|
||||
int n_items;
|
||||
label_t label = 0;
|
||||
|
||||
/* Skip nulls */
|
||||
if (isnull[0])
|
||||
return false;
|
||||
|
||||
/* Detoast value */
|
||||
value = PointerGetDatum(PG_DETOAST_DATUM(values[0]));
|
||||
array = DatumGetArrayTypeP(value);
|
||||
n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
|
||||
if (n_items != hnsw_dimensions(hnsw))
|
||||
{
|
||||
elog(ERROR, "Wrong number of dimensions: %d instead of %d expected",
|
||||
n_items, hnsw_dimensions(hnsw));
|
||||
}
|
||||
memcpy(&label, heap_tid, sizeof(*heap_tid));
|
||||
if (!hnsw_add_point(hnsw, (coord_t*)ARR_DATA_PTR(array), label))
|
||||
elog(ERROR, "HNSW index insert failed");
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Build the index for an unlogged table
|
||||
*/
|
||||
static void
|
||||
hnsw_buildempty(Relation index)
|
||||
{
|
||||
/* index will be constructed on dema nd when accessed */
|
||||
}
|
||||
|
||||
/*
|
||||
* Clean up after a VACUUM operation
|
||||
*/
|
||||
static IndexBulkDeleteResult *
|
||||
hnsw_vacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
|
||||
{
|
||||
Relation rel = info->index;
|
||||
|
||||
if (stats == NULL)
|
||||
return NULL;
|
||||
|
||||
stats->num_pages = RelationGetNumberOfBlocks(rel);
|
||||
|
||||
return stats;
|
||||
}
|
||||
|
||||
/*
|
||||
* Bulk delete tuples from the index
|
||||
*/
|
||||
static IndexBulkDeleteResult *
|
||||
hnsw_bulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
|
||||
IndexBulkDeleteCallback callback, void *callback_state)
|
||||
{
|
||||
if (stats == NULL)
|
||||
stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
|
||||
return stats;
|
||||
}
|
||||
|
||||
/*
|
||||
* Define index handler
|
||||
*
|
||||
* See https://www.postgresql.org/docs/current/index-api.html
|
||||
*/
|
||||
PGDLLEXPORT PG_FUNCTION_INFO_V1(hnsw_handler);
|
||||
Datum
|
||||
hnsw_handler(PG_FUNCTION_ARGS)
|
||||
{
|
||||
IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
|
||||
|
||||
amroutine->amstrategies = 0;
|
||||
amroutine->amsupport = 0;
|
||||
amroutine->amoptsprocnum = 0;
|
||||
amroutine->amcanorder = false;
|
||||
amroutine->amcanorderbyop = true;
|
||||
amroutine->amcanbackward = false; /* can change direction mid-scan */
|
||||
amroutine->amcanunique = false;
|
||||
amroutine->amcanmulticol = false;
|
||||
amroutine->amoptionalkey = true;
|
||||
amroutine->amsearcharray = false;
|
||||
amroutine->amsearchnulls = false;
|
||||
amroutine->amstorage = false;
|
||||
amroutine->amclusterable = false;
|
||||
amroutine->ampredlocks = false;
|
||||
amroutine->amcanparallel = false;
|
||||
amroutine->amcaninclude = false;
|
||||
amroutine->amusemaintenanceworkmem = false; /* not used during VACUUM */
|
||||
amroutine->amparallelvacuumoptions = VACUUM_OPTION_PARALLEL_BULKDEL;
|
||||
amroutine->amkeytype = InvalidOid;
|
||||
|
||||
/* Interface functions */
|
||||
amroutine->ambuild = hnsw_build;
|
||||
amroutine->ambuildempty = hnsw_buildempty;
|
||||
amroutine->aminsert = hnsw_insert;
|
||||
amroutine->ambulkdelete = hnsw_bulkdelete;
|
||||
amroutine->amvacuumcleanup = hnsw_vacuumcleanup;
|
||||
amroutine->amcanreturn = NULL; /* tuple not included in heapsort */
|
||||
amroutine->amcostestimate = hnsw_costestimate;
|
||||
amroutine->amoptions = hnsw_options;
|
||||
amroutine->amproperty = NULL; /* TODO AMPROP_DISTANCE_ORDERABLE */
|
||||
amroutine->ambuildphasename = NULL;
|
||||
amroutine->amvalidate = hnsw_validate;
|
||||
amroutine->amadjustmembers = NULL;
|
||||
amroutine->ambeginscan = hnsw_beginscan;
|
||||
amroutine->amrescan = hnsw_rescan;
|
||||
amroutine->amgettuple = hnsw_gettuple;
|
||||
amroutine->amgetbitmap = NULL;
|
||||
amroutine->amendscan = hnsw_endscan;
|
||||
amroutine->ammarkpos = NULL;
|
||||
amroutine->amrestrpos = NULL;
|
||||
|
||||
/* Interface functions to support parallel index scans */
|
||||
amroutine->amestimateparallelscan = NULL;
|
||||
amroutine->aminitparallelscan = NULL;
|
||||
amroutine->amparallelrescan = NULL;
|
||||
|
||||
PG_RETURN_POINTER(amroutine);
|
||||
}
|
||||
|
||||
/*
|
||||
* Get the L2 distance between vectors
|
||||
*/
|
||||
PGDLLEXPORT PG_FUNCTION_INFO_V1(l2_distance);
|
||||
Datum
|
||||
l2_distance(PG_FUNCTION_ARGS)
|
||||
{
|
||||
ArrayType *a = PG_GETARG_ARRAYTYPE_P(0);
|
||||
ArrayType *b = PG_GETARG_ARRAYTYPE_P(1);
|
||||
int a_dim = ArrayGetNItems(ARR_NDIM(a), ARR_DIMS(a));
|
||||
int b_dim = ArrayGetNItems(ARR_NDIM(b), ARR_DIMS(b));
|
||||
dist_t distance = 0.0;
|
||||
dist_t diff;
|
||||
coord_t *ax = (coord_t*)ARR_DATA_PTR(a);
|
||||
coord_t *bx = (coord_t*)ARR_DATA_PTR(b);
|
||||
|
||||
if (a_dim != b_dim)
|
||||
{
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_DATA_EXCEPTION),
|
||||
errmsg("different array dimensions %d and %d", a_dim, b_dim)));
|
||||
}
|
||||
|
||||
for (int i = 0; i < a_dim; i++)
|
||||
{
|
||||
diff = ax[i] - bx[i];
|
||||
distance += diff * diff;
|
||||
}
|
||||
|
||||
PG_RETURN_FLOAT4((dist_t)sqrt(distance));
|
||||
}
|
||||
4
pgxn/hnsw/hnsw.control
Normal file
4
pgxn/hnsw/hnsw.control
Normal file
@@ -0,0 +1,4 @@
|
||||
comment = '** Deprecated ** Please use pg_embedding instead'
|
||||
default_version = '0.1.0'
|
||||
module_pathname = '$libdir/hnsw'
|
||||
relocatable = true
|
||||
15
pgxn/hnsw/hnsw.h
Normal file
15
pgxn/hnsw/hnsw.h
Normal file
@@ -0,0 +1,15 @@
|
||||
#pragma once
|
||||
|
||||
typedef float coord_t;
|
||||
typedef float dist_t;
|
||||
typedef uint32_t idx_t;
|
||||
typedef uint64_t label_t;
|
||||
|
||||
typedef struct HierarchicalNSW HierarchicalNSW;
|
||||
|
||||
bool hnsw_search(HierarchicalNSW* hnsw, const coord_t *point, size_t efSearch, size_t* n_results, label_t** results);
|
||||
bool hnsw_add_point(HierarchicalNSW* hnsw, const coord_t *point, label_t label);
|
||||
void hnsw_init(HierarchicalNSW* hnsw, size_t dim, size_t maxelements, size_t M, size_t maxM, size_t efConstruction);
|
||||
int hnsw_dimensions(HierarchicalNSW* hnsw);
|
||||
size_t hnsw_count(HierarchicalNSW* hnsw);
|
||||
size_t hnsw_sizeof(void);
|
||||
379
pgxn/hnsw/hnswalg.cpp
Normal file
379
pgxn/hnsw/hnswalg.cpp
Normal file
@@ -0,0 +1,379 @@
|
||||
#include "hnswalg.h"
|
||||
|
||||
#if defined(__GNUC__)
|
||||
#define PORTABLE_ALIGN32 __attribute__((aligned(32)))
|
||||
#define PREFETCH(addr,hint) __builtin_prefetch(addr, 0, hint)
|
||||
#else
|
||||
#define PORTABLE_ALIGN32 __declspec(align(32))
|
||||
#define PREFETCH(addr,hint)
|
||||
#endif
|
||||
|
||||
HierarchicalNSW::HierarchicalNSW(size_t dim_, size_t maxelements_, size_t M_, size_t maxM_, size_t efConstruction_)
|
||||
{
|
||||
dim = dim_;
|
||||
data_size = dim * sizeof(coord_t);
|
||||
|
||||
efConstruction = efConstruction_;
|
||||
|
||||
maxelements = maxelements_;
|
||||
M = M_;
|
||||
maxM = maxM_;
|
||||
size_links_level0 = (maxM + 1) * sizeof(idx_t);
|
||||
size_data_per_element = size_links_level0 + data_size + sizeof(label_t);
|
||||
offset_data = size_links_level0;
|
||||
offset_label = offset_data + data_size;
|
||||
|
||||
enterpoint_node = 0;
|
||||
cur_element_count = 0;
|
||||
#ifdef __x86_64__
|
||||
use_avx2 = __builtin_cpu_supports("avx2");
|
||||
#endif
|
||||
}
|
||||
|
||||
std::priority_queue<std::pair<dist_t, idx_t>> HierarchicalNSW::searchBaseLayer(const coord_t *point, size_t ef)
|
||||
{
|
||||
std::vector<uint32_t> visited;
|
||||
visited.resize((cur_element_count + 31) >> 5);
|
||||
|
||||
std::priority_queue<std::pair<dist_t, idx_t >> topResults;
|
||||
std::priority_queue<std::pair<dist_t, idx_t >> candidateSet;
|
||||
|
||||
dist_t dist = fstdistfunc(point, getDataByInternalId(enterpoint_node));
|
||||
|
||||
topResults.emplace(dist, enterpoint_node);
|
||||
candidateSet.emplace(-dist, enterpoint_node);
|
||||
visited[enterpoint_node >> 5] = 1 << (enterpoint_node & 31);
|
||||
dist_t lowerBound = dist;
|
||||
|
||||
while (!candidateSet.empty())
|
||||
{
|
||||
std::pair<dist_t, idx_t> curr_el_pair = candidateSet.top();
|
||||
if (-curr_el_pair.first > lowerBound)
|
||||
break;
|
||||
|
||||
candidateSet.pop();
|
||||
idx_t curNodeNum = curr_el_pair.second;
|
||||
|
||||
idx_t* data = get_linklist0(curNodeNum);
|
||||
size_t size = *data++;
|
||||
|
||||
PREFETCH(getDataByInternalId(*data), 0);
|
||||
|
||||
for (size_t j = 0; j < size; ++j) {
|
||||
size_t tnum = *(data + j);
|
||||
|
||||
PREFETCH(getDataByInternalId(*(data + j + 1)), 0);
|
||||
|
||||
if (!(visited[tnum >> 5] & (1 << (tnum & 31)))) {
|
||||
visited[tnum >> 5] |= 1 << (tnum & 31);
|
||||
|
||||
dist = fstdistfunc(point, getDataByInternalId(tnum));
|
||||
|
||||
if (topResults.top().first > dist || topResults.size() < ef) {
|
||||
candidateSet.emplace(-dist, tnum);
|
||||
|
||||
PREFETCH(get_linklist0(candidateSet.top().second), 0);
|
||||
topResults.emplace(dist, tnum);
|
||||
|
||||
if (topResults.size() > ef)
|
||||
topResults.pop();
|
||||
|
||||
lowerBound = topResults.top().first;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return topResults;
|
||||
}
|
||||
|
||||
|
||||
void HierarchicalNSW::getNeighborsByHeuristic(std::priority_queue<std::pair<dist_t, idx_t>> &topResults, size_t NN)
|
||||
{
|
||||
if (topResults.size() < NN)
|
||||
return;
|
||||
|
||||
std::priority_queue<std::pair<dist_t, idx_t>> resultSet;
|
||||
std::vector<std::pair<dist_t, idx_t>> returnlist;
|
||||
|
||||
while (topResults.size() > 0) {
|
||||
resultSet.emplace(-topResults.top().first, topResults.top().second);
|
||||
topResults.pop();
|
||||
}
|
||||
|
||||
while (resultSet.size()) {
|
||||
if (returnlist.size() >= NN)
|
||||
break;
|
||||
std::pair<dist_t, idx_t> curen = resultSet.top();
|
||||
dist_t dist_to_query = -curen.first;
|
||||
resultSet.pop();
|
||||
bool good = true;
|
||||
for (std::pair<dist_t, idx_t> curen2 : returnlist) {
|
||||
dist_t curdist = fstdistfunc(getDataByInternalId(curen2.second),
|
||||
getDataByInternalId(curen.second));
|
||||
if (curdist < dist_to_query) {
|
||||
good = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (good) returnlist.push_back(curen);
|
||||
}
|
||||
for (std::pair<dist_t, idx_t> elem : returnlist)
|
||||
topResults.emplace(-elem.first, elem.second);
|
||||
}
|
||||
|
||||
void HierarchicalNSW::mutuallyConnectNewElement(const coord_t *point, idx_t cur_c,
|
||||
std::priority_queue<std::pair<dist_t, idx_t>> topResults)
|
||||
{
|
||||
getNeighborsByHeuristic(topResults, M);
|
||||
|
||||
std::vector<idx_t> res;
|
||||
res.reserve(M);
|
||||
while (topResults.size() > 0) {
|
||||
res.push_back(topResults.top().second);
|
||||
topResults.pop();
|
||||
}
|
||||
{
|
||||
idx_t* data = get_linklist0(cur_c);
|
||||
if (*data)
|
||||
throw std::runtime_error("Should be blank");
|
||||
|
||||
*data++ = res.size();
|
||||
|
||||
for (size_t idx = 0; idx < res.size(); idx++) {
|
||||
if (data[idx])
|
||||
throw std::runtime_error("Should be blank");
|
||||
data[idx] = res[idx];
|
||||
}
|
||||
}
|
||||
for (size_t idx = 0; idx < res.size(); idx++) {
|
||||
if (res[idx] == cur_c)
|
||||
throw std::runtime_error("Connection to the same element");
|
||||
|
||||
size_t resMmax = maxM;
|
||||
idx_t *ll_other = get_linklist0(res[idx]);
|
||||
idx_t sz_link_list_other = *ll_other;
|
||||
|
||||
if (sz_link_list_other > resMmax || sz_link_list_other < 0)
|
||||
throw std::runtime_error("Bad sz_link_list_other");
|
||||
|
||||
if (sz_link_list_other < resMmax) {
|
||||
idx_t *data = ll_other + 1;
|
||||
data[sz_link_list_other] = cur_c;
|
||||
*ll_other = sz_link_list_other + 1;
|
||||
} else {
|
||||
// finding the "weakest" element to replace it with the new one
|
||||
idx_t *data = ll_other + 1;
|
||||
dist_t d_max = fstdistfunc(getDataByInternalId(cur_c), getDataByInternalId(res[idx]));
|
||||
// Heuristic:
|
||||
std::priority_queue<std::pair<dist_t, idx_t>> candidates;
|
||||
candidates.emplace(d_max, cur_c);
|
||||
|
||||
for (size_t j = 0; j < sz_link_list_other; j++)
|
||||
candidates.emplace(fstdistfunc(getDataByInternalId(data[j]), getDataByInternalId(res[idx])), data[j]);
|
||||
|
||||
getNeighborsByHeuristic(candidates, resMmax);
|
||||
|
||||
size_t indx = 0;
|
||||
while (!candidates.empty()) {
|
||||
data[indx] = candidates.top().second;
|
||||
candidates.pop();
|
||||
indx++;
|
||||
}
|
||||
*ll_other = indx;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void HierarchicalNSW::addPoint(const coord_t *point, label_t label)
|
||||
{
|
||||
if (cur_element_count >= maxelements) {
|
||||
throw std::runtime_error("The number of elements exceeds the specified limit");
|
||||
}
|
||||
idx_t cur_c = cur_element_count++;
|
||||
memset((char *) get_linklist0(cur_c), 0, size_data_per_element);
|
||||
memcpy(getDataByInternalId(cur_c), point, data_size);
|
||||
memcpy(getExternalLabel(cur_c), &label, sizeof label);
|
||||
|
||||
// Do nothing for the first element
|
||||
if (cur_c != 0) {
|
||||
std::priority_queue <std::pair<dist_t, idx_t>> topResults = searchBaseLayer(point, efConstruction);
|
||||
mutuallyConnectNewElement(point, cur_c, topResults);
|
||||
}
|
||||
};
|
||||
|
||||
std::priority_queue<std::pair<dist_t, label_t>> HierarchicalNSW::searchKnn(const coord_t *query, size_t k)
|
||||
{
|
||||
std::priority_queue<std::pair<dist_t, label_t>> topResults;
|
||||
auto topCandidates = searchBaseLayer(query, k);
|
||||
while (topCandidates.size() > k) {
|
||||
topCandidates.pop();
|
||||
}
|
||||
while (!topCandidates.empty()) {
|
||||
std::pair<dist_t, idx_t> rez = topCandidates.top();
|
||||
label_t label;
|
||||
memcpy(&label, getExternalLabel(rez.second), sizeof(label));
|
||||
topResults.push(std::pair<dist_t, label_t>(rez.first, label));
|
||||
topCandidates.pop();
|
||||
}
|
||||
|
||||
return topResults;
|
||||
};
|
||||
|
||||
dist_t fstdistfunc_scalar(const coord_t *x, const coord_t *y, size_t n)
|
||||
{
|
||||
dist_t distance = 0.0;
|
||||
|
||||
for (size_t i = 0; i < n; i++)
|
||||
{
|
||||
dist_t diff = x[i] - y[i];
|
||||
distance += diff * diff;
|
||||
}
|
||||
return distance;
|
||||
|
||||
}
|
||||
|
||||
#ifdef __x86_64__
|
||||
#include <immintrin.h>
|
||||
|
||||
__attribute__((target("avx2")))
|
||||
dist_t fstdistfunc_avx2(const coord_t *x, const coord_t *y, size_t n)
|
||||
{
|
||||
const size_t TmpResSz = sizeof(__m256) / sizeof(float);
|
||||
float PORTABLE_ALIGN32 TmpRes[TmpResSz];
|
||||
size_t qty16 = n / 16;
|
||||
const float *pEnd1 = x + (qty16 * 16);
|
||||
__m256 diff, v1, v2;
|
||||
__m256 sum = _mm256_set1_ps(0);
|
||||
|
||||
while (x < pEnd1) {
|
||||
v1 = _mm256_loadu_ps(x);
|
||||
x += 8;
|
||||
v2 = _mm256_loadu_ps(y);
|
||||
y += 8;
|
||||
diff = _mm256_sub_ps(v1, v2);
|
||||
sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));
|
||||
|
||||
v1 = _mm256_loadu_ps(x);
|
||||
x += 8;
|
||||
v2 = _mm256_loadu_ps(y);
|
||||
y += 8;
|
||||
diff = _mm256_sub_ps(v1, v2);
|
||||
sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));
|
||||
}
|
||||
_mm256_store_ps(TmpRes, sum);
|
||||
float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7];
|
||||
return (res);
|
||||
}
|
||||
|
||||
dist_t fstdistfunc_sse(const coord_t *x, const coord_t *y, size_t n)
|
||||
{
|
||||
const size_t TmpResSz = sizeof(__m128) / sizeof(float);
|
||||
float PORTABLE_ALIGN32 TmpRes[TmpResSz];
|
||||
size_t qty16 = n / 16;
|
||||
const float *pEnd1 = x + (qty16 * 16);
|
||||
|
||||
__m128 diff, v1, v2;
|
||||
__m128 sum = _mm_set1_ps(0);
|
||||
|
||||
while (x < pEnd1) {
|
||||
v1 = _mm_loadu_ps(x);
|
||||
x += 4;
|
||||
v2 = _mm_loadu_ps(y);
|
||||
y += 4;
|
||||
diff = _mm_sub_ps(v1, v2);
|
||||
sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
|
||||
|
||||
v1 = _mm_loadu_ps(x);
|
||||
x += 4;
|
||||
v2 = _mm_loadu_ps(y);
|
||||
y += 4;
|
||||
diff = _mm_sub_ps(v1, v2);
|
||||
sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
|
||||
|
||||
v1 = _mm_loadu_ps(x);
|
||||
x += 4;
|
||||
v2 = _mm_loadu_ps(y);
|
||||
y += 4;
|
||||
diff = _mm_sub_ps(v1, v2);
|
||||
sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
|
||||
|
||||
v1 = _mm_loadu_ps(x);
|
||||
x += 4;
|
||||
v2 = _mm_loadu_ps(y);
|
||||
y += 4;
|
||||
diff = _mm_sub_ps(v1, v2);
|
||||
sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
|
||||
}
|
||||
_mm_store_ps(TmpRes, sum);
|
||||
float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
|
||||
return res;
|
||||
}
|
||||
#endif
|
||||
|
||||
dist_t HierarchicalNSW::fstdistfunc(const coord_t *x, const coord_t *y)
|
||||
{
|
||||
#ifndef __x86_64__
|
||||
return fstdistfunc_scalar(x, y, dim);
|
||||
#else
|
||||
if(use_avx2)
|
||||
return fstdistfunc_avx2(x, y, dim);
|
||||
|
||||
return fstdistfunc_sse(x, y, dim);
|
||||
#endif
|
||||
}
|
||||
|
||||
bool hnsw_search(HierarchicalNSW* hnsw, const coord_t *point, size_t efSearch, size_t* n_results, label_t** results)
|
||||
{
|
||||
try
|
||||
{
|
||||
auto result = hnsw->searchKnn(point, efSearch);
|
||||
size_t nResults = result.size();
|
||||
*results = (label_t*)malloc(nResults*sizeof(label_t));
|
||||
for (size_t i = nResults; i-- != 0;)
|
||||
{
|
||||
(*results)[i] = result.top().second;
|
||||
result.pop();
|
||||
}
|
||||
*n_results = nResults;
|
||||
return true;
|
||||
}
|
||||
catch (std::exception& x)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
bool hnsw_add_point(HierarchicalNSW* hnsw, const coord_t *point, label_t label)
|
||||
{
|
||||
try
|
||||
{
|
||||
hnsw->addPoint(point, label);
|
||||
return true;
|
||||
}
|
||||
catch (std::exception& x)
|
||||
{
|
||||
fprintf(stderr, "Catch %s\n", x.what());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
void hnsw_init(HierarchicalNSW* hnsw, size_t dims, size_t maxelements, size_t M, size_t maxM, size_t efConstruction)
|
||||
{
|
||||
new ((void*)hnsw) HierarchicalNSW(dims, maxelements, M, maxM, efConstruction);
|
||||
}
|
||||
|
||||
|
||||
int hnsw_dimensions(HierarchicalNSW* hnsw)
|
||||
{
|
||||
return (int)hnsw->dim;
|
||||
}
|
||||
|
||||
size_t hnsw_count(HierarchicalNSW* hnsw)
|
||||
{
|
||||
return hnsw->cur_element_count;
|
||||
}
|
||||
|
||||
size_t hnsw_sizeof(void)
|
||||
{
|
||||
return sizeof(HierarchicalNSW);
|
||||
}
|
||||
69
pgxn/hnsw/hnswalg.h
Normal file
69
pgxn/hnsw/hnswalg.h
Normal file
@@ -0,0 +1,69 @@
|
||||
#pragma once
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#include <map>
|
||||
#include <cmath>
|
||||
#include <queue>
|
||||
#include <stdexcept>
|
||||
|
||||
extern "C" {
|
||||
#include "hnsw.h"
|
||||
}
|
||||
|
||||
struct HierarchicalNSW
|
||||
{
|
||||
size_t maxelements;
|
||||
size_t cur_element_count;
|
||||
|
||||
idx_t enterpoint_node;
|
||||
|
||||
size_t dim;
|
||||
size_t data_size;
|
||||
size_t offset_data;
|
||||
size_t offset_label;
|
||||
size_t size_data_per_element;
|
||||
size_t M;
|
||||
size_t maxM;
|
||||
size_t size_links_level0;
|
||||
size_t efConstruction;
|
||||
|
||||
#ifdef __x86_64__
|
||||
bool use_avx2;
|
||||
#endif
|
||||
|
||||
char data_level0_memory[0]; // varying size
|
||||
|
||||
public:
|
||||
HierarchicalNSW(size_t dim, size_t maxelements, size_t M, size_t maxM, size_t efConstruction);
|
||||
~HierarchicalNSW();
|
||||
|
||||
|
||||
inline coord_t *getDataByInternalId(idx_t internal_id) const {
|
||||
return (coord_t *)&data_level0_memory[internal_id * size_data_per_element + offset_data];
|
||||
}
|
||||
|
||||
inline idx_t *get_linklist0(idx_t internal_id) const {
|
||||
return (idx_t*)&data_level0_memory[internal_id * size_data_per_element];
|
||||
}
|
||||
|
||||
inline label_t *getExternalLabel(idx_t internal_id) const {
|
||||
return (label_t *)&data_level0_memory[internal_id * size_data_per_element + offset_label];
|
||||
}
|
||||
|
||||
std::priority_queue<std::pair<dist_t, idx_t>> searchBaseLayer(const coord_t *x, size_t ef);
|
||||
|
||||
void getNeighborsByHeuristic(std::priority_queue<std::pair<dist_t, idx_t>> &topResults, size_t NN);
|
||||
|
||||
void mutuallyConnectNewElement(const coord_t *x, idx_t id, std::priority_queue<std::pair<dist_t, idx_t>> topResults);
|
||||
|
||||
void addPoint(const coord_t *point, label_t label);
|
||||
|
||||
std::priority_queue<std::pair<dist_t, label_t>> searchKnn(const coord_t *query_data, size_t k);
|
||||
|
||||
dist_t fstdistfunc(const coord_t *x, const coord_t *y);
|
||||
};
|
||||
28
pgxn/hnsw/test/expected/knn.out
Normal file
28
pgxn/hnsw/test/expected/knn.out
Normal file
@@ -0,0 +1,28 @@
|
||||
SET enable_seqscan = off;
|
||||
CREATE TABLE t (val real[]);
|
||||
INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL);
|
||||
CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3);
|
||||
INSERT INTO t (val) VALUES (array[1,2,4]);
|
||||
explain SELECT * FROM t ORDER BY val <-> array[3,3,3];
|
||||
QUERY PLAN
|
||||
--------------------------------------------------------------------
|
||||
Index Scan using t_val_idx on t (cost=4.02..8.06 rows=3 width=36)
|
||||
Order By: (val <-> '{3,3,3}'::real[])
|
||||
(2 rows)
|
||||
|
||||
SELECT * FROM t ORDER BY val <-> array[3,3,3];
|
||||
val
|
||||
---------
|
||||
{1,2,3}
|
||||
{1,2,4}
|
||||
{1,1,1}
|
||||
{0,0,0}
|
||||
(4 rows)
|
||||
|
||||
SELECT COUNT(*) FROM t;
|
||||
count
|
||||
-------
|
||||
5
|
||||
(1 row)
|
||||
|
||||
DROP TABLE t;
|
||||
13
pgxn/hnsw/test/sql/knn.sql
Normal file
13
pgxn/hnsw/test/sql/knn.sql
Normal file
@@ -0,0 +1,13 @@
|
||||
SET enable_seqscan = off;
|
||||
|
||||
CREATE TABLE t (val real[]);
|
||||
INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL);
|
||||
CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3);
|
||||
|
||||
INSERT INTO t (val) VALUES (array[1,2,4]);
|
||||
|
||||
explain SELECT * FROM t ORDER BY val <-> array[3,3,3];
|
||||
SELECT * FROM t ORDER BY val <-> array[3,3,3];
|
||||
SELECT COUNT(*) FROM t;
|
||||
|
||||
DROP TABLE t;
|
||||
@@ -556,9 +556,6 @@ pageserver_connect(shardno_t shard_no, int elevel)
|
||||
|
||||
switch (neon_protocol_version)
|
||||
{
|
||||
case 3:
|
||||
pagestream_query = psprintf("pagestream_v3 %s %s", neon_tenant, neon_timeline);
|
||||
break;
|
||||
case 2:
|
||||
pagestream_query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline);
|
||||
break;
|
||||
@@ -1138,9 +1135,9 @@ pg_init_libpagestore(void)
|
||||
"Version of compute<->page server protocol",
|
||||
NULL,
|
||||
&neon_protocol_version,
|
||||
2, /* use protocol version 2 */
|
||||
2, /* min */
|
||||
3, /* max */
|
||||
2, /* use protocol version 2 */
|
||||
2, /* min */
|
||||
2, /* max */
|
||||
PGC_SU_BACKEND,
|
||||
0, /* no flags required */
|
||||
NULL, NULL, NULL);
|
||||
|
||||
@@ -44,15 +44,10 @@ typedef enum
|
||||
T_NeonGetSlruSegmentResponse,
|
||||
} NeonMessageTag;
|
||||
|
||||
typedef uint64 NeonRequestId;
|
||||
|
||||
/* base struct for c-style inheritance */
|
||||
typedef struct
|
||||
{
|
||||
NeonMessageTag tag;
|
||||
NeonRequestId reqid;
|
||||
XLogRecPtr lsn;
|
||||
XLogRecPtr not_modified_since;
|
||||
} NeonMessage;
|
||||
|
||||
#define messageTag(m) (((const NeonMessage *)(m))->tag)
|
||||
@@ -72,7 +67,6 @@ typedef enum {
|
||||
SLRU_MULTIXACT_OFFSETS
|
||||
} SlruKind;
|
||||
|
||||
|
||||
/*--
|
||||
* supertype of all the Neon*Request structs below.
|
||||
*
|
||||
@@ -93,37 +87,37 @@ typedef enum {
|
||||
*
|
||||
* These structs describe the V2 of these requests. (The old now-defunct V1
|
||||
* protocol contained just one LSN and a boolean 'latest' flag.)
|
||||
*
|
||||
* V3 version of protocol adds request ID to all requests. This request ID is also included in response
|
||||
* as well as other fields from requests, which allows to verify that we receive response for our request.
|
||||
* We copy fields from request to response to make checking more reliable: request ID is formed from process ID
|
||||
* and local counter, so in principle there can be duplicated requests IDs if process PID is reused.
|
||||
*/
|
||||
typedef NeonMessage NeonRequest;
|
||||
typedef struct
|
||||
{
|
||||
NeonMessageTag tag;
|
||||
XLogRecPtr lsn;
|
||||
XLogRecPtr not_modified_since;
|
||||
} NeonRequest;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
NeonRequest hdr;
|
||||
NeonRequest req;
|
||||
NRelFileInfo rinfo;
|
||||
ForkNumber forknum;
|
||||
} NeonExistsRequest;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
NeonRequest hdr;
|
||||
NeonRequest req;
|
||||
NRelFileInfo rinfo;
|
||||
ForkNumber forknum;
|
||||
} NeonNblocksRequest;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
NeonRequest hdr;
|
||||
NeonRequest req;
|
||||
Oid dbNode;
|
||||
} NeonDbSizeRequest;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
NeonRequest hdr;
|
||||
NeonRequest req;
|
||||
NRelFileInfo rinfo;
|
||||
ForkNumber forknum;
|
||||
BlockNumber blkno;
|
||||
@@ -131,29 +125,32 @@ typedef struct
|
||||
|
||||
typedef struct
|
||||
{
|
||||
NeonRequest hdr;
|
||||
SlruKind kind;
|
||||
int segno;
|
||||
NeonRequest req;
|
||||
SlruKind kind;
|
||||
int segno;
|
||||
} NeonGetSlruSegmentRequest;
|
||||
|
||||
/* supertype of all the Neon*Response structs below */
|
||||
typedef NeonMessage NeonResponse;
|
||||
typedef struct
|
||||
{
|
||||
NeonMessageTag tag;
|
||||
} NeonResponse;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
NeonExistsRequest req;
|
||||
NeonMessageTag tag;
|
||||
bool exists;
|
||||
} NeonExistsResponse;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
NeonNblocksRequest req;
|
||||
NeonMessageTag tag;
|
||||
uint32 n_blocks;
|
||||
} NeonNblocksResponse;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
NeonGetPageRequest req;
|
||||
NeonMessageTag tag;
|
||||
char page[FLEXIBLE_ARRAY_MEMBER];
|
||||
} NeonGetPageResponse;
|
||||
|
||||
@@ -161,21 +158,21 @@ typedef struct
|
||||
|
||||
typedef struct
|
||||
{
|
||||
NeonDbSizeRequest req;
|
||||
NeonMessageTag tag;
|
||||
int64 db_size;
|
||||
} NeonDbSizeResponse;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
NeonResponse req;
|
||||
NeonMessageTag tag;
|
||||
char message[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated error
|
||||
* message */
|
||||
} NeonErrorResponse;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
NeonGetSlruSegmentRequest req;
|
||||
int n_blocks;
|
||||
NeonMessageTag tag;
|
||||
int n_blocks;
|
||||
char data[BLCKSZ * SLRU_PAGES_PER_SEGMENT];
|
||||
} NeonGetSlruSegmentResponse;
|
||||
|
||||
|
||||
@@ -120,9 +120,6 @@ static bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block
|
||||
|
||||
static BlockNumber neon_nblocks(SMgrRelation reln, ForkNumber forknum);
|
||||
|
||||
static uint32 local_request_counter;
|
||||
#define GENERATE_REQUEST_ID() (((NeonRequestId)MyProcPid << 32) | ++local_request_counter)
|
||||
|
||||
/*
|
||||
* Prefetch implementation:
|
||||
*
|
||||
@@ -191,11 +188,15 @@ typedef struct PrefetchRequest
|
||||
uint8 status; /* see PrefetchStatus for valid values */
|
||||
uint8 flags; /* see PrefetchRequestFlags */
|
||||
neon_request_lsns request_lsns;
|
||||
NeonRequestId reqid;
|
||||
NeonResponse *response; /* may be null */
|
||||
uint64 my_ring_index;
|
||||
} PrefetchRequest;
|
||||
|
||||
StaticAssertDecl(sizeof(PrefetchRequest) == 64,
|
||||
"We prefer to have a power-of-2 size for this struct. Please"
|
||||
" try to find an alternative solution before reaching to"
|
||||
" increase the expected size here");
|
||||
|
||||
/* prefetch buffer lookup hash table */
|
||||
|
||||
typedef struct PrfHashEntry
|
||||
@@ -364,7 +365,6 @@ compact_prefetch_buffers(void)
|
||||
target_slot->shard_no = source_slot->shard_no;
|
||||
target_slot->status = source_slot->status;
|
||||
target_slot->response = source_slot->response;
|
||||
target_slot->reqid = source_slot->reqid;
|
||||
target_slot->request_lsns = source_slot->request_lsns;
|
||||
target_slot->my_ring_index = empty_ring_index;
|
||||
|
||||
@@ -798,8 +798,7 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
|
||||
uint64 mySlotNo PG_USED_FOR_ASSERTS_ONLY = slot->my_ring_index;
|
||||
|
||||
NeonGetPageRequest request = {
|
||||
.hdr.tag = T_NeonGetPageRequest,
|
||||
.hdr.reqid = GENERATE_REQUEST_ID(),
|
||||
.req.tag = T_NeonGetPageRequest,
|
||||
/* lsn and not_modified_since are filled in below */
|
||||
.rinfo = BufTagGetNRelFileInfo(slot->buftag),
|
||||
.forknum = slot->buftag.forkNum,
|
||||
@@ -808,16 +807,14 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
|
||||
|
||||
Assert(mySlotNo == MyPState->ring_unused);
|
||||
|
||||
slot->reqid = request.hdr.reqid;
|
||||
|
||||
if (force_request_lsns)
|
||||
slot->request_lsns = *force_request_lsns;
|
||||
else
|
||||
neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag),
|
||||
slot->buftag.forkNum, slot->buftag.blockNum,
|
||||
&slot->request_lsns, 1, NULL);
|
||||
request.hdr.lsn = slot->request_lsns.request_lsn;
|
||||
request.hdr.not_modified_since = slot->request_lsns.not_modified_since;
|
||||
request.req.lsn = slot->request_lsns.request_lsn;
|
||||
request.req.not_modified_since = slot->request_lsns.not_modified_since;
|
||||
|
||||
Assert(slot->response == NULL);
|
||||
Assert(slot->my_ring_index == MyPState->ring_unused);
|
||||
@@ -1105,12 +1102,6 @@ Retry:
|
||||
return min_ring_index;
|
||||
}
|
||||
|
||||
static bool
|
||||
equal_requests(NeonRequest* a, NeonRequest* b)
|
||||
{
|
||||
return a->reqid == b->reqid && a->lsn == b->lsn && a->not_modified_since == b->not_modified_since;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Note: this function can get canceled and use a long jump to the next catch
|
||||
@@ -1193,10 +1184,6 @@ nm_pack_request(NeonRequest *msg)
|
||||
initStringInfo(&s);
|
||||
|
||||
pq_sendbyte(&s, msg->tag);
|
||||
if (neon_protocol_version >= 3)
|
||||
{
|
||||
pq_sendint64(&s, msg->reqid);
|
||||
}
|
||||
pq_sendint64(&s, msg->lsn);
|
||||
pq_sendint64(&s, msg->not_modified_since);
|
||||
|
||||
@@ -1274,16 +1261,8 @@ NeonResponse *
|
||||
nm_unpack_response(StringInfo s)
|
||||
{
|
||||
NeonMessageTag tag = pq_getmsgbyte(s);
|
||||
NeonResponse resp_hdr = {0}; /* make valgrind happy */
|
||||
NeonResponse *resp = NULL;
|
||||
|
||||
resp_hdr.tag = tag;
|
||||
if (neon_protocol_version >= 3)
|
||||
{
|
||||
resp_hdr.reqid = pq_getmsgint64(s);
|
||||
resp_hdr.lsn = pq_getmsgint64(s);
|
||||
resp_hdr.not_modified_since = pq_getmsgint64(s);
|
||||
}
|
||||
switch (tag)
|
||||
{
|
||||
/* pagestore -> pagestore_client */
|
||||
@@ -1291,14 +1270,7 @@ nm_unpack_response(StringInfo s)
|
||||
{
|
||||
NeonExistsResponse *msg_resp = palloc0(sizeof(NeonExistsResponse));
|
||||
|
||||
if (neon_protocol_version >= 3)
|
||||
{
|
||||
NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
|
||||
NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
|
||||
NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
|
||||
msg_resp->req.forknum = pq_getmsgbyte(s);
|
||||
}
|
||||
msg_resp->req.hdr = resp_hdr;
|
||||
msg_resp->tag = tag;
|
||||
msg_resp->exists = pq_getmsgbyte(s);
|
||||
pq_getmsgend(s);
|
||||
|
||||
@@ -1310,14 +1282,7 @@ nm_unpack_response(StringInfo s)
|
||||
{
|
||||
NeonNblocksResponse *msg_resp = palloc0(sizeof(NeonNblocksResponse));
|
||||
|
||||
if (neon_protocol_version >= 3)
|
||||
{
|
||||
NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
|
||||
NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
|
||||
NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
|
||||
msg_resp->req.forknum = pq_getmsgbyte(s);
|
||||
}
|
||||
msg_resp->req.hdr = resp_hdr;
|
||||
msg_resp->tag = tag;
|
||||
msg_resp->n_blocks = pq_getmsgint(s, 4);
|
||||
pq_getmsgend(s);
|
||||
|
||||
@@ -1330,20 +1295,12 @@ nm_unpack_response(StringInfo s)
|
||||
NeonGetPageResponse *msg_resp;
|
||||
|
||||
msg_resp = MemoryContextAllocZero(MyPState->bufctx, PS_GETPAGERESPONSE_SIZE);
|
||||
if (neon_protocol_version >= 3)
|
||||
{
|
||||
NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
|
||||
NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
|
||||
NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
|
||||
msg_resp->req.forknum = pq_getmsgbyte(s);
|
||||
msg_resp->req.blkno = pq_getmsgint(s, 4);
|
||||
}
|
||||
msg_resp->req.hdr = resp_hdr;
|
||||
msg_resp->tag = tag;
|
||||
/* XXX: should be varlena */
|
||||
memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ);
|
||||
pq_getmsgend(s);
|
||||
|
||||
Assert(msg_resp->req.hdr.tag == T_NeonGetPageResponse);
|
||||
Assert(msg_resp->tag == T_NeonGetPageResponse);
|
||||
|
||||
resp = (NeonResponse *) msg_resp;
|
||||
break;
|
||||
@@ -1353,11 +1310,7 @@ nm_unpack_response(StringInfo s)
|
||||
{
|
||||
NeonDbSizeResponse *msg_resp = palloc0(sizeof(NeonDbSizeResponse));
|
||||
|
||||
if (neon_protocol_version >= 3)
|
||||
{
|
||||
msg_resp->req.dbNode = pq_getmsgint(s, 4);
|
||||
}
|
||||
msg_resp->req.hdr = resp_hdr;
|
||||
msg_resp->tag = tag;
|
||||
msg_resp->db_size = pq_getmsgint64(s);
|
||||
pq_getmsgend(s);
|
||||
|
||||
@@ -1375,7 +1328,7 @@ nm_unpack_response(StringInfo s)
|
||||
msglen = strlen(msgtext);
|
||||
|
||||
msg_resp = palloc0(sizeof(NeonErrorResponse) + msglen + 1);
|
||||
msg_resp->req = resp_hdr;
|
||||
msg_resp->tag = tag;
|
||||
memcpy(msg_resp->message, msgtext, msglen + 1);
|
||||
pq_getmsgend(s);
|
||||
|
||||
@@ -1386,17 +1339,9 @@ nm_unpack_response(StringInfo s)
|
||||
case T_NeonGetSlruSegmentResponse:
|
||||
{
|
||||
NeonGetSlruSegmentResponse *msg_resp;
|
||||
int n_blocks;
|
||||
msg_resp = palloc0(sizeof(NeonGetSlruSegmentResponse));
|
||||
|
||||
if (neon_protocol_version >= 3)
|
||||
{
|
||||
msg_resp->req.kind = pq_getmsgbyte(s);
|
||||
msg_resp->req.segno = pq_getmsgint(s, 4);
|
||||
}
|
||||
msg_resp->req.hdr = resp_hdr;
|
||||
|
||||
n_blocks = pq_getmsgint(s, 4);
|
||||
int n_blocks = pq_getmsgint(s, 4);
|
||||
msg_resp = palloc(sizeof(NeonGetSlruSegmentResponse));
|
||||
msg_resp->tag = tag;
|
||||
msg_resp->n_blocks = n_blocks;
|
||||
memcpy(msg_resp->data, pq_getmsgbytes(s, n_blocks * BLCKSZ), n_blocks * BLCKSZ);
|
||||
pq_getmsgend(s);
|
||||
@@ -1441,8 +1386,8 @@ nm_to_string(NeonMessage *msg)
|
||||
appendStringInfoString(&s, "{\"type\": \"NeonExistsRequest\"");
|
||||
appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo));
|
||||
appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
|
||||
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn));
|
||||
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since));
|
||||
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
|
||||
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since));
|
||||
appendStringInfoChar(&s, '}');
|
||||
break;
|
||||
}
|
||||
@@ -1454,8 +1399,8 @@ nm_to_string(NeonMessage *msg)
|
||||
appendStringInfoString(&s, "{\"type\": \"NeonNblocksRequest\"");
|
||||
appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo));
|
||||
appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
|
||||
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn));
|
||||
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since));
|
||||
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
|
||||
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since));
|
||||
appendStringInfoChar(&s, '}');
|
||||
break;
|
||||
}
|
||||
@@ -1468,8 +1413,8 @@ nm_to_string(NeonMessage *msg)
|
||||
appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo));
|
||||
appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
|
||||
appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno);
|
||||
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn));
|
||||
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since));
|
||||
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
|
||||
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since));
|
||||
appendStringInfoChar(&s, '}');
|
||||
break;
|
||||
}
|
||||
@@ -1479,8 +1424,8 @@ nm_to_string(NeonMessage *msg)
|
||||
|
||||
appendStringInfoString(&s, "{\"type\": \"NeonDbSizeRequest\"");
|
||||
appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode);
|
||||
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn));
|
||||
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since));
|
||||
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
|
||||
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since));
|
||||
appendStringInfoChar(&s, '}');
|
||||
break;
|
||||
}
|
||||
@@ -1491,8 +1436,8 @@ nm_to_string(NeonMessage *msg)
|
||||
appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentRequest\"");
|
||||
appendStringInfo(&s, ", \"kind\": %u", msg_req->kind);
|
||||
appendStringInfo(&s, ", \"segno\": %u", msg_req->segno);
|
||||
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn));
|
||||
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since));
|
||||
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
|
||||
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since));
|
||||
appendStringInfoChar(&s, '}');
|
||||
break;
|
||||
}
|
||||
@@ -2367,64 +2312,39 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
|
||||
REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1, NULL);
|
||||
{
|
||||
NeonExistsRequest request = {
|
||||
.hdr.tag = T_NeonExistsRequest,
|
||||
.hdr.reqid = GENERATE_REQUEST_ID(),
|
||||
.hdr.lsn = request_lsns.request_lsn,
|
||||
.hdr.not_modified_since = request_lsns.not_modified_since,
|
||||
.req.tag = T_NeonExistsRequest,
|
||||
.req.lsn = request_lsns.request_lsn,
|
||||
.req.not_modified_since = request_lsns.not_modified_since,
|
||||
.rinfo = InfoFromSMgrRel(reln),
|
||||
.forknum = forkNum
|
||||
};
|
||||
|
||||
resp = page_server_request(&request);
|
||||
|
||||
switch (resp->tag)
|
||||
{
|
||||
case T_NeonExistsResponse:
|
||||
{
|
||||
NeonExistsResponse* exists_resp = (NeonExistsResponse *) resp;
|
||||
if (neon_protocol_version >= 3)
|
||||
{
|
||||
if (!equal_requests(resp, &request.hdr) ||
|
||||
!RelFileInfoEquals(exists_resp->req.rinfo, request.rinfo) ||
|
||||
exists_resp->req.forknum != request.forknum)
|
||||
{
|
||||
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
|
||||
"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to exits request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}",
|
||||
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(exists_resp->req.rinfo), exists_resp->req.forknum,
|
||||
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), request.forknum);
|
||||
}
|
||||
}
|
||||
exists = exists_resp->exists;
|
||||
break;
|
||||
}
|
||||
case T_NeonErrorResponse:
|
||||
if (neon_protocol_version >= 3)
|
||||
{
|
||||
if (!equal_requests(resp, &request.hdr))
|
||||
{
|
||||
elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
|
||||
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
|
||||
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
|
||||
}
|
||||
}
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_IO_ERROR),
|
||||
errmsg(NEON_TAG "[reqid %lx] could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
|
||||
resp->reqid,
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forkNum,
|
||||
LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
|
||||
errdetail("page server returned error: %s",
|
||||
((NeonErrorResponse *) resp)->message)));
|
||||
break;
|
||||
|
||||
default:
|
||||
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
|
||||
"Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x",
|
||||
T_NeonExistsResponse, T_NeonErrorResponse, resp->tag);
|
||||
}
|
||||
pfree(resp);
|
||||
}
|
||||
|
||||
switch (resp->tag)
|
||||
{
|
||||
case T_NeonExistsResponse:
|
||||
exists = ((NeonExistsResponse *) resp)->exists;
|
||||
break;
|
||||
|
||||
case T_NeonErrorResponse:
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_IO_ERROR),
|
||||
errmsg(NEON_TAG "could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forkNum,
|
||||
LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
|
||||
errdetail("page server returned error: %s",
|
||||
((NeonErrorResponse *) resp)->message)));
|
||||
break;
|
||||
|
||||
default:
|
||||
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
|
||||
"Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x",
|
||||
T_NeonExistsResponse, T_NeonErrorResponse, resp->tag);
|
||||
}
|
||||
pfree(resp);
|
||||
return exists;
|
||||
}
|
||||
|
||||
@@ -3032,43 +2952,15 @@ Retry:
|
||||
switch (resp->tag)
|
||||
{
|
||||
case T_NeonGetPageResponse:
|
||||
{
|
||||
NeonGetPageResponse* getpage_resp = (NeonGetPageResponse *) resp;
|
||||
if (neon_protocol_version >= 3)
|
||||
{
|
||||
if (resp->reqid != slot->reqid ||
|
||||
resp->lsn != slot->request_lsns.request_lsn ||
|
||||
resp->not_modified_since != slot->request_lsns.not_modified_since ||
|
||||
!RelFileInfoEquals(getpage_resp->req.rinfo, rinfo) ||
|
||||
getpage_resp->req.forknum != forkNum ||
|
||||
getpage_resp->req.blkno != base_blockno + i)
|
||||
{
|
||||
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
|
||||
"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u} to get page request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u}",
|
||||
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(getpage_resp->req.rinfo), getpage_resp->req.forknum, getpage_resp->req.blkno,
|
||||
slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since), RelFileInfoFmt(rinfo), forkNum, base_blockno + i);
|
||||
}
|
||||
}
|
||||
memcpy(buffer, getpage_resp->page, BLCKSZ);
|
||||
memcpy(buffer, ((NeonGetPageResponse *) resp)->page, BLCKSZ);
|
||||
lfc_write(rinfo, forkNum, blockno, buffer);
|
||||
break;
|
||||
}
|
||||
|
||||
case T_NeonErrorResponse:
|
||||
if (neon_protocol_version >= 3)
|
||||
{
|
||||
if (resp->reqid != slot->reqid ||
|
||||
resp->lsn != slot->request_lsns.request_lsn ||
|
||||
resp->not_modified_since != slot->request_lsns.not_modified_since)
|
||||
{
|
||||
elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
|
||||
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
|
||||
slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since));
|
||||
}
|
||||
}
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_IO_ERROR),
|
||||
errmsg(NEON_TAG "[shard %d, reqid %lx] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
|
||||
slot->shard_no, resp->reqid, blockno, RelFileInfoFmt(rinfo),
|
||||
errmsg(NEON_TAG "[shard %d] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
|
||||
slot->shard_no, blockno, RelFileInfoFmt(rinfo),
|
||||
forkNum, LSN_FORMAT_ARGS(reqlsns->effective_request_lsn)),
|
||||
errdetail("page server returned error: %s",
|
||||
((NeonErrorResponse *) resp)->message)));
|
||||
@@ -3551,72 +3443,47 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
|
||||
|
||||
{
|
||||
NeonNblocksRequest request = {
|
||||
.hdr.tag = T_NeonNblocksRequest,
|
||||
.hdr.reqid = GENERATE_REQUEST_ID(),
|
||||
.hdr.lsn = request_lsns.request_lsn,
|
||||
.hdr.not_modified_since = request_lsns.not_modified_since,
|
||||
.req.tag = T_NeonNblocksRequest,
|
||||
.req.lsn = request_lsns.request_lsn,
|
||||
.req.not_modified_since = request_lsns.not_modified_since,
|
||||
.rinfo = InfoFromSMgrRel(reln),
|
||||
.forknum = forknum,
|
||||
};
|
||||
|
||||
resp = page_server_request(&request);
|
||||
|
||||
switch (resp->tag)
|
||||
{
|
||||
case T_NeonNblocksResponse:
|
||||
{
|
||||
NeonNblocksResponse * relsize_resp = (NeonNblocksResponse *) resp;
|
||||
if (neon_protocol_version >= 3)
|
||||
{
|
||||
if (!equal_requests(resp, &request.hdr) ||
|
||||
!RelFileInfoEquals(relsize_resp->req.rinfo, request.rinfo) ||
|
||||
relsize_resp->req.forknum != forknum)
|
||||
{
|
||||
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
|
||||
"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}",
|
||||
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(relsize_resp->req.rinfo), relsize_resp->req.forknum,
|
||||
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), forknum);
|
||||
}
|
||||
}
|
||||
n_blocks = relsize_resp->n_blocks;
|
||||
break;
|
||||
}
|
||||
case T_NeonErrorResponse:
|
||||
if (neon_protocol_version >= 3)
|
||||
{
|
||||
if (!equal_requests(resp, &request.hdr))
|
||||
{
|
||||
elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
|
||||
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
|
||||
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
|
||||
}
|
||||
}
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_IO_ERROR),
|
||||
errmsg(NEON_TAG "[reqid %lx] could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
|
||||
resp->reqid,
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forknum,
|
||||
LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
|
||||
errdetail("page server returned error: %s",
|
||||
((NeonErrorResponse *) resp)->message)));
|
||||
break;
|
||||
|
||||
default:
|
||||
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
|
||||
"Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x",
|
||||
T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag);
|
||||
}
|
||||
update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
|
||||
|
||||
neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forknum,
|
||||
LSN_FORMAT_ARGS(request_lsns.effective_request_lsn),
|
||||
n_blocks);
|
||||
|
||||
pfree(resp);
|
||||
}
|
||||
|
||||
switch (resp->tag)
|
||||
{
|
||||
case T_NeonNblocksResponse:
|
||||
n_blocks = ((NeonNblocksResponse *) resp)->n_blocks;
|
||||
break;
|
||||
|
||||
case T_NeonErrorResponse:
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_IO_ERROR),
|
||||
errmsg(NEON_TAG "could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forknum,
|
||||
LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
|
||||
errdetail("page server returned error: %s",
|
||||
((NeonErrorResponse *) resp)->message)));
|
||||
break;
|
||||
|
||||
default:
|
||||
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
|
||||
"Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x",
|
||||
T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag);
|
||||
}
|
||||
update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
|
||||
|
||||
neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)),
|
||||
forknum,
|
||||
LSN_FORMAT_ARGS(request_lsns.effective_request_lsn),
|
||||
n_blocks);
|
||||
|
||||
pfree(resp);
|
||||
return n_blocks;
|
||||
}
|
||||
|
||||
@@ -3636,64 +3503,40 @@ neon_dbsize(Oid dbNode)
|
||||
|
||||
{
|
||||
NeonDbSizeRequest request = {
|
||||
.hdr.tag = T_NeonDbSizeRequest,
|
||||
.hdr.reqid = GENERATE_REQUEST_ID(),
|
||||
.hdr.lsn = request_lsns.request_lsn,
|
||||
.hdr.not_modified_since = request_lsns.not_modified_since,
|
||||
.req.tag = T_NeonDbSizeRequest,
|
||||
.req.lsn = request_lsns.request_lsn,
|
||||
.req.not_modified_since = request_lsns.not_modified_since,
|
||||
.dbNode = dbNode,
|
||||
};
|
||||
|
||||
resp = page_server_request(&request);
|
||||
|
||||
switch (resp->tag)
|
||||
{
|
||||
case T_NeonDbSizeResponse:
|
||||
{
|
||||
NeonDbSizeResponse* dbsize_resp = (NeonDbSizeResponse *) resp;
|
||||
if (neon_protocol_version >= 3)
|
||||
{
|
||||
if (!equal_requests(resp, &request.hdr) ||
|
||||
dbsize_resp->req.dbNode != dbNode)
|
||||
{
|
||||
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
|
||||
"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u} to get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u}",
|
||||
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), dbsize_resp->req.dbNode,
|
||||
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), dbNode);
|
||||
}
|
||||
}
|
||||
db_size = dbsize_resp->db_size;
|
||||
break;
|
||||
}
|
||||
case T_NeonErrorResponse:
|
||||
if (neon_protocol_version >= 3)
|
||||
{
|
||||
if (!equal_requests(resp, &request.hdr))
|
||||
{
|
||||
elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
|
||||
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
|
||||
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
|
||||
}
|
||||
}
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_IO_ERROR),
|
||||
errmsg(NEON_TAG "[reqid %lx] could not read db size of db %u from page server at lsn %X/%08X",
|
||||
resp->reqid,
|
||||
dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
|
||||
errdetail("page server returned error: %s",
|
||||
((NeonErrorResponse *) resp)->message)));
|
||||
break;
|
||||
|
||||
default:
|
||||
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
|
||||
"Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x",
|
||||
T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag);
|
||||
}
|
||||
|
||||
neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
|
||||
dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size);
|
||||
|
||||
pfree(resp);
|
||||
}
|
||||
|
||||
switch (resp->tag)
|
||||
{
|
||||
case T_NeonDbSizeResponse:
|
||||
db_size = ((NeonDbSizeResponse *) resp)->db_size;
|
||||
break;
|
||||
|
||||
case T_NeonErrorResponse:
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_IO_ERROR),
|
||||
errmsg(NEON_TAG "could not read db size of db %u from page server at lsn %X/%08X",
|
||||
dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
|
||||
errdetail("page server returned error: %s",
|
||||
((NeonErrorResponse *) resp)->message)));
|
||||
break;
|
||||
|
||||
default:
|
||||
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
|
||||
"Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x",
|
||||
T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag);
|
||||
}
|
||||
|
||||
neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
|
||||
dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size);
|
||||
|
||||
pfree(resp);
|
||||
return db_size;
|
||||
}
|
||||
|
||||
@@ -4025,17 +3868,16 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
|
||||
return -1;
|
||||
|
||||
request = (NeonGetSlruSegmentRequest) {
|
||||
.hdr.tag = T_NeonGetSlruSegmentRequest,
|
||||
.hdr.reqid = GENERATE_REQUEST_ID(),
|
||||
.hdr.lsn = request_lsn,
|
||||
.hdr.not_modified_since = not_modified_since,
|
||||
.req.tag = T_NeonGetSlruSegmentRequest,
|
||||
.req.lsn = request_lsn,
|
||||
.req.not_modified_since = not_modified_since,
|
||||
.kind = kind,
|
||||
.segno = segno
|
||||
};
|
||||
|
||||
do
|
||||
{
|
||||
while (!page_server->send(shard_no, &request.hdr) || !page_server->flush(shard_no));
|
||||
while (!page_server->send(shard_no, &request.req) || !page_server->flush(shard_no));
|
||||
|
||||
consume_prefetch_responses();
|
||||
|
||||
@@ -4045,38 +3887,14 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
|
||||
switch (resp->tag)
|
||||
{
|
||||
case T_NeonGetSlruSegmentResponse:
|
||||
{
|
||||
NeonGetSlruSegmentResponse* slru_resp = (NeonGetSlruSegmentResponse *) resp;
|
||||
if (neon_protocol_version >= 3)
|
||||
{
|
||||
if (!equal_requests(resp, &request.hdr) ||
|
||||
slru_resp->req.kind != kind ||
|
||||
slru_resp->req.segno != segno)
|
||||
{
|
||||
NEON_PANIC_CONNECTION_STATE(-1, PANIC,
|
||||
"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u} to get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u}",
|
||||
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), slru_resp->req.kind, slru_resp->req.segno,
|
||||
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), kind, segno);
|
||||
}
|
||||
}
|
||||
n_blocks = slru_resp->n_blocks;
|
||||
memcpy(buffer, slru_resp->data, n_blocks*BLCKSZ);
|
||||
n_blocks = ((NeonGetSlruSegmentResponse *) resp)->n_blocks;
|
||||
memcpy(buffer, ((NeonGetSlruSegmentResponse *) resp)->data, n_blocks*BLCKSZ);
|
||||
break;
|
||||
}
|
||||
|
||||
case T_NeonErrorResponse:
|
||||
if (neon_protocol_version >= 3)
|
||||
{
|
||||
if (!equal_requests(resp, &request.hdr))
|
||||
{
|
||||
elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
|
||||
resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
|
||||
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
|
||||
}
|
||||
}
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_IO_ERROR),
|
||||
errmsg(NEON_TAG "[reqid %lx] could not read SLRU %d segment %d at lsn %X/%08X",
|
||||
resp->reqid,
|
||||
errmsg(NEON_TAG "could not read SLRU %d segment %d at lsn %X/%08X",
|
||||
kind,
|
||||
segno,
|
||||
LSN_FORMAT_ARGS(request_lsn)),
|
||||
@@ -4215,9 +4033,8 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
|
||||
NeonResponse *response;
|
||||
NeonNblocksResponse *nbresponse;
|
||||
NeonNblocksRequest request = {
|
||||
.hdr = (NeonRequest) {
|
||||
.req = (NeonRequest) {
|
||||
.tag = T_NeonNblocksRequest,
|
||||
.reqid = GENERATE_REQUEST_ID(),
|
||||
.lsn = end_recptr,
|
||||
.not_modified_since = end_recptr,
|
||||
},
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user