Mirror of https://github.com/neondatabase/neon.git
(synced 2026-01-21 12:22:56 +00:00)
Compare commits: rustls...release-45 (272 commits)
.config/nextest.toml (new file, 2 lines)
@@ -0,0 +1,2 @@
[profile.default]
slow-timeout = "1m"
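The new nextest profile above pairs with the CI switch from `cargo test` to `cargo nextest run` later in this diff. A minimal local sketch of the same flow (the filter expressions are copied from the workflow; the install step is an assumption about local setup, since CI bakes cargo-nextest into the build-tools image):

```bash
# Hypothetical local setup; CI installs cargo-nextest in Dockerfile.buildtools instead.
cargo install cargo-nextest

# Picks up .config/nextest.toml automatically; slow-timeout = "1m" flags tests
# that run longer than a minute as slow.
cargo nextest run

# Run only the real-S3 integration tests, mirroring the workflow's filter expressions.
cargo nextest run -E 'package(remote_storage)' -E 'test(test_real_s3)'
```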
.github/workflows/build_and_push_docker_image.yml (new file, 105 lines, vendored)
@@ -0,0 +1,105 @@
|
||||
name: Build and Push Docker Image
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
dockerfile-path:
|
||||
required: true
|
||||
type: string
|
||||
image-name:
|
||||
required: true
|
||||
type: string
|
||||
outputs:
|
||||
build-tools-tag:
|
||||
description: "tag generated for build tools"
|
||||
value: ${{ jobs.tag.outputs.build-tools-tag }}
|
||||
|
||||
jobs:
|
||||
check-if-build-tools-dockerfile-changed:
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
docker_file_changed: ${{ steps.dockerfile.outputs.docker_file_changed }}
|
||||
steps:
|
||||
- name: Check if Dockerfile.buildtools has changed
|
||||
id: dockerfile
|
||||
run: |
|
||||
if [[ "$GITHUB_EVENT_NAME" != "pull_request" ]]; then
|
||||
echo "docker_file_changed=false" >> $GITHUB_OUTPUT
|
||||
exit
|
||||
fi
|
||||
updated_files=$(gh pr --repo neondatabase/neon diff ${{ github.event.pull_request.number }} --name-only)
|
||||
if [[ $updated_files == *"Dockerfile.buildtools"* ]]; then
|
||||
echo "docker_file_changed=true" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
tag:
|
||||
runs-on: ubuntu-latest
|
||||
needs: [ check-if-build-tools-dockerfile-changed ]
|
||||
outputs:
|
||||
build-tools-tag: ${{steps.buildtools-tag.outputs.image_tag}}
|
||||
|
||||
steps:
|
||||
- name: Get buildtools tag
|
||||
env:
|
||||
DOCKERFILE_CHANGED: ${{ needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed }}
|
||||
run: |
|
||||
if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]] && [[ "${DOCKERFILE_CHANGED}" == "true" ]]; then
|
||||
IMAGE_TAG=$GITHUB_RUN_ID
|
||||
else
|
||||
IMAGE_TAG=pinned
|
||||
fi
|
||||
|
||||
echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT
|
||||
shell: bash
|
||||
id: buildtools-tag
|
||||
|
||||
kaniko:
|
||||
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
|
||||
needs: [ tag, check-if-build-tools-dockerfile-changed ]
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
container: gcr.io/kaniko-project/executor:v1.7.0-debug
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Configure ECR login
|
||||
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
|
||||
|
||||
- name: Kaniko build
|
||||
run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64
|
||||
|
||||
kaniko-arm:
|
||||
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
|
||||
needs: [ tag, check-if-build-tools-dockerfile-changed ]
|
||||
runs-on: [ self-hosted, dev, arm64 ]
|
||||
container: gcr.io/kaniko-project/executor:v1.7.0-debug
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Configure ECR login
|
||||
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
|
||||
|
||||
- name: Kaniko build
|
||||
run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
|
||||
|
||||
manifest:
|
||||
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
|
||||
name: 'manifest'
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
needs:
|
||||
- tag
|
||||
- kaniko
|
||||
- kaniko-arm
|
||||
- check-if-build-tools-dockerfile-changed
|
||||
|
||||
steps:
|
||||
- name: Create manifest
|
||||
run: docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
|
||||
|
||||
- name: Push manifest
|
||||
run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}
|
||||
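After both per-architecture kaniko jobs push their images, the `manifest` job stitches them into a single multi-arch tag. A quick way to check the result (illustrative only; the registry path comes from the workflow above, and local ECR credentials are assumed):

```bash
# Assumes local AWS/ECR auth is already configured (e.g. via the amazon-ecr-credential-helper).
IMAGE=369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
TAG=pinned   # or the $GITHUB_RUN_ID tag produced for a pull request build

# Should list two child manifests: one for linux/amd64 and one for linux/arm64.
docker manifest inspect "${IMAGE}:${TAG}"
```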
.github/workflows/build_and_test.yml (59 changed lines, vendored)
@@ -44,7 +44,6 @@ jobs:
|
||||
|
||||
exit 1
|
||||
|
||||
|
||||
tag:
|
||||
needs: [ check-permissions ]
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
@@ -74,11 +73,19 @@ jobs:
|
||||
shell: bash
|
||||
id: build-tag
|
||||
|
||||
check-codestyle-python:
|
||||
build-buildtools-image:
|
||||
needs: [ check-permissions ]
|
||||
uses: ./.github/workflows/build_and_push_docker_image.yml
|
||||
with:
|
||||
dockerfile-path: Dockerfile.buildtools
|
||||
image-name: build-tools
|
||||
secrets: inherit
|
||||
|
||||
check-codestyle-python:
|
||||
needs: [ check-permissions, build-buildtools-image ]
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
@@ -108,10 +115,10 @@ jobs:
|
||||
run: poetry run mypy .
|
||||
|
||||
check-codestyle-rust:
|
||||
needs: [ check-permissions ]
|
||||
needs: [ check-permissions, build-buildtools-image ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
@@ -175,10 +182,10 @@ jobs:
|
||||
run: cargo deny check --hide-inclusion-graph
|
||||
|
||||
build-neon:
|
||||
needs: [ check-permissions, tag ]
|
||||
needs: [ check-permissions, tag, build-buildtools-image ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
options: --init
|
||||
strategy:
|
||||
fail-fast: false
|
||||
@@ -332,16 +339,16 @@ jobs:
|
||||
run: |
|
||||
${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
|
||||
|
||||
- name: Run cargo test
|
||||
- name: Run rust tests
|
||||
run: |
|
||||
${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES
|
||||
${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
|
||||
|
||||
# Run separate tests for real S3
|
||||
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
|
||||
export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
|
||||
export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
|
||||
export REMOTE_STORAGE_S3_REGION=eu-central-1
|
||||
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
|
||||
${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
|
||||
${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_s3)'
|
||||
|
||||
# Run separate tests for real Azure Blob Storage
|
||||
# XXX: replace region with `eu-central-1`-like region
|
||||
@@ -351,7 +358,7 @@ jobs:
|
||||
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
|
||||
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
|
||||
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
|
||||
${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
|
||||
${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_azure)'
|
||||
|
||||
- name: Install rust binaries
|
||||
run: |
|
||||
@@ -408,10 +415,10 @@ jobs:
|
||||
uses: ./.github/actions/save-coverage-data
|
||||
|
||||
regress-tests:
|
||||
needs: [ check-permissions, build-neon, tag ]
|
||||
needs: [ check-permissions, build-neon, build-buildtools-image, tag ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
# Default shared memory is 64mb
|
||||
options: --init --shm-size=512mb
|
||||
strategy:
|
||||
@@ -447,10 +454,10 @@ jobs:
|
||||
uses: ./.github/actions/save-coverage-data
|
||||
|
||||
benchmarks:
|
||||
needs: [ check-permissions, build-neon ]
|
||||
needs: [ check-permissions, build-neon, build-buildtools-image ]
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
# Default shared memory is 64mb
|
||||
options: --init --shm-size=512mb
|
||||
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
|
||||
@@ -479,12 +486,12 @@ jobs:
|
||||
# while coverage is currently collected for the debug ones
|
||||
|
||||
create-test-report:
|
||||
needs: [ check-permissions, regress-tests, coverage-report, benchmarks ]
|
||||
needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-buildtools-image ]
|
||||
if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
|
||||
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
@@ -526,11 +533,10 @@ jobs:
|
||||
})
|
||||
|
||||
coverage-report:
|
||||
needs: [ check-permissions, regress-tests ]
|
||||
|
||||
needs: [ check-permissions, regress-tests, build-buildtools-image ]
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
options: --init
|
||||
strategy:
|
||||
fail-fast: false
|
||||
@@ -694,7 +700,7 @@ jobs:
|
||||
}"
|
||||
|
||||
neon-image:
|
||||
needs: [ check-permissions, tag ]
|
||||
needs: [ check-permissions, build-buildtools-image, tag ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
container: gcr.io/kaniko-project/executor:v1.9.2-debug
|
||||
defaults:
|
||||
@@ -733,6 +739,7 @@ jobs:
|
||||
--context .
|
||||
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
--build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }}
|
||||
--build-arg TAG=${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
|
||||
--destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
|
||||
@@ -743,7 +750,7 @@ jobs:
|
||||
|
||||
compute-tools-image:
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
needs: [ check-permissions, tag ]
|
||||
needs: [ check-permissions, build-buildtools-image, tag ]
|
||||
container: gcr.io/kaniko-project/executor:v1.9.2-debug
|
||||
defaults:
|
||||
run:
|
||||
@@ -778,6 +785,7 @@ jobs:
|
||||
--context .
|
||||
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
|
||||
--build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
|
||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
--dockerfile Dockerfile.compute-tools
|
||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
|
||||
@@ -788,7 +796,7 @@ jobs:
|
||||
run: rm -rf ~/.ecr
|
||||
|
||||
compute-node-image:
|
||||
needs: [ check-permissions, tag ]
|
||||
needs: [ check-permissions, build-buildtools-image, tag ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
container:
|
||||
image: gcr.io/kaniko-project/executor:v1.9.2-debug
|
||||
@@ -836,6 +844,7 @@ jobs:
|
||||
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
--build-arg PG_VERSION=${{ matrix.version }}
|
||||
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
|
||||
--build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
|
||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
--dockerfile Dockerfile.compute-node
|
||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
@@ -857,7 +866,7 @@ jobs:
|
||||
run:
|
||||
shell: sh -eu {0}
|
||||
env:
|
||||
VM_BUILDER_VERSION: v0.19.0
|
||||
VM_BUILDER_VERSION: v0.21.0
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
|
||||
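The `build-neon` job now runs the real-S3 (and real Azure Blob Storage) integration tests through nextest against a dedicated bucket. A hedged sketch of reproducing the S3 part outside CI (bucket, region and filters are copied from the workflow; real AWS credentials are assumed to be present in the environment):

```bash
# Assumes AWS credentials with access to the test bucket are available in the environment.
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
export REMOTE_STORAGE_S3_REGION=eu-central-1

# Same invocation as the workflow; the `testing` cargo feature is not enabled for these e2e tests.
cargo nextest run -E 'package(remote_storage)' -E 'test(test_real_s3)'
```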
.github/workflows/neon_extra_builds.yml (2 changed lines, vendored)
@@ -218,7 +218,7 @@ jobs:
|
||||
|
||||
# Run separate tests for real S3
|
||||
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
|
||||
export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
|
||||
export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
|
||||
export REMOTE_STORAGE_S3_REGION=eu-central-1
|
||||
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
|
||||
cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
|
||||
|
||||
.github/workflows/update_build_tools_image.yml (new file, 130 lines, vendored)
@@ -0,0 +1,130 @@
|
||||
name: 'Update build tools image tag'
|
||||
|
||||
# This workflow is used to update the tag of the build tools image in ECR.
# The most common use case is adding or moving the `pinned` tag to the image tagged `${GITHUB_RUN_ID}`.
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
from-tag:
|
||||
description: 'Source tag'
|
||||
required: true
|
||||
type: string
|
||||
to-tag:
|
||||
description: 'Destination tag'
|
||||
required: true
|
||||
type: string
|
||||
default: 'pinned'
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euo pipefail {0}
|
||||
|
||||
env:
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
tag-image:
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container: golang:1.19-bullseye
|
||||
|
||||
env:
|
||||
IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
|
||||
FROM_TAG: ${{ inputs.from-tag }}
|
||||
TO_TAG: ${{ inputs.to-tag }}
|
||||
outputs:
|
||||
next-digest-buildtools: ${{ steps.next-digest.outputs.next-digest-buildtools }}
|
||||
prev-digest-buildtools: ${{ steps.prev-digest.outputs.prev-digest-buildtools }}
|
||||
|
||||
steps:
|
||||
- name: Install Crane & ECR helper
|
||||
run: |
|
||||
go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
|
||||
go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
|
||||
|
||||
- name: Configure ECR login
|
||||
run: |
|
||||
mkdir /github/home/.docker/
|
||||
echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
|
||||
|
||||
- name: Get source image digest
|
||||
id: next-digest
|
||||
run: |
|
||||
NEXT_DIGEST=$(crane digest ${IMAGE}:${FROM_TAG} || true)
|
||||
if [ -z "${NEXT_DIGEST}" ]; then
|
||||
echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Current ${IMAGE}@${FROM_TAG} image is ${IMAGE}@${NEXT_DIGEST}"
|
||||
echo "next-digest-buildtools=$NEXT_DIGEST" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Get destination image digest (if already exists)
|
||||
id: prev-digest
|
||||
run: |
|
||||
PREV_DIGEST=$(crane digest ${IMAGE}:${TO_TAG} || true)
|
||||
if [ -z "${PREV_DIGEST}" ]; then
|
||||
echo >&2 "Image ${IMAGE}:${TO_TAG} does not exist (it's ok)"
|
||||
else
|
||||
echo >&2 "Current ${IMAGE}@${TO_TAG} image is ${IMAGE}@${PREV_DIGEST}"
|
||||
|
||||
echo "prev-digest-buildtools=$PREV_DIGEST" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Tag image
|
||||
run: |
|
||||
crane tag "${IMAGE}:${FROM_TAG}" "${TO_TAG}"
|
||||
|
||||
rollback-tag-image:
|
||||
needs: tag-image
|
||||
if: ${{ !success() }}
|
||||
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container: golang:1.19-bullseye
|
||||
|
||||
env:
|
||||
IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
|
||||
FROM_TAG: ${{ inputs.from-tag }}
|
||||
TO_TAG: ${{ inputs.to-tag }}
|
||||
|
||||
steps:
|
||||
- name: Install Crane & ECR helper
|
||||
run: |
|
||||
go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
|
||||
go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
|
||||
|
||||
- name: Configure ECR login
|
||||
run: |
|
||||
mkdir /github/home/.docker/
|
||||
echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
|
||||
|
||||
- name: Restore previous tag if needed
|
||||
run: |
|
||||
NEXT_DIGEST="${{ needs.tag-image.outputs.next-digest-buildtools }}"
|
||||
PREV_DIGEST="${{ needs.tag-image.outputs.prev-digest-buildtools }}"
|
||||
|
||||
if [ -z "${NEXT_DIGEST}" ]; then
|
||||
echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist, nothing to rollback"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
if [ -z "${PREV_DIGEST}" ]; then
|
||||
# I guess we should delete the tag here/untag the image, but crane does not support it
|
||||
# - https://github.com/google/go-containerregistry/issues/999
|
||||
|
||||
echo >&2 "Image ${IMAGE}:${TO_TAG} did not exist, but it was created by the job, no need to rollback"
|
||||
|
||||
exit 0
|
||||
fi
|
||||
|
||||
CURRENT_DIGEST=$(crane digest "${IMAGE}:${TO_TAG}")
|
||||
if [ "${CURRENT_DIGEST}" == "${NEXT_DIGEST}" ]; then
|
||||
crane tag "${IMAGE}@${PREV_DIGEST}" "${TO_TAG}"
|
||||
|
||||
echo >&2 "Successfully restored ${TO_TAG} tag from ${IMAGE}@${CURRENT_DIGEST} to ${IMAGE}@${PREV_DIGEST}"
|
||||
else
|
||||
echo >&2 "Image ${IMAGE}:${TO_TAG}@${CURRENT_DIGEST} is not required to be restored"
|
||||
fi
|
||||
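The workflow above is essentially a guarded wrapper around `crane digest` and `crane tag`. An equivalent manual flow, should the workflow be unavailable, might look like the following (illustrative; it assumes crane and the ECR credential helper are installed and that `~/.docker/config.json` uses `ecr-login`):

```bash
IMAGE=369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
FROM_TAG=6254913013   # example run id from the docs below; use the tag you want to promote
TO_TAG=pinned

# Remember where the destination tag currently points, for a manual rollback.
PREV_DIGEST=$(crane digest "${IMAGE}:${TO_TAG}" || true)

# Move the tag.
crane tag "${IMAGE}:${FROM_TAG}" "${TO_TAG}"

# Roll back by re-tagging the previous digest if something goes wrong:
# crane tag "${IMAGE}@${PREV_DIGEST}" "${TO_TAG}"
```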
.gitignore (1 changed line, vendored)
@@ -6,6 +6,7 @@ __pycache__/
|
||||
test_output/
|
||||
.vscode
|
||||
.idea
|
||||
neon.iml
|
||||
/.neon
|
||||
/integration_tests/.neon
|
||||
|
||||
|
||||
@@ -70,3 +70,17 @@ We're using the following approach to make it work:
|
||||
- The label gets removed automatically, so to run CI again with new changes, the label should be added again (after the review)
|
||||
|
||||
For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml)
|
||||
|
||||
## How do I add the "pinned" tag to a buildtools image?

We use the `pinned` tag for `Dockerfile.buildtools` build images in our CI/CD setup; currently, adding the `pinned` tag is a manual operation.

You can trigger it from the GitHub UI (https://github.com/neondatabase/neon/actions/workflows/update_build_tools_image.yml)
or with the GitHub CLI:

```bash
gh workflow -R neondatabase/neon run update_build_tools_image.yml \
  -f from-tag=6254913013 \
  -f to-tag=pinned

# The default `-f to-tag` is `pinned`, so the parameter can be omitted.
```
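Before moving the tag, it can be useful to see which image `pinned` currently points to. One way to check (not part of the documented flow; assumes the AWS CLI is configured for the account that owns the ECR registry):

```bash
# Illustrative only: inspect the image currently tagged `pinned` in the build-tools repository.
aws ecr describe-images \
  --region eu-central-1 \
  --repository-name build-tools \
  --image-ids imageTag=pinned \
  --query 'imageDetails[0].{digest:imageDigest,tags:imageTags,pushed:imagePushedAt}'
```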
Cargo.lock (103 changed lines, generated)
@@ -1168,6 +1168,7 @@ dependencies = [
|
||||
"regex",
|
||||
"remote_storage",
|
||||
"reqwest",
|
||||
"rust-ini",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tar",
|
||||
@@ -1201,6 +1202,26 @@ version = "0.9.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "28c122c3980598d243d63d9a704629a2d748d101f278052ff068be5a4423ab6f"
|
||||
|
||||
[[package]]
|
||||
name = "const-random"
|
||||
version = "0.1.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5aaf16c9c2c612020bcfd042e170f6e32de9b9d75adb5277cdbbd2e2c8c8299a"
|
||||
dependencies = [
|
||||
"const-random-macro",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "const-random-macro"
|
||||
version = "0.1.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e"
|
||||
dependencies = [
|
||||
"getrandom 0.2.11",
|
||||
"once_cell",
|
||||
"tiny-keccak",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "const_fn"
|
||||
version = "0.4.9"
|
||||
@@ -1433,6 +1454,12 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crunchy"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7"
|
||||
|
||||
[[package]]
|
||||
name = "crypto-bigint"
|
||||
version = "0.4.9"
|
||||
@@ -1575,6 +1602,15 @@ dependencies = [
|
||||
"syn 2.0.32",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dlv-list"
|
||||
version = "0.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "442039f5147480ba31067cb00ada1adae6892028e40e45fc5de7b7df6dcc1b5f"
|
||||
dependencies = [
|
||||
"const-random",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dyn-clone"
|
||||
version = "1.0.14"
|
||||
@@ -2106,6 +2142,20 @@ dependencies = [
|
||||
"hashbrown 0.13.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hdrhistogram"
|
||||
version = "7.5.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "765c9198f173dd59ce26ff9f95ef0aafd0a0fe01fb9d72841bc5066a4c06511d"
|
||||
dependencies = [
|
||||
"base64 0.21.1",
|
||||
"byteorder",
|
||||
"crossbeam-channel",
|
||||
"flate2",
|
||||
"nom",
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "heapless"
|
||||
version = "0.8.0"
|
||||
@@ -3029,6 +3079,16 @@ dependencies = [
|
||||
"tokio-stream",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ordered-multimap"
|
||||
version = "0.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a4d6a8c22fc714f0c2373e6091bf6f5e9b37b1bc0b1184874b7e0a4e303d318f"
|
||||
dependencies = [
|
||||
"dlv-list",
|
||||
"hashbrown 0.14.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "os_info"
|
||||
version = "3.7.0"
|
||||
@@ -3057,6 +3117,28 @@ dependencies = [
|
||||
"sha2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pagebench"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"clap",
|
||||
"futures",
|
||||
"hdrhistogram",
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"pageserver",
|
||||
"pageserver_api",
|
||||
"pageserver_client",
|
||||
"rand 0.8.5",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pagectl"
|
||||
version = "0.1.0"
|
||||
@@ -4180,6 +4262,16 @@ dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rust-ini"
|
||||
version = "0.20.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3e0698206bcb8882bf2a9ecb4c1e7785db57ff052297085a6efd4fe42302068a"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"ordered-multimap",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustc-demangle"
|
||||
version = "0.1.23"
|
||||
@@ -5134,6 +5226,15 @@ dependencies = [
|
||||
"time-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tiny-keccak"
|
||||
version = "2.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237"
|
||||
dependencies = [
|
||||
"crunchy",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tinytemplate"
|
||||
version = "1.2.1"
|
||||
@@ -6301,6 +6402,7 @@ dependencies = [
|
||||
"futures-io",
|
||||
"futures-sink",
|
||||
"futures-util",
|
||||
"getrandom 0.2.11",
|
||||
"hex",
|
||||
"hmac",
|
||||
"hyper",
|
||||
@@ -6312,6 +6414,7 @@ dependencies = [
|
||||
"num-bigint",
|
||||
"num-integer",
|
||||
"num-traits",
|
||||
"once_cell",
|
||||
"prost",
|
||||
"rand 0.8.5",
|
||||
"regex",
|
||||
|
||||
@@ -6,6 +6,7 @@ members = [
|
||||
"pageserver",
|
||||
"pageserver/ctl",
|
||||
"pageserver/client",
|
||||
"pageserver/pagebench",
|
||||
"proxy",
|
||||
"safekeeper",
|
||||
"storage_broker",
|
||||
@@ -79,6 +80,7 @@ futures-util = "0.3"
|
||||
git-version = "0.3"
|
||||
hashbrown = "0.13"
|
||||
hashlink = "0.8.1"
|
||||
hdrhistogram = "7.5.2"
|
||||
hex = "0.4"
|
||||
hex-literal = "0.4"
|
||||
hmac = "0.12.1"
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
### By default, the binaries inside the image have some mock parameters and can start, but they are
### not intended to be used from this image in real deployments.
|
||||
ARG REPOSITORY=neondatabase
|
||||
ARG IMAGE=rust
|
||||
ARG IMAGE=build-tools
|
||||
ARG TAG=pinned
|
||||
|
||||
# Build Postgres
|
||||
|
||||
Dockerfile.buildtools (new file, 166 lines)
@@ -0,0 +1,166 @@
|
||||
FROM debian:bullseye-slim
|
||||
|
||||
# Add nonroot user
|
||||
RUN useradd -ms /bin/bash nonroot -b /home
|
||||
SHELL ["/bin/bash", "-c"]
|
||||
|
||||
# System deps
|
||||
RUN set -e \
|
||||
&& apt update \
|
||||
&& apt install -y \
|
||||
autoconf \
|
||||
automake \
|
||||
bison \
|
||||
build-essential \
|
||||
ca-certificates \
|
||||
cmake \
|
||||
curl \
|
||||
flex \
|
||||
git \
|
||||
gnupg \
|
||||
gzip \
|
||||
jq \
|
||||
libcurl4-openssl-dev \
|
||||
libbz2-dev \
|
||||
libffi-dev \
|
||||
liblzma-dev \
|
||||
libncurses5-dev \
|
||||
libncursesw5-dev \
|
||||
libpq-dev \
|
||||
libreadline-dev \
|
||||
libseccomp-dev \
|
||||
libsqlite3-dev \
|
||||
libssl-dev \
|
||||
libstdc++-10-dev \
|
||||
libtool \
|
||||
libxml2-dev \
|
||||
libxmlsec1-dev \
|
||||
libxxhash-dev \
|
||||
lsof \
|
||||
make \
|
||||
netcat \
|
||||
net-tools \
|
||||
openssh-client \
|
||||
parallel \
|
||||
pkg-config \
|
||||
unzip \
|
||||
wget \
|
||||
xz-utils \
|
||||
zlib1g-dev \
|
||||
zstd \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||
|
||||
# protobuf-compiler (protoc)
|
||||
ENV PROTOC_VERSION 25.1
|
||||
RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \
|
||||
&& unzip -q protoc.zip -d protoc \
|
||||
&& mv protoc/bin/protoc /usr/local/bin/protoc \
|
||||
&& mv protoc/include/google /usr/local/include/google \
|
||||
&& rm -rf protoc.zip protoc
|
||||
|
||||
# LLVM
|
||||
ENV LLVM_VERSION=17
|
||||
RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
|
||||
&& echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
|
||||
&& apt update \
|
||||
&& apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \
|
||||
&& bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||
|
||||
# PostgreSQL 14
|
||||
RUN curl -fsSL 'https://www.postgresql.org/media/keys/ACCC4CF8.asc' | apt-key add - \
|
||||
&& echo 'deb http://apt.postgresql.org/pub/repos/apt bullseye-pgdg main' > /etc/apt/sources.list.d/pgdg.list \
|
||||
&& apt update \
|
||||
&& apt install -y postgresql-client-14 \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||
|
||||
# AWS CLI
|
||||
RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \
|
||||
&& unzip -q awscliv2.zip \
|
||||
&& ./aws/install \
|
||||
&& rm awscliv2.zip
|
||||
|
||||
# Mold: A Modern Linker
|
||||
ENV MOLD_VERSION v2.4.0
|
||||
RUN set -e \
|
||||
&& git clone https://github.com/rui314/mold.git \
|
||||
&& mkdir mold/build \
|
||||
&& cd mold/build \
|
||||
&& git checkout ${MOLD_VERSION} \
|
||||
&& cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=clang++ .. \
|
||||
&& cmake --build . -j $(nproc) \
|
||||
&& cmake --install . \
|
||||
&& cd .. \
|
||||
&& rm -rf mold
|
||||
|
||||
# LCOV
|
||||
# Build lcov from a fork:
|
||||
# It includes several bug fixes on top of the v2.0 release (https://github.com/linux-test-project/lcov/compare/v2.0...master)
|
||||
# And patches from us:
|
||||
# - Generates a JSON file with a code coverage summary (https://github.com/neondatabase/lcov/commit/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz)
|
||||
RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')"; done \
|
||||
&& wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \
|
||||
&& echo "61a22a62e20908b8b9e27d890bd0ea31f567a7b9668065589266371dcbca0992 lcov.tar.gz" | sha256sum --check \
|
||||
&& mkdir -p lcov && tar -xzf lcov.tar.gz -C lcov --strip-components=1 \
|
||||
&& cd lcov \
|
||||
&& make install \
|
||||
&& rm -rf ../lcov.tar.gz
|
||||
|
||||
# Switch to nonroot user
|
||||
USER nonroot:nonroot
|
||||
WORKDIR /home/nonroot
|
||||
|
||||
# Python
|
||||
ENV PYTHON_VERSION=3.9.2 \
|
||||
PYENV_ROOT=/home/nonroot/.pyenv \
|
||||
PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH
|
||||
RUN set -e \
|
||||
&& cd $HOME \
|
||||
&& curl -sSO https://raw.githubusercontent.com/pyenv/pyenv-installer/master/bin/pyenv-installer \
|
||||
&& chmod +x pyenv-installer \
|
||||
&& ./pyenv-installer \
|
||||
&& export PYENV_ROOT=/home/nonroot/.pyenv \
|
||||
&& export PATH="$PYENV_ROOT/bin:$PATH" \
|
||||
&& export PATH="$PYENV_ROOT/shims:$PATH" \
|
||||
&& pyenv install ${PYTHON_VERSION} \
|
||||
&& pyenv global ${PYTHON_VERSION} \
|
||||
&& python --version \
|
||||
&& pip install --upgrade pip \
|
||||
&& pip --version \
|
||||
&& pip install pipenv wheel poetry
|
||||
|
||||
# Switch to nonroot user (again)
|
||||
USER nonroot:nonroot
|
||||
WORKDIR /home/nonroot
|
||||
|
||||
# Rust
|
||||
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
|
||||
ENV RUSTC_VERSION=1.74.0
|
||||
ENV RUSTUP_HOME="/home/nonroot/.rustup"
|
||||
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
|
||||
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
|
||||
chmod +x rustup-init && \
|
||||
./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \
|
||||
rm rustup-init && \
|
||||
export PATH="$HOME/.cargo/bin:$PATH" && \
|
||||
. "$HOME/.cargo/env" && \
|
||||
cargo --version && rustup --version && \
|
||||
rustup component add llvm-tools-preview rustfmt clippy && \
|
||||
cargo install --git https://github.com/paritytech/cachepot && \
|
||||
cargo install rustfilt && \
|
||||
cargo install cargo-hakari && \
|
||||
cargo install cargo-deny && \
|
||||
cargo install cargo-hack && \
|
||||
cargo install cargo-nextest && \
|
||||
rm -rf /home/nonroot/.cargo/registry && \
|
||||
rm -rf /home/nonroot/.cargo/git
|
||||
ENV RUSTC_WRAPPER=cachepot
|
||||
|
||||
# Show versions
|
||||
RUN whoami \
|
||||
&& python --version \
|
||||
&& pip --version \
|
||||
&& cargo --version --verbose \
|
||||
&& rustup --version --verbose \
|
||||
&& rustc --version --verbose \
|
||||
&& clang --version
|
||||
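In CI this Dockerfile is built per architecture with kaniko (see `build_and_push_docker_image.yml` earlier in this diff); for a quick local build one might instead use Docker directly. A sketch, with an arbitrary local tag:

```bash
# Local single-arch build of the build tools image (CI uses kaniko and pushes to ECR instead).
docker build -f Dockerfile.buildtools -t build-tools:local .

# Multi-arch variant via buildx, roughly matching the amd64 + arm64 images CI produces:
# docker buildx build --platform linux/amd64,linux/arm64 -f Dockerfile.buildtools -t build-tools:local .
```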
@@ -1,6 +1,6 @@
|
||||
ARG PG_VERSION
|
||||
ARG REPOSITORY=neondatabase
|
||||
ARG IMAGE=rust
|
||||
ARG IMAGE=build-tools
|
||||
ARG TAG=pinned
|
||||
ARG BUILD_TAG
|
||||
|
||||
@@ -48,7 +48,29 @@ RUN cd postgres && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/refint.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control && \
|
||||
# We need to grant EXECUTE on pg_stat_statements_reset() to neon_superuser.
# In vanilla Postgres this function can only be executed by a superuser.
# In Neon, the neon_superuser role is not a superuser, but it stands in for the superuser in some cases.
# We could add the extra GRANT statements to our Postgres fork, but that would be hard to maintain
# whenever we pick up a new Postgres version, and we want to keep the changes in our fork small,
# so we do it here instead.
|
||||
old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \
|
||||
# the first loop is for pg_stat_statements extension versions <= 1.6
|
||||
for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
|
||||
filename=$(basename "$file"); \
|
||||
if echo "$old_list" | grep -q -F "$filename"; then \
|
||||
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \
|
||||
fi; \
|
||||
done; \
|
||||
# the second loop is for pg_stat_statements extension versions >= 1.7,
# where pg_stat_statements_reset() gained 3 additional arguments
|
||||
for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
|
||||
filename=$(basename "$file"); \
|
||||
if ! echo "$old_list" | grep -q -F "$filename"; then \
|
||||
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \
|
||||
fi; \
|
||||
done
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
|
||||
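The net effect of the two loops above is that every `pg_stat_statements--*.sql` install script ends with a GRANT matching the reset function's signature for that extension version. A hedged way to verify this from a running compute (the connection string is the example from the `compute_ctl` doc comment later in this diff, and `neon_superuser` is Neon-specific):

```bash
# Placeholder connection string; adjust to your compute endpoint.
psql 'postgresql://cloud_admin@localhost/postgres' <<'SQL'
CREATE EXTENSION IF NOT EXISTS pg_stat_statements;
-- For extension versions >= 1.7 the function takes (oid, oid, bigint); older versions take no arguments.
SELECT has_function_privilege('neon_superuser', 'pg_stat_statements_reset(oid, oid, bigint)', 'EXECUTE');
SQL
```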
@@ -1,7 +1,7 @@
|
||||
# First transient image to build compute_tools binaries
|
||||
# NB: keep in sync with rust image version in .github/workflows/build_and_test.yml
|
||||
ARG REPOSITORY=neondatabase
|
||||
ARG IMAGE=rust
|
||||
ARG IMAGE=build-tools
|
||||
ARG TAG=pinned
|
||||
ARG BUILD_TAG
|
||||
|
||||
|
||||
@@ -39,3 +39,4 @@ remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
|
||||
vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
|
||||
zstd = "0.13"
|
||||
bytes = "1.0"
|
||||
rust-ini = "0.20.0"
|
||||
|
||||
@@ -31,7 +31,9 @@
|
||||
//! -C 'postgresql://cloud_admin@localhost/postgres' \
|
||||
//! -S /var/db/postgres/specs/current.json \
|
||||
//! -b /usr/local/bin/postgres \
|
||||
//! -r http://pg-ext-s3-gateway
|
||||
//! -r http://pg-ext-s3-gateway \
|
||||
//! --pgbouncer-connstr 'host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable'
|
||||
//! --pgbouncer-ini-path /etc/pgbouncer.ini \
|
||||
//! ```
|
||||
//!
|
||||
use std::collections::HashMap;
|
||||
@@ -99,6 +101,9 @@ fn main() -> Result<()> {
|
||||
let spec_json = matches.get_one::<String>("spec");
|
||||
let spec_path = matches.get_one::<String>("spec-path");
|
||||
|
||||
let pgbouncer_connstr = matches.get_one::<String>("pgbouncer-connstr");
|
||||
let pgbouncer_ini_path = matches.get_one::<String>("pgbouncer-ini-path");
|
||||
|
||||
// Extract OpenTelemetry context for the startup actions from the
|
||||
// TRACEPARENT and TRACESTATE env variables, and attach it to the current
|
||||
// tracing context.
|
||||
@@ -209,6 +214,8 @@ fn main() -> Result<()> {
|
||||
ext_remote_storage: ext_remote_storage.map(|s| s.to_string()),
|
||||
ext_download_progress: RwLock::new(HashMap::new()),
|
||||
build_tag,
|
||||
pgbouncer_connstr: pgbouncer_connstr.map(|s| s.to_string()),
|
||||
pgbouncer_ini_path: pgbouncer_ini_path.map(|s| s.to_string()),
|
||||
};
|
||||
let compute = Arc::new(compute_node);
|
||||
|
||||
@@ -493,6 +500,23 @@ fn cli() -> clap::Command {
|
||||
)
|
||||
.value_name("FILECACHE_CONNSTR"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("pgbouncer-connstr")
|
||||
.long("pgbouncer-connstr")
|
||||
.default_value(
|
||||
"host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable",
|
||||
)
|
||||
.value_name("PGBOUNCER_CONNSTR"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("pgbouncer-ini-path")
|
||||
.long("pgbouncer-ini-path")
|
||||
// Note: this doesn't match current path for pgbouncer.ini.
|
||||
// Until we fix it, we need to pass the path explicitly
|
||||
// or this will effectively be a no-op.
|
||||
.default_value("/etc/pgbouncer.ini")
|
||||
.value_name("PGBOUNCER_INI_PATH"),
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
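Putting the new flags together with the invocation shown in the doc comment above, a full `compute_ctl` command line could look like this (paths and connection strings are the defaults/examples from the diff, not a prescription):

```bash
# Example invocation based on the doc comment and the new clap arguments; adjust paths for your setup.
compute_ctl \
  -C 'postgresql://cloud_admin@localhost/postgres' \
  -S /var/db/postgres/specs/current.json \
  -b /usr/local/bin/postgres \
  -r http://pg-ext-s3-gateway \
  --pgbouncer-connstr 'host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable' \
  --pgbouncer-ini-path /etc/pgbouncer.ini
```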
@@ -7,6 +7,7 @@ use std::path::Path;
|
||||
use std::process::{Command, Stdio};
|
||||
use std::str::FromStr;
|
||||
use std::sync::{Condvar, Mutex, RwLock};
|
||||
use std::thread;
|
||||
use std::time::Instant;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
@@ -64,6 +65,10 @@ pub struct ComputeNode {
|
||||
// key: ext_archive_name, value: started download time, download_completed?
|
||||
pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
|
||||
pub build_tag: String,
|
||||
// connection string to pgbouncer to change settings
|
||||
pub pgbouncer_connstr: Option<String>,
|
||||
// path to pgbouncer.ini to change settings
|
||||
pub pgbouncer_ini_path: Option<String>,
|
||||
}
|
||||
|
||||
// store some metrics about download size that might impact startup time
|
||||
@@ -737,6 +742,31 @@ impl ComputeNode {
|
||||
pub fn reconfigure(&self) -> Result<()> {
|
||||
let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec;
|
||||
|
||||
if let Some(connstr) = &self.pgbouncer_connstr {
|
||||
info!("tuning pgbouncer with connstr: {:?}", connstr);
|
||||
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("failed to create rt");
|
||||
|
||||
// Spawn a thread to do the tuning,
|
||||
// so that we don't block the main thread that starts Postgres.
|
||||
let pgbouncer_settings = spec.pgbouncer_settings.clone();
|
||||
let connstr_clone = connstr.clone();
|
||||
let pgbouncer_ini_path = self.pgbouncer_ini_path.clone();
|
||||
let _handle = thread::spawn(move || {
|
||||
let res = rt.block_on(tune_pgbouncer(
|
||||
pgbouncer_settings,
|
||||
&connstr_clone,
|
||||
pgbouncer_ini_path,
|
||||
));
|
||||
if let Err(err) = res {
|
||||
error!("error while tuning pgbouncer: {err:?}");
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Write new config
|
||||
let pgdata_path = Path::new(&self.pgdata);
|
||||
let postgresql_conf_path = pgdata_path.join("postgresql.conf");
|
||||
@@ -791,6 +821,32 @@ impl ComputeNode {
|
||||
pspec.timeline_id,
|
||||
);
|
||||
|
||||
// tune pgbouncer
|
||||
if let Some(connstr) = &self.pgbouncer_connstr {
|
||||
info!("tuning pgbouncer with connstr: {:?}", connstr);
|
||||
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("failed to create rt");
|
||||
|
||||
// Spawn a thread to do the tuning,
|
||||
// so that we don't block the main thread that starts Postgres.
|
||||
let pgbouncer_settings = pspec.spec.pgbouncer_settings.clone();
|
||||
let connstr_clone = connstr.clone();
|
||||
let pgbouncer_ini_path = self.pgbouncer_ini_path.clone();
|
||||
let _handle = thread::spawn(move || {
|
||||
let res = rt.block_on(tune_pgbouncer(
|
||||
pgbouncer_settings,
|
||||
&connstr_clone,
|
||||
pgbouncer_ini_path,
|
||||
));
|
||||
if let Err(err) = res {
|
||||
error!("error while tuning pgbouncer: {err:?}");
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
info!(
|
||||
"start_compute spec.remote_extensions {:?}",
|
||||
pspec.spec.remote_extensions
|
||||
|
||||
@@ -9,9 +9,11 @@ use std::process::Child;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use anyhow::{bail, Result};
|
||||
use ini::Ini;
|
||||
use notify::{RecursiveMode, Watcher};
|
||||
use postgres::{Client, Transaction};
|
||||
use tracing::{debug, instrument};
|
||||
use tokio_postgres::NoTls;
|
||||
use tracing::{debug, error, info, instrument};
|
||||
|
||||
use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};
|
||||
|
||||
@@ -359,3 +361,68 @@ pub fn create_pgdata(pgdata: &str) -> Result<()> {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Update pgbouncer.ini with provided options
|
||||
pub fn update_pgbouncer_ini(
|
||||
pgbouncer_config: HashMap<String, String>,
|
||||
pgbouncer_ini_path: &str,
|
||||
) -> Result<()> {
|
||||
let mut conf = Ini::load_from_file(pgbouncer_ini_path)?;
|
||||
let section = conf.section_mut(Some("pgbouncer")).unwrap();
|
||||
|
||||
for (option_name, value) in pgbouncer_config.iter() {
|
||||
section.insert(option_name, value);
|
||||
}
|
||||
|
||||
conf.write_to_file(pgbouncer_ini_path)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Tune pgbouncer.
|
||||
/// 1. Apply new config using pgbouncer admin console
|
||||
/// 2. Add new values to pgbouncer.ini to preserve them after restart
|
||||
pub async fn tune_pgbouncer(
|
||||
pgbouncer_settings: Option<HashMap<String, String>>,
|
||||
pgbouncer_connstr: &str,
|
||||
pgbouncer_ini_path: Option<String>,
|
||||
) -> Result<()> {
|
||||
if let Some(pgbouncer_config) = pgbouncer_settings {
|
||||
// Apply new config
|
||||
let connect_result = tokio_postgres::connect(pgbouncer_connstr, NoTls).await;
|
||||
let (client, connection) = connect_result.unwrap();
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = connection.await {
|
||||
eprintln!("connection error: {}", e);
|
||||
}
|
||||
});
|
||||
|
||||
for (option_name, value) in pgbouncer_config.iter() {
|
||||
info!(
|
||||
"Applying pgbouncer setting change: {} = {}",
|
||||
option_name, value
|
||||
);
|
||||
let query = format!("SET {} = {}", option_name, value);
|
||||
|
||||
let result = client.simple_query(&query).await;
|
||||
|
||||
info!("Applying pgbouncer setting change: {}", query);
|
||||
info!("pgbouncer setting change result: {:?}", result);
|
||||
|
||||
if let Err(err) = result {
|
||||
// Don't fail on error, just log it
|
||||
error!(
|
||||
"Failed to apply pgbouncer setting change: {}, {}",
|
||||
query, err
|
||||
);
|
||||
};
|
||||
}
|
||||
|
||||
// save values to pgbouncer.ini
|
||||
// so that they are preserved after pgbouncer restart
|
||||
if let Some(pgbouncer_ini_path) = pgbouncer_ini_path {
|
||||
update_pgbouncer_ini(pgbouncer_config, &pgbouncer_ini_path)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
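`tune_pgbouncer` applies each setting through pgbouncer's admin console and then persists it to `pgbouncer.ini`. The equivalent manual commands for the example `pgbouncer_settings` that appear later in this diff would be roughly the following (illustrative; the connection string is the default used by `compute_ctl`):

```bash
# Apply the settings via the pgbouncer admin console (what tune_pgbouncer does over tokio_postgres).
psql 'host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable' \
  -c 'SET default_pool_size = 42' \
  -c 'SET pool_mode = session'

# Inspect the live configuration to confirm the change took effect.
psql 'host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable' -c 'SHOW CONFIG'
```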
@@ -370,33 +370,49 @@ pub fn handle_role_deletions(spec: &ComputeSpec, connstr: &str, client: &mut Cli
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn reassign_owned_objects_in_one_db(
|
||||
conf: Config,
|
||||
role_name: &PgIdent,
|
||||
db_owner: &PgIdent,
|
||||
) -> Result<()> {
|
||||
let mut client = conf.connect(NoTls)?;
|
||||
|
||||
// This will reassign all dependent objects to the db owner
|
||||
let reassign_query = format!(
|
||||
"REASSIGN OWNED BY {} TO {}",
|
||||
role_name.pg_quote(),
|
||||
db_owner.pg_quote()
|
||||
);
|
||||
info!(
|
||||
"reassigning objects owned by '{}' in db '{}' to '{}'",
|
||||
role_name,
|
||||
conf.get_dbname().unwrap_or(""),
|
||||
db_owner
|
||||
);
|
||||
client.simple_query(&reassign_query)?;
|
||||
|
||||
// This now will only drop privileges of the role
|
||||
let drop_query = format!("DROP OWNED BY {}", role_name.pg_quote());
|
||||
client.simple_query(&drop_query)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Reassign all owned objects in all databases to the owner of the database.
|
||||
fn reassign_owned_objects(spec: &ComputeSpec, connstr: &str, role_name: &PgIdent) -> Result<()> {
|
||||
for db in &spec.cluster.databases {
|
||||
if db.owner != *role_name {
|
||||
let mut conf = Config::from_str(connstr)?;
|
||||
conf.dbname(&db.name);
|
||||
|
||||
let mut client = conf.connect(NoTls)?;
|
||||
|
||||
// This will reassign all dependent objects to the db owner
|
||||
let reassign_query = format!(
|
||||
"REASSIGN OWNED BY {} TO {}",
|
||||
role_name.pg_quote(),
|
||||
db.owner.pg_quote()
|
||||
);
|
||||
info!(
|
||||
"reassigning objects owned by '{}' in db '{}' to '{}'",
|
||||
role_name, &db.name, &db.owner
|
||||
);
|
||||
client.simple_query(&reassign_query)?;
|
||||
|
||||
// This now will only drop privileges of the role
|
||||
let drop_query = format!("DROP OWNED BY {}", role_name.pg_quote());
|
||||
client.simple_query(&drop_query)?;
|
||||
reassign_owned_objects_in_one_db(conf, role_name, &db.owner)?;
|
||||
}
|
||||
}
|
||||
|
||||
// Also handle case when there are no databases in the spec.
|
||||
// In this case we need to reassign objects in the default database.
|
||||
let conf = Config::from_str(connstr)?;
|
||||
let db_owner = PgIdent::from_str("cloud_admin")?;
|
||||
reassign_owned_objects_in_one_db(conf, role_name, &db_owner)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -537,6 +537,7 @@ impl Endpoint {
|
||||
safekeeper_connstrings,
|
||||
storage_auth_token: auth_token.clone(),
|
||||
remote_extensions,
|
||||
pgbouncer_settings: None,
|
||||
};
|
||||
let spec_path = self.endpoint_path().join("spec.json");
|
||||
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
|
||||
|
||||
@@ -35,6 +35,7 @@ allow = [
|
||||
"Artistic-2.0",
|
||||
"BSD-2-Clause",
|
||||
"BSD-3-Clause",
|
||||
"CC0-1.0",
|
||||
"ISC",
|
||||
"MIT",
|
||||
"MPL-2.0",
|
||||
|
||||
@@ -73,6 +73,8 @@ pub struct ComputeSpec {
|
||||
|
||||
// information about available remote extensions
|
||||
pub remote_extensions: Option<RemoteExtSpec>,
|
||||
|
||||
pub pgbouncer_settings: Option<HashMap<String, String>>,
|
||||
}
|
||||
|
||||
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
|
||||
|
||||
@@ -243,5 +243,9 @@
|
||||
"public_extensions": [
|
||||
"postgis"
|
||||
]
|
||||
},
|
||||
"pgbouncer_settings": {
|
||||
"default_pool_size": "42",
|
||||
"pool_mode": "session"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -81,6 +81,10 @@ impl TenantShardId {
|
||||
pub fn is_zero(&self) -> bool {
|
||||
self.shard_number == ShardNumber(0)
|
||||
}
|
||||
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
|
||||
}
|
||||
}
|
||||
|
||||
/// Formatting helper
|
||||
@@ -159,7 +163,7 @@ impl From<[u8; 18]> for TenantShardId {
|
||||
/// shard we're dealing with, but do not need to know the full ShardIdentity (because
|
||||
/// we won't be doing any page->shard mapping), and do not need to know the fully qualified
|
||||
/// TenantShardId.
|
||||
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy)]
|
||||
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||
pub struct ShardIndex {
|
||||
pub shard_number: ShardNumber,
|
||||
pub shard_count: ShardCount,
|
||||
|
||||
@@ -117,6 +117,8 @@ impl AzureBlobStorage {
|
||||
) -> Result<Download, DownloadError> {
|
||||
let mut response = builder.into_stream();
|
||||
|
||||
let mut etag = None;
|
||||
let mut last_modified = None;
|
||||
let mut metadata = HashMap::new();
|
||||
// TODO give proper streaming response instead of buffering into RAM
|
||||
// https://github.com/neondatabase/neon/issues/5563
|
||||
@@ -124,6 +126,13 @@ impl AzureBlobStorage {
|
||||
let mut bufs = Vec::new();
|
||||
while let Some(part) = response.next().await {
|
||||
let part = part.map_err(to_download_error)?;
|
||||
let etag_str: &str = part.blob.properties.etag.as_ref();
|
||||
if etag.is_none() {
|
||||
etag = Some(etag.unwrap_or_else(|| etag_str.to_owned()));
|
||||
}
|
||||
if last_modified.is_none() {
|
||||
last_modified = Some(part.blob.properties.last_modified.into());
|
||||
}
|
||||
if let Some(blob_meta) = part.blob.metadata {
|
||||
metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
|
||||
}
|
||||
@@ -136,6 +145,8 @@ impl AzureBlobStorage {
|
||||
}
|
||||
Ok(Download {
|
||||
download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
|
||||
etag,
|
||||
last_modified,
|
||||
metadata: Some(StorageMetadata(metadata)),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -14,7 +14,9 @@ mod local_fs;
|
||||
mod s3_bucket;
|
||||
mod simulate_failures;
|
||||
|
||||
use std::{collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc};
|
||||
use std::{
|
||||
collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc, time::SystemTime,
|
||||
};
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
@@ -207,8 +209,13 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>;
|
||||
}
|
||||
|
||||
pub type DownloadStream = Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>;
|
||||
pub struct Download {
|
||||
pub download_stream: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>,
|
||||
pub download_stream: DownloadStream,
|
||||
/// The last time the file was modified (`last-modified` HTTP header)
|
||||
pub last_modified: Option<SystemTime>,
|
||||
/// A way to identify this specific version of the resource (`etag` HTTP header)
|
||||
pub etag: Option<String>,
|
||||
/// Extra key-value data, associated with the current remote file.
|
||||
pub metadata: Option<StorageMetadata>,
|
||||
}
|
||||
|
||||
@@ -18,7 +18,7 @@ use tokio_util::io::ReaderStream;
|
||||
use tracing::*;
|
||||
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
|
||||
|
||||
use crate::{Download, DownloadError, Listing, ListingMode, RemotePath};
|
||||
use crate::{Download, DownloadError, DownloadStream, Listing, ListingMode, RemotePath};
|
||||
|
||||
use super::{RemoteStorage, StorageMetadata};
|
||||
|
||||
@@ -331,6 +331,8 @@ impl RemoteStorage for LocalFs {
            .map_err(DownloadError::Other)?;
        Ok(Download {
            metadata,
            last_modified: None,
            etag: None,
            download_stream: Box::pin(source),
        })
    } else {
@@ -372,17 +374,17 @@ impl RemoteStorage for LocalFs {
            .await
            .map_err(DownloadError::Other)?;

        Ok(match end_exclusive {
            Some(end_exclusive) => Download {
                metadata,
                download_stream: Box::pin(ReaderStream::new(
                    source.take(end_exclusive - start_inclusive),
                )),
            },
            None => Download {
                metadata,
                download_stream: Box::pin(ReaderStream::new(source)),
            },
        let download_stream: DownloadStream = match end_exclusive {
            Some(end_exclusive) => Box::pin(ReaderStream::new(
                source.take(end_exclusive - start_inclusive),
            )),
            None => Box::pin(ReaderStream::new(source)),
        };
        Ok(Download {
            metadata,
            last_modified: None,
            etag: None,
            download_stream,
        })
    } else {
        Err(DownloadError::NotFound)

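The ranged path above limits an already-seeked reader with `AsyncReadExt::take` before wrapping it in a `ReaderStream`. A standalone sketch of that pattern, under the assumption of `tokio`, `tokio-util`, and `futures` being available:

```rust
use futures::StreamExt;
use tokio::io::{AsyncReadExt, AsyncSeekExt, SeekFrom};
use tokio_util::io::ReaderStream;

// Read bytes [start, end) of a file as a chunked stream, then collect them.
async fn read_range(path: &str, start: u64, end: u64) -> std::io::Result<Vec<u8>> {
    let mut file = tokio::fs::File::open(path).await?;
    file.seek(SeekFrom::Start(start)).await?;
    let mut stream = ReaderStream::new(file.take(end - start));
    let mut out = Vec::new();
    while let Some(chunk) = stream.next().await {
        out.extend_from_slice(&chunk?);
    }
    Ok(out)
}
```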
@@ -16,6 +16,7 @@ use aws_config::{
|
||||
environment::credentials::EnvironmentVariableCredentialsProvider,
|
||||
imds::credentials::ImdsCredentialsProvider,
|
||||
meta::credentials::CredentialsProviderChain,
|
||||
profile::ProfileFileCredentialsProvider,
|
||||
provider_config::ProviderConfig,
|
||||
retry::{RetryConfigBuilder, RetryMode},
|
||||
web_identity_token::WebIdentityTokenCredentialsProvider,
|
||||
@@ -74,20 +75,29 @@ impl S3Bucket {
|
||||
|
||||
let region = Some(Region::new(aws_config.bucket_region.clone()));
|
||||
|
||||
let provider_conf = ProviderConfig::without_region().with_region(region.clone());
|
||||
|
||||
let credentials_provider = {
|
||||
// uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
|
||||
CredentialsProviderChain::first_try(
|
||||
"env",
|
||||
EnvironmentVariableCredentialsProvider::new(),
|
||||
)
|
||||
// uses "AWS_PROFILE" / `aws sso login --profile <profile>`
|
||||
.or_else(
|
||||
"profile-sso",
|
||||
ProfileFileCredentialsProvider::builder()
|
||||
.configure(&provider_conf)
|
||||
.build(),
|
||||
)
|
||||
// uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
|
||||
// needed to access remote extensions bucket
|
||||
.or_else("token", {
|
||||
let provider_conf = ProviderConfig::without_region().with_region(region.clone());
|
||||
.or_else(
|
||||
"token",
|
||||
WebIdentityTokenCredentialsProvider::builder()
|
||||
.configure(&provider_conf)
|
||||
.build()
|
||||
})
|
||||
.build(),
|
||||
)
|
||||
// uses imds v2
|
||||
.or_else("imds", ImdsCredentialsProvider::builder().build())
|
||||
};
|
||||
@@ -218,17 +228,11 @@ impl S3Bucket {
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
|
||||
if get_object.is_err() {
|
||||
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
kind,
|
||||
AttemptOutcome::Err,
|
||||
started_at,
|
||||
);
|
||||
}
|
||||
|
||||
match get_object {
|
||||
Ok(object_output) => {
|
||||
let metadata = object_output.metadata().cloned().map(StorageMetadata);
|
||||
let etag = object_output.e_tag.clone();
|
||||
let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok());
|
||||
|
||||
let body = object_output.body;
|
||||
let body = ByteStreamAsStream::from(body);
|
||||
@@ -237,15 +241,33 @@ impl S3Bucket {
|
||||
|
||||
Ok(Download {
|
||||
metadata,
|
||||
etag,
|
||||
last_modified,
|
||||
download_stream: Box::pin(body),
|
||||
})
|
||||
}
|
||||
Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
|
||||
// Count this in the AttemptOutcome::Ok bucket, because 404 is not
|
||||
// an error: we expect to sometimes fetch an object and find it missing,
|
||||
// e.g. when probing for timeline indices.
|
||||
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
kind,
|
||||
AttemptOutcome::Ok,
|
||||
started_at,
|
||||
);
|
||||
Err(DownloadError::NotFound)
|
||||
}
|
||||
Err(e) => Err(DownloadError::Other(
|
||||
anyhow::Error::new(e).context("download s3 object"),
|
||||
)),
|
||||
Err(e) => {
|
||||
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
kind,
|
||||
AttemptOutcome::Err,
|
||||
started_at,
|
||||
);
|
||||
|
||||
Err(DownloadError::Other(
|
||||
anyhow::Error::new(e).context("download s3 object"),
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -366,6 +366,49 @@ impl MonotonicCounter<Lsn> for RecordLsn {
    }
}

/// Implements [`rand::distributions::uniform::UniformSampler`] so we can sample [`Lsn`]s.
///
/// This is used by the `pagebench` pageserver benchmarking tool.
pub struct LsnSampler(<u64 as rand::distributions::uniform::SampleUniform>::Sampler);

impl rand::distributions::uniform::SampleUniform for Lsn {
    type Sampler = LsnSampler;
}

impl rand::distributions::uniform::UniformSampler for LsnSampler {
    type X = Lsn;

    fn new<B1, B2>(low: B1, high: B2) -> Self
    where
        B1: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
        B2: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
    {
        Self(
            <u64 as rand::distributions::uniform::SampleUniform>::Sampler::new(
                low.borrow().0,
                high.borrow().0,
            ),
        )
    }

    fn new_inclusive<B1, B2>(low: B1, high: B2) -> Self
    where
        B1: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
        B2: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
    {
        Self(
            <u64 as rand::distributions::uniform::SampleUniform>::Sampler::new_inclusive(
                low.borrow().0,
                high.borrow().0,
            ),
        )
    }

    fn sample<R: rand::prelude::Rng + ?Sized>(&self, rng: &mut R) -> Self::X {
        Lsn(self.0.sample(rng))
    }
}

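With `SampleUniform` implemented, `Lsn` works directly with `rand`'s range sampling, which is how pagebench picks a random LSN inside a target range. A small sketch, assuming this module's `Lsn` is in scope:

```rust
use rand::Rng;

// Draw a uniformly distributed Lsn from a half-open range,
// e.g. random_lsn(Lsn(0x10)..Lsn(0x10_0000)).
fn random_lsn(range: std::ops::Range<Lsn>) -> Lsn {
    rand::thread_rng().gen_range(range)
}
```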
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::bin_ser::BeSer;
|
||||
|
||||
@@ -8,12 +8,12 @@ use std::ffi::CString;
|
||||
|
||||
use crate::bindings::uint32;
|
||||
use crate::bindings::walproposer_api;
|
||||
use crate::bindings::NeonWALReadResult;
|
||||
use crate::bindings::PGAsyncReadResult;
|
||||
use crate::bindings::PGAsyncWriteResult;
|
||||
use crate::bindings::Safekeeper;
|
||||
use crate::bindings::Size;
|
||||
use crate::bindings::StringInfoData;
|
||||
use crate::bindings::TimeLineID;
|
||||
use crate::bindings::TimestampTz;
|
||||
use crate::bindings::WalProposer;
|
||||
use crate::bindings::WalProposerConnStatusType;
|
||||
@@ -178,31 +178,11 @@ extern "C" fn conn_blocking_write(
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn recovery_download(
|
||||
sk: *mut Safekeeper,
|
||||
_timeline: TimeLineID,
|
||||
startpos: XLogRecPtr,
|
||||
endpos: XLogRecPtr,
|
||||
) -> bool {
|
||||
extern "C" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bool {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).recovery_download(&mut (*sk), startpos, endpos)
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::unnecessary_cast)]
|
||||
extern "C" fn wal_read(
|
||||
sk: *mut Safekeeper,
|
||||
buf: *mut ::std::os::raw::c_char,
|
||||
startptr: XLogRecPtr,
|
||||
count: Size,
|
||||
) {
|
||||
unsafe {
|
||||
let buf = std::slice::from_raw_parts_mut(buf as *mut u8, count);
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).wal_read(&mut (*sk), buf, startptr)
|
||||
(*api).recovery_download(&mut (*wp), &mut (*sk))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -214,11 +194,28 @@ extern "C" fn wal_reader_allocate(sk: *mut Safekeeper) {
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn free_event_set(wp: *mut WalProposer) {
|
||||
#[allow(clippy::unnecessary_cast)]
|
||||
extern "C" fn wal_read(
|
||||
sk: *mut Safekeeper,
|
||||
buf: *mut ::std::os::raw::c_char,
|
||||
startptr: XLogRecPtr,
|
||||
count: Size,
|
||||
_errmsg: *mut *mut ::std::os::raw::c_char,
|
||||
) -> NeonWALReadResult {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let buf = std::slice::from_raw_parts_mut(buf as *mut u8, count);
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).free_event_set(&mut (*wp));
|
||||
// TODO: errmsg is not forwarded
|
||||
(*api).wal_read(&mut (*sk), buf, startptr)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn wal_reader_events(sk: *mut Safekeeper) -> uint32 {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).wal_reader_events(&mut (*sk))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -238,6 +235,14 @@ extern "C" fn update_event_set(sk: *mut Safekeeper, events: uint32) {
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn active_state_update_event_set(sk: *mut Safekeeper) {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).active_state_update_event_set(&mut (*sk));
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn add_safekeeper_event_set(sk: *mut Safekeeper, events: uint32) {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
@@ -246,6 +251,14 @@ extern "C" fn add_safekeeper_event_set(sk: *mut Safekeeper, events: uint32) {
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn rm_safekeeper_event_set(sk: *mut Safekeeper) {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).rm_safekeeper_event_set(&mut (*sk));
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn wait_event_set(
|
||||
wp: *mut WalProposer,
|
||||
timeout: ::std::os::raw::c_long,
|
||||
@@ -313,14 +326,6 @@ extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, commit_lsn: XLog
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn confirm_wal_streamed(wp: *mut WalProposer, lsn: XLogRecPtr) {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).confirm_wal_streamed(&mut (*wp), lsn)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn log_internal(
|
||||
wp: *mut WalProposer,
|
||||
level: ::std::os::raw::c_int,
|
||||
@@ -335,14 +340,6 @@ extern "C" fn log_internal(
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn after_election(wp: *mut WalProposer) {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).after_election(&mut (*wp))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum Level {
|
||||
Debug5,
|
||||
@@ -401,20 +398,20 @@ pub(crate) fn create_api() -> walproposer_api {
|
||||
conn_async_write: Some(conn_async_write),
|
||||
conn_blocking_write: Some(conn_blocking_write),
|
||||
recovery_download: Some(recovery_download),
|
||||
wal_read: Some(wal_read),
|
||||
wal_reader_allocate: Some(wal_reader_allocate),
|
||||
free_event_set: Some(free_event_set),
|
||||
wal_read: Some(wal_read),
|
||||
wal_reader_events: Some(wal_reader_events),
|
||||
init_event_set: Some(init_event_set),
|
||||
update_event_set: Some(update_event_set),
|
||||
active_state_update_event_set: Some(active_state_update_event_set),
|
||||
add_safekeeper_event_set: Some(add_safekeeper_event_set),
|
||||
rm_safekeeper_event_set: Some(rm_safekeeper_event_set),
|
||||
wait_event_set: Some(wait_event_set),
|
||||
strong_random: Some(strong_random),
|
||||
get_redo_start_lsn: Some(get_redo_start_lsn),
|
||||
finish_sync_safekeepers: Some(finish_sync_safekeepers),
|
||||
process_safekeeper_feedback: Some(process_safekeeper_feedback),
|
||||
confirm_wal_streamed: Some(confirm_wal_streamed),
|
||||
log_internal: Some(log_internal),
|
||||
after_election: Some(after_election),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
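Every one of these `extern "C"` shims recovers the Rust implementation by casting `callback_data` back to `*mut Box<dyn ApiImpl>`. A self-contained sketch of that double-boxing pattern with illustrative names (not the actual walproposer types):

```rust
trait ApiLike {
    fn ping(&self) -> u32;
}

struct MyImpl;
impl ApiLike for MyImpl {
    fn ping(&self) -> u32 {
        42
    }
}

// What the Rust side stores into the C struct's `callback_data` field:
// the trait object is boxed twice so C only ever sees one thin pointer.
fn into_callback_data(api: Box<dyn ApiLike>) -> *mut std::ffi::c_void {
    Box::into_raw(Box::new(api)) as *mut std::ffi::c_void
}

// What each extern "C" shim does to call back into Rust.
unsafe fn call_ping(callback_data: *mut std::ffi::c_void) -> u32 {
    let api = callback_data as *mut Box<dyn ApiLike>;
    (*api).ping()
}

fn main() {
    let data = into_callback_data(Box::new(MyImpl));
    assert_eq!(unsafe { call_ping(data) }, 42);
    // Re-box to release the allocation, mirroring what a Drop impl would do.
    unsafe { drop(Box::from_raw(data as *mut Box<dyn ApiLike>)) };
}
```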
@@ -6,8 +6,8 @@ use utils::id::TenantTimelineId;
|
||||
use crate::{
|
||||
api_bindings::{create_api, take_vec_u8, Level},
|
||||
bindings::{
|
||||
Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate, WalProposerFree,
|
||||
WalProposerStart,
|
||||
NeonWALReadResult, Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate,
|
||||
WalProposerFree, WalProposerStart,
|
||||
},
|
||||
};
|
||||
|
||||
@@ -86,19 +86,19 @@ pub trait ApiImpl {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn recovery_download(&self, _sk: &mut Safekeeper, _startpos: u64, _endpos: u64) -> bool {
|
||||
fn recovery_download(&self, _wp: &mut WalProposer, _sk: &mut Safekeeper) -> bool {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn wal_read(&self, _sk: &mut Safekeeper, _buf: &mut [u8], _startpos: u64) {
|
||||
fn wal_reader_allocate(&self, _sk: &mut Safekeeper) -> NeonWALReadResult {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn wal_reader_allocate(&self, _sk: &mut Safekeeper) {
|
||||
fn wal_read(&self, _sk: &mut Safekeeper, _buf: &mut [u8], _startpos: u64) -> NeonWALReadResult {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn free_event_set(&self, _wp: &mut WalProposer) {
|
||||
fn wal_reader_events(&self, _sk: &mut Safekeeper) -> u32 {
|
||||
todo!()
|
||||
}
|
||||
|
||||
@@ -110,10 +110,18 @@ pub trait ApiImpl {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn active_state_update_event_set(&self, _sk: &mut Safekeeper) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn add_safekeeper_event_set(&self, _sk: &mut Safekeeper, _events_mask: u32) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn rm_safekeeper_event_set(&self, _sk: &mut Safekeeper) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn wait_event_set(&self, _wp: &mut WalProposer, _timeout_millis: i64) -> WaitResult {
|
||||
todo!()
|
||||
}
|
||||
@@ -134,10 +142,6 @@ pub trait ApiImpl {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn confirm_wal_streamed(&self, _wp: &mut WalProposer, _lsn: u64) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn log_internal(&self, _wp: &mut WalProposer, _level: Level, _msg: &str) {
|
||||
todo!()
|
||||
}
|
||||
@@ -240,6 +244,7 @@ impl Drop for Wrapper {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use core::panic;
|
||||
use std::{
|
||||
cell::Cell,
|
||||
sync::{atomic::AtomicUsize, mpsc::sync_channel},
|
||||
@@ -247,7 +252,7 @@ mod tests {
|
||||
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
use crate::{api_bindings::Level, walproposer::Wrapper};
|
||||
use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper};
|
||||
|
||||
use super::ApiImpl;
|
||||
|
||||
@@ -355,12 +360,17 @@ mod tests {
|
||||
true
|
||||
}
|
||||
|
||||
fn wal_reader_allocate(&self, _: &mut crate::bindings::Safekeeper) {
|
||||
println!("wal_reader_allocate")
|
||||
fn recovery_download(
|
||||
&self,
|
||||
_wp: &mut crate::bindings::WalProposer,
|
||||
_sk: &mut crate::bindings::Safekeeper,
|
||||
) -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
fn free_event_set(&self, _: &mut crate::bindings::WalProposer) {
|
||||
println!("free_event_set")
|
||||
fn wal_reader_allocate(&self, _: &mut crate::bindings::Safekeeper) -> NeonWALReadResult {
|
||||
println!("wal_reader_allocate");
|
||||
crate::bindings::NeonWALReadResult_NEON_WALREAD_SUCCESS
|
||||
}
|
||||
|
||||
fn init_event_set(&self, _: &mut crate::bindings::WalProposer) {
|
||||
@@ -383,6 +393,13 @@ mod tests {
|
||||
self.wait_events.set(WaitEventsData { sk, event_mask });
|
||||
}
|
||||
|
||||
fn rm_safekeeper_event_set(&self, sk: &mut crate::bindings::Safekeeper) {
|
||||
println!(
|
||||
"rm_safekeeper_event_set, sk={:?}",
|
||||
sk as *mut crate::bindings::Safekeeper
|
||||
);
|
||||
}
|
||||
|
||||
fn wait_event_set(
|
||||
&self,
|
||||
_: &mut crate::bindings::WalProposer,
|
||||
|
||||
@@ -5,6 +5,8 @@ use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
};
|
||||
|
||||
pub mod util;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Client {
|
||||
mgmt_api_endpoint: String,
|
||||
|
||||
pageserver/client/src/mgmt_api/util.rs (new file, 49 lines)
@@ -0,0 +1,49 @@
|
||||
//! Helpers to do common higher-level tasks with the [`Client`].
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use tokio::task::JoinSet;
|
||||
use utils::id::{TenantId, TenantTimelineId};
|
||||
|
||||
use super::Client;
|
||||
|
||||
/// Retrieve a list of all of the pageserver's timelines.
|
||||
///
|
||||
/// Fails if there are sharded tenants present on the pageserver.
|
||||
pub async fn get_pageserver_tenant_timelines_unsharded(
|
||||
api_client: &Arc<Client>,
|
||||
) -> anyhow::Result<Vec<TenantTimelineId>> {
|
||||
let mut timelines: Vec<TenantTimelineId> = Vec::new();
|
||||
let mut tenants: Vec<TenantId> = Vec::new();
|
||||
for ti in api_client.list_tenants().await? {
|
||||
if !ti.id.is_unsharded() {
|
||||
anyhow::bail!(
|
||||
"only unsharded tenants are supported at this time: {}",
|
||||
ti.id
|
||||
);
|
||||
}
|
||||
tenants.push(ti.id.tenant_id)
|
||||
}
|
||||
let mut js = JoinSet::new();
|
||||
for tenant_id in tenants {
|
||||
js.spawn({
|
||||
let mgmt_api_client = Arc::clone(api_client);
|
||||
async move {
|
||||
(
|
||||
tenant_id,
|
||||
mgmt_api_client.tenant_details(tenant_id).await.unwrap(),
|
||||
)
|
||||
}
|
||||
});
|
||||
}
|
||||
while let Some(res) = js.join_next().await {
|
||||
let (tenant_id, details) = res.unwrap();
|
||||
for timeline_id in details.timelines {
|
||||
timelines.push(TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
});
|
||||
}
|
||||
}
|
||||
Ok(timelines)
|
||||
}
|
||||
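A sketch of calling this helper from a tool such as pagebench; the endpoint, JWT handling, and the exact `Client::new` signature are assumed from the code shown in this diff:

```rust
use std::sync::Arc;

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let jwt: Option<String> = None;
    let client = Arc::new(pageserver_client::mgmt_api::Client::new(
        "http://localhost:9898".to_string(),
        jwt.as_deref(),
    ));
    let timelines =
        pageserver_client::mgmt_api::util::get_pageserver_tenant_timelines_unsharded(&client)
            .await?;
    println!("found {} timelines", timelines.len());
    Ok(())
}
```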
pageserver/pagebench/Cargo.toml (new file, 26 lines)
@@ -0,0 +1,26 @@
|
||||
[package]
|
||||
name = "pagebench"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
clap.workspace = true
|
||||
futures.workspace = true
|
||||
hdrhistogram.workspace = true
|
||||
humantime.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
rand.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
tracing.workspace = true
|
||||
tokio.workspace = true
|
||||
|
||||
pageserver = { path = ".." }
|
||||
pageserver_client.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
utils = { path = "../../libs/utils/" }
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
pageserver/pagebench/src/cmd/basebackup.rs (new file, 272 lines)
@@ -0,0 +1,272 @@
|
||||
use anyhow::Context;
|
||||
use pageserver_client::page_service::BasebackupRequest;
|
||||
|
||||
use utils::id::TenantTimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use rand::prelude::*;
|
||||
use tokio::sync::Barrier;
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::{debug, info, instrument};
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::ops::Range;
|
||||
use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::Instant;
|
||||
|
||||
use crate::util::tokio_thread_local_stats::AllThreadLocalStats;
|
||||
use crate::util::{request_stats, tokio_thread_local_stats};
|
||||
|
||||
/// basebackup@LatestLSN
|
||||
#[derive(clap::Parser)]
|
||||
pub(crate) struct Args {
|
||||
#[clap(long, default_value = "http://localhost:9898")]
|
||||
mgmt_api_endpoint: String,
|
||||
#[clap(long, default_value = "localhost:64000")]
|
||||
page_service_host_port: String,
|
||||
#[clap(long)]
|
||||
pageserver_jwt: Option<String>,
|
||||
#[clap(long, default_value = "1")]
|
||||
num_clients: NonZeroUsize,
|
||||
#[clap(long, default_value = "1.0")]
|
||||
gzip_probability: f64,
|
||||
#[clap(long)]
|
||||
runtime: Option<humantime::Duration>,
|
||||
#[clap(long)]
|
||||
limit_to_first_n_targets: Option<usize>,
|
||||
targets: Option<Vec<TenantTimelineId>>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
struct LiveStats {
|
||||
completed_requests: AtomicU64,
|
||||
}
|
||||
|
||||
impl LiveStats {
|
||||
fn inc(&self) {
|
||||
self.completed_requests.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
struct Target {
|
||||
timeline: TenantTimelineId,
|
||||
lsn_range: Option<Range<Lsn>>,
|
||||
}
|
||||
|
||||
#[derive(serde::Serialize)]
|
||||
struct Output {
|
||||
total: request_stats::Output,
|
||||
}
|
||||
|
||||
tokio_thread_local_stats::declare!(STATS: request_stats::Stats);
|
||||
|
||||
pub(crate) fn main(args: Args) -> anyhow::Result<()> {
|
||||
tokio_thread_local_stats::main!(STATS, move |thread_local_stats| {
|
||||
main_impl(args, thread_local_stats)
|
||||
})
|
||||
}
|
||||
|
||||
async fn main_impl(
|
||||
args: Args,
|
||||
all_thread_local_stats: AllThreadLocalStats<request_stats::Stats>,
|
||||
) -> anyhow::Result<()> {
|
||||
let args: &'static Args = Box::leak(Box::new(args));
|
||||
|
||||
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
|
||||
args.mgmt_api_endpoint.clone(),
|
||||
args.pageserver_jwt.as_deref(),
|
||||
));
|
||||
|
||||
// discover targets
|
||||
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
|
||||
&mgmt_api_client,
|
||||
crate::util::cli::targets::Spec {
|
||||
limit_to_first_n_targets: args.limit_to_first_n_targets,
|
||||
targets: args.targets.clone(),
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
let mut js = JoinSet::new();
|
||||
for timeline in &timelines {
|
||||
js.spawn({
|
||||
let timeline = *timeline;
|
||||
// FIXME: this triggers initial logical size calculation
|
||||
// https://github.com/neondatabase/neon/issues/6168
|
||||
let info = mgmt_api_client
|
||||
.timeline_info(timeline.tenant_id, timeline.timeline_id)
|
||||
.await
|
||||
.unwrap();
|
||||
async move {
|
||||
anyhow::Ok(Target {
|
||||
timeline,
|
||||
// TODO: support lsn_range != latest LSN
|
||||
lsn_range: Some(info.last_record_lsn..(info.last_record_lsn + 1)),
|
||||
})
|
||||
}
|
||||
});
|
||||
}
|
||||
let mut all_targets: Vec<Target> = Vec::new();
|
||||
while let Some(res) = js.join_next().await {
|
||||
all_targets.push(res.unwrap().unwrap());
|
||||
}
|
||||
|
||||
let live_stats = Arc::new(LiveStats::default());
|
||||
|
||||
let num_client_tasks = timelines.len();
|
||||
let num_live_stats_dump = 1;
|
||||
let num_work_sender_tasks = 1;
|
||||
|
||||
let start_work_barrier = Arc::new(tokio::sync::Barrier::new(
|
||||
num_client_tasks + num_live_stats_dump + num_work_sender_tasks,
|
||||
));
|
||||
let all_work_done_barrier = Arc::new(tokio::sync::Barrier::new(num_client_tasks));
|
||||
|
||||
tokio::spawn({
|
||||
let stats = Arc::clone(&live_stats);
|
||||
let start_work_barrier = Arc::clone(&start_work_barrier);
|
||||
async move {
|
||||
start_work_barrier.wait().await;
|
||||
loop {
|
||||
let start = std::time::Instant::now();
|
||||
tokio::time::sleep(std::time::Duration::from_secs(1)).await;
|
||||
let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed);
|
||||
let elapsed = start.elapsed();
|
||||
info!(
|
||||
"RPS: {:.0}",
|
||||
completed_requests as f64 / elapsed.as_secs_f64()
|
||||
);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
let mut work_senders = HashMap::new();
|
||||
let mut tasks = Vec::new();
|
||||
for tl in &timelines {
|
||||
let (sender, receiver) = tokio::sync::mpsc::channel(1); // TODO: not sure what the implications of this are
|
||||
work_senders.insert(tl, sender);
|
||||
tasks.push(tokio::spawn(client(
|
||||
args,
|
||||
*tl,
|
||||
Arc::clone(&start_work_barrier),
|
||||
receiver,
|
||||
Arc::clone(&all_work_done_barrier),
|
||||
Arc::clone(&live_stats),
|
||||
)));
|
||||
}
|
||||
|
||||
let work_sender = async move {
|
||||
start_work_barrier.wait().await;
|
||||
loop {
|
||||
let (timeline, work) = {
|
||||
let mut rng = rand::thread_rng();
|
||||
let target = all_targets.choose(&mut rng).unwrap();
|
||||
let lsn = target.lsn_range.clone().map(|r| rng.gen_range(r));
|
||||
(
|
||||
target.timeline,
|
||||
Work {
|
||||
lsn,
|
||||
gzip: rng.gen_bool(args.gzip_probability),
|
||||
},
|
||||
)
|
||||
};
|
||||
let sender = work_senders.get(&timeline).unwrap();
|
||||
// TODO: what if this blocks?
|
||||
sender.send(work).await.ok().unwrap();
|
||||
}
|
||||
};
|
||||
|
||||
if let Some(runtime) = args.runtime {
|
||||
match tokio::time::timeout(runtime.into(), work_sender).await {
|
||||
Ok(()) => unreachable!("work sender never terminates"),
|
||||
Err(_timeout) => {
|
||||
// this implicitly drops the work_senders, making all the clients exit
|
||||
}
|
||||
}
|
||||
} else {
|
||||
work_sender.await;
|
||||
unreachable!("work sender never terminates");
|
||||
}
|
||||
|
||||
for t in tasks {
|
||||
t.await.unwrap();
|
||||
}
|
||||
|
||||
let output = Output {
|
||||
total: {
|
||||
let mut agg_stats = request_stats::Stats::new();
|
||||
for stats in all_thread_local_stats.lock().unwrap().iter() {
|
||||
let stats = stats.lock().unwrap();
|
||||
agg_stats.add(&stats);
|
||||
}
|
||||
agg_stats.output()
|
||||
},
|
||||
};
|
||||
|
||||
let output = serde_json::to_string_pretty(&output).unwrap();
|
||||
println!("{output}");
|
||||
|
||||
anyhow::Ok(())
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
struct Work {
|
||||
lsn: Option<Lsn>,
|
||||
gzip: bool,
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
async fn client(
|
||||
args: &'static Args,
|
||||
timeline: TenantTimelineId,
|
||||
start_work_barrier: Arc<Barrier>,
|
||||
mut work: tokio::sync::mpsc::Receiver<Work>,
|
||||
all_work_done_barrier: Arc<Barrier>,
|
||||
live_stats: Arc<LiveStats>,
|
||||
) {
|
||||
start_work_barrier.wait().await;
|
||||
|
||||
let client = pageserver_client::page_service::Client::new(crate::util::connstring::connstring(
|
||||
&args.page_service_host_port,
|
||||
args.pageserver_jwt.as_deref(),
|
||||
))
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
while let Some(Work { lsn, gzip }) = work.recv().await {
|
||||
let start = Instant::now();
|
||||
let copy_out_stream = client
|
||||
.basebackup(&BasebackupRequest {
|
||||
tenant_id: timeline.tenant_id,
|
||||
timeline_id: timeline.timeline_id,
|
||||
lsn,
|
||||
gzip,
|
||||
})
|
||||
.await
|
||||
.with_context(|| format!("start basebackup for {timeline}"))
|
||||
.unwrap();
|
||||
|
||||
use futures::StreamExt;
|
||||
let size = Arc::new(AtomicUsize::new(0));
|
||||
copy_out_stream
|
||||
.for_each({
|
||||
|r| {
|
||||
let size = Arc::clone(&size);
|
||||
async move {
|
||||
let size = Arc::clone(&size);
|
||||
size.fetch_add(r.unwrap().len(), Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
})
|
||||
.await;
|
||||
debug!("basebackup size is {} bytes", size.load(Ordering::Relaxed));
|
||||
let elapsed = start.elapsed();
|
||||
live_stats.inc();
|
||||
STATS.with(|stats| {
|
||||
stats.borrow().lock().unwrap().observe(elapsed).unwrap();
|
||||
});
|
||||
}
|
||||
|
||||
all_work_done_barrier.wait().await;
|
||||
}
|
||||
pageserver/pagebench/src/cmd/getpage_latest_lsn.rs (new file, 335 lines)
@@ -0,0 +1,335 @@
|
||||
use anyhow::Context;
|
||||
use futures::future::join_all;
|
||||
use pageserver::pgdatadir_mapping::key_to_rel_block;
|
||||
use pageserver::repository;
|
||||
use pageserver_api::key::is_rel_block_key;
|
||||
use pageserver_client::page_service::RelTagBlockNo;
|
||||
|
||||
use utils::id::TenantTimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use rand::prelude::*;
|
||||
use tokio::sync::Barrier;
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::{info, instrument};
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::future::Future;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::pin::Pin;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use crate::util::tokio_thread_local_stats::AllThreadLocalStats;
|
||||
use crate::util::{request_stats, tokio_thread_local_stats};
|
||||
|
||||
/// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace.
|
||||
#[derive(clap::Parser)]
|
||||
pub(crate) struct Args {
|
||||
#[clap(long, default_value = "http://localhost:9898")]
|
||||
mgmt_api_endpoint: String,
|
||||
#[clap(long, default_value = "postgres://postgres@localhost:64000")]
|
||||
page_service_connstring: String,
|
||||
#[clap(long)]
|
||||
pageserver_jwt: Option<String>,
|
||||
#[clap(long, default_value = "1")]
|
||||
num_clients: NonZeroUsize,
|
||||
#[clap(long)]
|
||||
runtime: Option<humantime::Duration>,
|
||||
#[clap(long)]
|
||||
per_target_rate_limit: Option<usize>,
|
||||
#[clap(long)]
|
||||
limit_to_first_n_targets: Option<usize>,
|
||||
targets: Option<Vec<TenantTimelineId>>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
struct LiveStats {
|
||||
completed_requests: AtomicU64,
|
||||
}
|
||||
|
||||
impl LiveStats {
|
||||
fn inc(&self) {
|
||||
self.completed_requests.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct KeyRange {
|
||||
timeline: TenantTimelineId,
|
||||
timeline_lsn: Lsn,
|
||||
start: i128,
|
||||
end: i128,
|
||||
}
|
||||
|
||||
impl KeyRange {
|
||||
fn len(&self) -> i128 {
|
||||
self.end - self.start
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(serde::Serialize)]
|
||||
struct Output {
|
||||
total: request_stats::Output,
|
||||
}
|
||||
|
||||
tokio_thread_local_stats::declare!(STATS: request_stats::Stats);
|
||||
|
||||
pub(crate) fn main(args: Args) -> anyhow::Result<()> {
|
||||
tokio_thread_local_stats::main!(STATS, move |thread_local_stats| {
|
||||
main_impl(args, thread_local_stats)
|
||||
})
|
||||
}
|
||||
|
||||
async fn main_impl(
|
||||
args: Args,
|
||||
all_thread_local_stats: AllThreadLocalStats<request_stats::Stats>,
|
||||
) -> anyhow::Result<()> {
|
||||
let args: &'static Args = Box::leak(Box::new(args));
|
||||
|
||||
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
|
||||
args.mgmt_api_endpoint.clone(),
|
||||
args.pageserver_jwt.as_deref(),
|
||||
));
|
||||
|
||||
// discover targets
|
||||
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
|
||||
&mgmt_api_client,
|
||||
crate::util::cli::targets::Spec {
|
||||
limit_to_first_n_targets: args.limit_to_first_n_targets,
|
||||
targets: args.targets.clone(),
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
let mut js = JoinSet::new();
|
||||
for timeline in &timelines {
|
||||
js.spawn({
|
||||
let mgmt_api_client = Arc::clone(&mgmt_api_client);
|
||||
let timeline = *timeline;
|
||||
async move {
|
||||
let partitioning = mgmt_api_client
|
||||
.keyspace(timeline.tenant_id, timeline.timeline_id)
|
||||
.await?;
|
||||
let lsn = partitioning.at_lsn;
|
||||
|
||||
let ranges = partitioning
|
||||
.keys
|
||||
.ranges
|
||||
.iter()
|
||||
.filter_map(|r| {
|
||||
let start = r.start;
|
||||
let end = r.end;
|
||||
// filter out non-relblock keys
|
||||
match (is_rel_block_key(&start), is_rel_block_key(&end)) {
|
||||
(true, true) => Some(KeyRange {
|
||||
timeline,
|
||||
timeline_lsn: lsn,
|
||||
start: start.to_i128(),
|
||||
end: end.to_i128(),
|
||||
}),
|
||||
(true, false) | (false, true) => {
|
||||
unimplemented!("split up range")
|
||||
}
|
||||
(false, false) => None,
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
anyhow::Ok(ranges)
|
||||
}
|
||||
});
|
||||
}
|
||||
let mut all_ranges: Vec<KeyRange> = Vec::new();
|
||||
while let Some(res) = js.join_next().await {
|
||||
all_ranges.extend(res.unwrap().unwrap());
|
||||
}
|
||||
|
||||
let live_stats = Arc::new(LiveStats::default());
|
||||
|
||||
let num_client_tasks = timelines.len();
|
||||
let num_live_stats_dump = 1;
|
||||
let num_work_sender_tasks = 1;
|
||||
|
||||
let start_work_barrier = Arc::new(tokio::sync::Barrier::new(
|
||||
num_client_tasks + num_live_stats_dump + num_work_sender_tasks,
|
||||
));
|
||||
let all_work_done_barrier = Arc::new(tokio::sync::Barrier::new(num_client_tasks));
|
||||
|
||||
tokio::spawn({
|
||||
let stats = Arc::clone(&live_stats);
|
||||
let start_work_barrier = Arc::clone(&start_work_barrier);
|
||||
async move {
|
||||
start_work_barrier.wait().await;
|
||||
loop {
|
||||
let start = std::time::Instant::now();
|
||||
tokio::time::sleep(std::time::Duration::from_secs(1)).await;
|
||||
let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed);
|
||||
let elapsed = start.elapsed();
|
||||
info!(
|
||||
"RPS: {:.0}",
|
||||
completed_requests as f64 / elapsed.as_secs_f64()
|
||||
);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
let mut work_senders = HashMap::new();
|
||||
let mut tasks = Vec::new();
|
||||
for tl in &timelines {
|
||||
let (sender, receiver) = tokio::sync::mpsc::channel(10); // TODO: not sure what the implications of this are
|
||||
work_senders.insert(tl, sender);
|
||||
tasks.push(tokio::spawn(client(
|
||||
args,
|
||||
*tl,
|
||||
Arc::clone(&start_work_barrier),
|
||||
receiver,
|
||||
Arc::clone(&all_work_done_barrier),
|
||||
Arc::clone(&live_stats),
|
||||
)));
|
||||
}
|
||||
|
||||
let work_sender: Pin<Box<dyn Send + Future<Output = ()>>> = match args.per_target_rate_limit {
|
||||
None => Box::pin(async move {
|
||||
let weights = rand::distributions::weighted::WeightedIndex::new(
|
||||
all_ranges.iter().map(|v| v.len()),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
start_work_barrier.wait().await;
|
||||
|
||||
loop {
|
||||
let (range, key) = {
|
||||
let mut rng = rand::thread_rng();
|
||||
let r = &all_ranges[weights.sample(&mut rng)];
|
||||
let key: i128 = rng.gen_range(r.start..r.end);
|
||||
let key = repository::Key::from_i128(key);
|
||||
let (rel_tag, block_no) =
|
||||
key_to_rel_block(key).expect("we filter non-rel-block keys out above");
|
||||
(r, RelTagBlockNo { rel_tag, block_no })
|
||||
};
|
||||
let sender = work_senders.get(&range.timeline).unwrap();
|
||||
// TODO: what if this blocks?
|
||||
sender.send((key, range.timeline_lsn)).await.ok().unwrap();
|
||||
}
|
||||
}),
|
||||
Some(rps_limit) => Box::pin(async move {
|
||||
let period = Duration::from_secs_f64(1.0 / (rps_limit as f64));
|
||||
|
||||
let make_timeline_task: &dyn Fn(
|
||||
TenantTimelineId,
|
||||
)
|
||||
-> Pin<Box<dyn Send + Future<Output = ()>>> = &|timeline| {
|
||||
let sender = work_senders.get(&timeline).unwrap();
|
||||
let ranges: Vec<KeyRange> = all_ranges
|
||||
.iter()
|
||||
.filter(|r| r.timeline == timeline)
|
||||
.cloned()
|
||||
.collect();
|
||||
let weights = rand::distributions::weighted::WeightedIndex::new(
|
||||
ranges.iter().map(|v| v.len()),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
Box::pin(async move {
|
||||
let mut ticker = tokio::time::interval(period);
|
||||
ticker.set_missed_tick_behavior(
|
||||
/* TODO review this choice */
|
||||
tokio::time::MissedTickBehavior::Burst,
|
||||
);
|
||||
loop {
|
||||
ticker.tick().await;
|
||||
let (range, key) = {
|
||||
let mut rng = rand::thread_rng();
|
||||
let r = &ranges[weights.sample(&mut rng)];
|
||||
let key: i128 = rng.gen_range(r.start..r.end);
|
||||
let key = repository::Key::from_i128(key);
|
||||
let (rel_tag, block_no) = key_to_rel_block(key)
|
||||
.expect("we filter non-rel-block keys out above");
|
||||
(r, RelTagBlockNo { rel_tag, block_no })
|
||||
};
|
||||
sender.send((key, range.timeline_lsn)).await.ok().unwrap();
|
||||
}
|
||||
})
|
||||
};
|
||||
|
||||
let tasks: Vec<_> = work_senders
|
||||
.keys()
|
||||
.map(|tl| make_timeline_task(**tl))
|
||||
.collect();
|
||||
|
||||
start_work_barrier.wait().await;
|
||||
|
||||
join_all(tasks).await;
|
||||
}),
|
||||
};
|
||||
|
||||
if let Some(runtime) = args.runtime {
|
||||
match tokio::time::timeout(runtime.into(), work_sender).await {
|
||||
Ok(()) => unreachable!("work sender never terminates"),
|
||||
Err(_timeout) => {
|
||||
// this implicitly drops the work_senders, making all the clients exit
|
||||
}
|
||||
}
|
||||
} else {
|
||||
work_sender.await;
|
||||
unreachable!("work sender never terminates");
|
||||
}
|
||||
|
||||
for t in tasks {
|
||||
t.await.unwrap();
|
||||
}
|
||||
|
||||
let output = Output {
|
||||
total: {
|
||||
let mut agg_stats = request_stats::Stats::new();
|
||||
for stats in all_thread_local_stats.lock().unwrap().iter() {
|
||||
let stats = stats.lock().unwrap();
|
||||
agg_stats.add(&stats);
|
||||
}
|
||||
agg_stats.output()
|
||||
},
|
||||
};
|
||||
|
||||
let output = serde_json::to_string_pretty(&output).unwrap();
|
||||
println!("{output}");
|
||||
|
||||
anyhow::Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
async fn client(
|
||||
args: &'static Args,
|
||||
timeline: TenantTimelineId,
|
||||
start_work_barrier: Arc<Barrier>,
|
||||
mut work: tokio::sync::mpsc::Receiver<(RelTagBlockNo, Lsn)>,
|
||||
all_work_done_barrier: Arc<Barrier>,
|
||||
live_stats: Arc<LiveStats>,
|
||||
) {
|
||||
start_work_barrier.wait().await;
|
||||
|
||||
let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
|
||||
.await
|
||||
.unwrap();
|
||||
let mut client = client
|
||||
.pagestream(timeline.tenant_id, timeline.timeline_id)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
while let Some((key, lsn)) = work.recv().await {
|
||||
let start = Instant::now();
|
||||
client
|
||||
.getpage(key, lsn)
|
||||
.await
|
||||
.with_context(|| format!("getpage for {timeline}"))
|
||||
.unwrap();
|
||||
let elapsed = start.elapsed();
|
||||
live_stats.inc();
|
||||
STATS.with(|stats| {
|
||||
stats.borrow().lock().unwrap().observe(elapsed).unwrap();
|
||||
});
|
||||
}
|
||||
|
||||
all_work_done_barrier.wait().await;
|
||||
}
|
||||
pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs (new file, 85 lines)
@@ -0,0 +1,85 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use humantime::Duration;
|
||||
use tokio::task::JoinSet;
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
#[derive(clap::Parser)]
|
||||
pub(crate) struct Args {
|
||||
#[clap(long, default_value = "http://localhost:9898")]
|
||||
mgmt_api_endpoint: String,
|
||||
#[clap(long, default_value = "localhost:64000")]
|
||||
page_service_host_port: String,
|
||||
#[clap(long)]
|
||||
pageserver_jwt: Option<String>,
|
||||
#[clap(
|
||||
long,
|
||||
help = "if specified, poll mgmt api to check whether init logical size calculation has completed"
|
||||
)]
|
||||
poll_for_completion: Option<Duration>,
|
||||
#[clap(long)]
|
||||
limit_to_first_n_targets: Option<usize>,
|
||||
targets: Option<Vec<TenantTimelineId>>,
|
||||
}
|
||||
|
||||
pub(crate) fn main(args: Args) -> anyhow::Result<()> {
|
||||
let rt = tokio::runtime::Builder::new_multi_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let main_task = rt.spawn(main_impl(args));
|
||||
rt.block_on(main_task).unwrap()
|
||||
}
|
||||
|
||||
async fn main_impl(args: Args) -> anyhow::Result<()> {
|
||||
let args: &'static Args = Box::leak(Box::new(args));
|
||||
|
||||
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
|
||||
args.mgmt_api_endpoint.clone(),
|
||||
args.pageserver_jwt.as_deref(),
|
||||
));
|
||||
|
||||
// discover targets
|
||||
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
|
||||
&mgmt_api_client,
|
||||
crate::util::cli::targets::Spec {
|
||||
limit_to_first_n_targets: args.limit_to_first_n_targets,
|
||||
targets: args.targets.clone(),
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
// kick it off
|
||||
|
||||
let mut js = JoinSet::new();
|
||||
for tl in timelines {
|
||||
let mgmt_api_client = Arc::clone(&mgmt_api_client);
|
||||
js.spawn(async move {
|
||||
// TODO: API to explicitly trigger initial logical size computation.
|
||||
// Should probably also avoid making it a side effect of timeline details to trigger initial logical size calculation.
|
||||
// => https://github.com/neondatabase/neon/issues/6168
|
||||
let info = mgmt_api_client
|
||||
.timeline_info(tl.tenant_id, tl.timeline_id)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
if let Some(period) = args.poll_for_completion {
|
||||
let mut ticker = tokio::time::interval(period.into());
|
||||
ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);
|
||||
let mut info = info;
|
||||
while !info.current_logical_size_is_accurate {
|
||||
ticker.tick().await;
|
||||
info = mgmt_api_client
|
||||
.timeline_info(tl.tenant_id, tl.timeline_id)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
while let Some(res) = js.join_next().await {
|
||||
let _: () = res.unwrap();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
pageserver/pagebench/src/main.rs (new file, 48 lines)
@@ -0,0 +1,48 @@
|
||||
use clap::Parser;
|
||||
use utils::logging;
|
||||
|
||||
/// Re-usable pieces of code that aren't CLI-specific.
|
||||
mod util {
|
||||
pub(crate) mod connstring;
|
||||
pub(crate) mod request_stats;
|
||||
#[macro_use]
|
||||
pub(crate) mod tokio_thread_local_stats;
|
||||
/// Re-usable pieces of CLI-specific code.
|
||||
pub(crate) mod cli {
|
||||
pub(crate) mod targets;
|
||||
}
|
||||
}
|
||||
|
||||
/// The pagebench CLI sub-commands, dispatched in [`main`] below.
|
||||
mod cmd {
|
||||
pub(super) mod basebackup;
|
||||
pub(super) mod getpage_latest_lsn;
|
||||
pub(super) mod trigger_initial_size_calculation;
|
||||
}
|
||||
|
||||
/// Component-level performance test for pageserver.
|
||||
#[derive(clap::Parser)]
|
||||
enum Args {
|
||||
Basebackup(cmd::basebackup::Args),
|
||||
GetPageLatestLsn(cmd::getpage_latest_lsn::Args),
|
||||
TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
|
||||
}
|
||||
|
||||
fn main() {
|
||||
logging::init(
|
||||
logging::LogFormat::Plain,
|
||||
logging::TracingErrorLayerEnablement::Disabled,
|
||||
logging::Output::Stderr,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let args = Args::parse();
|
||||
match args {
|
||||
Args::Basebackup(args) => cmd::basebackup::main(args),
|
||||
Args::GetPageLatestLsn(args) => cmd::getpage_latest_lsn::main(args),
|
||||
Args::TriggerInitialSizeCalculation(args) => {
|
||||
cmd::trigger_initial_size_calculation::main(args)
|
||||
}
|
||||
}
|
||||
.unwrap()
|
||||
}
|
||||
pageserver/pagebench/src/util/cli/targets.rs (new file, 34 lines)
@@ -0,0 +1,34 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use pageserver_client::mgmt_api;
|
||||
use tracing::info;
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
pub(crate) struct Spec {
|
||||
pub(crate) limit_to_first_n_targets: Option<usize>,
|
||||
pub(crate) targets: Option<Vec<TenantTimelineId>>,
|
||||
}
|
||||
|
||||
pub(crate) async fn discover(
|
||||
api_client: &Arc<mgmt_api::Client>,
|
||||
spec: Spec,
|
||||
) -> anyhow::Result<Vec<TenantTimelineId>> {
|
||||
let mut timelines = if let Some(targets) = spec.targets {
|
||||
targets
|
||||
} else {
|
||||
mgmt_api::util::get_pageserver_tenant_timelines_unsharded(api_client).await?
|
||||
};
|
||||
|
||||
if let Some(limit) = spec.limit_to_first_n_targets {
|
||||
timelines.sort(); // for determinism
|
||||
timelines.truncate(limit);
|
||||
if timelines.len() < limit {
|
||||
anyhow::bail!("pageserver has less than limit_to_first_n_targets={limit} tenants");
|
||||
}
|
||||
}
|
||||
|
||||
info!("timelines:\n{:?}", timelines);
|
||||
info!("number of timelines:\n{:?}", timelines.len());
|
||||
|
||||
Ok(timelines)
|
||||
}
|
||||
pageserver/pagebench/src/util/connstring.rs (new file, 8 lines)
@@ -0,0 +1,8 @@
pub(crate) fn connstring(host_port: &str, jwt: Option<&str>) -> String {
    let colon_and_jwt = if let Some(jwt) = jwt {
        format!(":{jwt}") // TODO: urlescape
    } else {
        String::new()
    };
    format!("postgres://postgres{colon_and_jwt}@{host_port}")
}
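For reference, the outputs this produces, derived directly from the format strings above:

```rust
assert_eq!(
    connstring("localhost:64000", None),
    "postgres://postgres@localhost:64000"
);
assert_eq!(
    connstring("localhost:64000", Some("jwt123")),
    "postgres://postgres:jwt123@localhost:64000"
);
```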
pageserver/pagebench/src/util/request_stats.rs (new file, 88 lines)
@@ -0,0 +1,88 @@
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::Context;
|
||||
|
||||
pub(crate) struct Stats {
|
||||
latency_histo: hdrhistogram::Histogram<u64>,
|
||||
}
|
||||
|
||||
impl Stats {
|
||||
pub(crate) fn new() -> Self {
|
||||
Self {
|
||||
// Initialize with fixed bounds so that we panic at runtime instead of resizing the histogram,
|
||||
// which would skew the benchmark results.
|
||||
latency_histo: hdrhistogram::Histogram::new_with_bounds(1, 1_000_000_000, 3).unwrap(),
|
||||
}
|
||||
}
|
||||
pub(crate) fn observe(&mut self, latency: Duration) -> anyhow::Result<()> {
|
||||
let micros: u64 = latency
|
||||
.as_micros()
|
||||
.try_into()
|
||||
.context("latency greater than u64")?;
|
||||
self.latency_histo
|
||||
.record(micros)
|
||||
.context("add to histogram")?;
|
||||
Ok(())
|
||||
}
|
||||
pub(crate) fn output(&self) -> Output {
|
||||
let latency_percentiles = std::array::from_fn(|idx| {
|
||||
let micros = self
|
||||
.latency_histo
|
||||
.value_at_percentile(LATENCY_PERCENTILES[idx]);
|
||||
Duration::from_micros(micros)
|
||||
});
|
||||
Output {
|
||||
request_count: self.latency_histo.len(),
|
||||
latency_mean: Duration::from_micros(self.latency_histo.mean() as u64),
|
||||
latency_percentiles: LatencyPercentiles {
|
||||
latency_percentiles,
|
||||
},
|
||||
}
|
||||
}
|
||||
pub(crate) fn add(&mut self, other: &Self) {
|
||||
let Self {
|
||||
ref mut latency_histo,
|
||||
} = self;
|
||||
latency_histo.add(&other.latency_histo).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for Stats {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
const LATENCY_PERCENTILES: [f64; 4] = [95.0, 99.00, 99.90, 99.99];
|
||||
|
||||
struct LatencyPercentiles {
|
||||
latency_percentiles: [Duration; 4],
|
||||
}
|
||||
|
||||
impl serde::Serialize for LatencyPercentiles {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
use serde::ser::SerializeMap;
|
||||
let mut ser = serializer.serialize_map(Some(LATENCY_PERCENTILES.len()))?;
|
||||
for (i, p) in LATENCY_PERCENTILES.iter().enumerate() {
    ser.serialize_entry(
        &format!("p{p}"),
        // use the percentile value matching `p`, not always the first entry
        &format!(
            "{}",
            &humantime::format_duration(self.latency_percentiles[i])
        ),
    )?;
}
|
||||
ser.end()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(serde::Serialize)]
|
||||
pub(crate) struct Output {
|
||||
request_count: u64,
|
||||
#[serde(with = "humantime_serde")]
|
||||
latency_mean: Duration,
|
||||
latency_percentiles: LatencyPercentiles,
|
||||
}
|
||||
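A short sketch of the intended flow: workers record latencies into their own `Stats`, an aggregator `add`s them together and serializes the resulting `Output` (serialization via `serde_json`, as the pagebench commands above do):

```rust
use std::time::Duration;

fn aggregate_example() -> anyhow::Result<()> {
    let mut worker_a = Stats::new();
    let mut worker_b = Stats::new();
    worker_a.observe(Duration::from_micros(850))?;
    worker_b.observe(Duration::from_millis(3))?;

    // Merge per-worker stats and print the same JSON shape pagebench emits.
    let mut total = Stats::new();
    total.add(&worker_a);
    total.add(&worker_b);
    println!("{}", serde_json::to_string_pretty(&total.output())?);
    Ok(())
}
```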
pageserver/pagebench/src/util/tokio_thread_local_stats.rs (new file, 45 lines)
@@ -0,0 +1,45 @@
|
||||
pub(crate) type ThreadLocalStats<T> = Arc<Mutex<T>>;
|
||||
pub(crate) type AllThreadLocalStats<T> = Arc<Mutex<Vec<ThreadLocalStats<T>>>>;
|
||||
|
||||
macro_rules! declare {
|
||||
($THREAD_LOCAL_NAME:ident: $T:ty) => {
|
||||
thread_local! {
|
||||
pub static $THREAD_LOCAL_NAME: std::cell::RefCell<crate::util::tokio_thread_local_stats::ThreadLocalStats<$T>> = std::cell::RefCell::new(
|
||||
std::sync::Arc::new(std::sync::Mutex::new(Default::default()))
|
||||
);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
pub(crate) use declare;
|
||||
|
||||
macro_rules! main {
|
||||
($THREAD_LOCAL_NAME:ident, $main_impl:expr) => {{
|
||||
let main_impl = $main_impl;
|
||||
let all = Arc::new(Mutex::new(Vec::new()));
|
||||
|
||||
let rt = tokio::runtime::Builder::new_multi_thread()
|
||||
.on_thread_start({
|
||||
let all = Arc::clone(&all);
|
||||
move || {
|
||||
// pre-initialize the thread local stats by accessing them
|
||||
// (some stats like requests_stats::Stats are quite costly to initialize,
|
||||
// we don't want to pay that cost during the measurement period)
|
||||
$THREAD_LOCAL_NAME.with(|stats| {
|
||||
let stats: Arc<_> = Arc::clone(&*stats.borrow());
|
||||
all.lock().unwrap().push(stats);
|
||||
});
|
||||
}
|
||||
})
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let main_task = rt.spawn(main_impl(all));
|
||||
rt.block_on(main_task).unwrap()
|
||||
}};
|
||||
}
|
||||
|
||||
pub(crate) use main;
|
||||
@@ -1468,6 +1468,7 @@ threshold = "20m"
|
||||
period: Duration::from_secs(10),
|
||||
#[cfg(feature = "testing")]
|
||||
mock_statvfs: None,
|
||||
eviction_order: crate::disk_usage_eviction_task::EvictionOrder::AbsoluteAccessed,
|
||||
})
|
||||
);
|
||||
match &conf.default_tenant_conf.eviction_policy {
|
||||
|
||||
@@ -74,6 +74,45 @@ pub struct DiskUsageEvictionTaskConfig {
|
||||
pub period: Duration,
|
||||
#[cfg(feature = "testing")]
|
||||
pub mock_statvfs: Option<crate::statvfs::mock::Behavior>,
|
||||
/// Select sorting for evicted layers
|
||||
#[serde(default)]
|
||||
pub eviction_order: EvictionOrder,
|
||||
}
|
||||
|
||||
/// Selects the sort order for eviction candidates *after* per tenant `min_resident_size`
|
||||
/// partitioning.
|
||||
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[serde(tag = "type", content = "args")]
|
||||
pub enum EvictionOrder {
|
||||
/// Order the layers to be evicted by how recently they have been accessed in absolute
|
||||
/// time.
|
||||
///
|
||||
/// This strategy is unfair towards the slower-growing tenants when some tenants grow
/// faster than others.
|
||||
#[default]
|
||||
AbsoluteAccessed,
|
||||
|
||||
/// Order the layers to be evicted by how recently they have been accessed relatively within
|
||||
/// the set of resident layers of a tenant.
|
||||
///
|
||||
/// This strategy will evict layers more fairly but is untested.
|
||||
RelativeAccessed {
|
||||
#[serde(default)]
|
||||
highest_layer_count_loses_first: bool,
|
||||
},
|
||||
}
|
||||
|
||||
impl EvictionOrder {
|
||||
/// Return true, if with [`Self::RelativeAccessed`] order the tenants with the highest layer
|
||||
/// counts should be the first ones to have their layers evicted.
|
||||
fn highest_layer_count_loses_first(&self) -> bool {
|
||||
match self {
|
||||
EvictionOrder::AbsoluteAccessed => false,
|
||||
EvictionOrder::RelativeAccessed {
|
||||
highest_layer_count_loses_first,
|
||||
} => *highest_layer_count_loses_first,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
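Since the enum is adjacently tagged (`tag = "type", content = "args"`), its serialized shape looks like the sketch below (shown with `serde_json`; the pageserver's TOML config would carry the equivalent structure):

```rust
let relative = EvictionOrder::RelativeAccessed {
    highest_layer_count_loses_first: true,
};
assert_eq!(
    serde_json::to_value(relative).unwrap(),
    serde_json::json!({
        "type": "RelativeAccessed",
        "args": { "highest_layer_count_loses_first": true }
    })
);
assert_eq!(
    serde_json::to_value(EvictionOrder::AbsoluteAccessed).unwrap(),
    serde_json::json!({ "type": "AbsoluteAccessed" })
);
```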
#[derive(Default)]
|
||||
@@ -192,7 +231,14 @@ async fn disk_usage_eviction_task_iteration(
|
||||
) -> anyhow::Result<()> {
|
||||
let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
|
||||
.context("get filesystem-level disk usage before evictions")?;
|
||||
let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
|
||||
let res = disk_usage_eviction_task_iteration_impl(
|
||||
state,
|
||||
storage,
|
||||
usage_pre,
|
||||
task_config.eviction_order,
|
||||
cancel,
|
||||
)
|
||||
.await;
|
||||
match res {
|
||||
Ok(outcome) => {
|
||||
debug!(?outcome, "disk_usage_eviction_iteration finished");
|
||||
@@ -278,6 +324,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
state: &State,
|
||||
_storage: &GenericRemoteStorage,
|
||||
usage_pre: U,
|
||||
eviction_order: EvictionOrder,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<IterationOutcome<U>> {
|
||||
// use tokio's mutex to get a Sync guard (instead of std::sync::Mutex)
|
||||
@@ -297,7 +344,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
"running disk usage based eviction due to pressure"
|
||||
);
|
||||
|
||||
let candidates = match collect_eviction_candidates(cancel).await? {
|
||||
let candidates = match collect_eviction_candidates(eviction_order, cancel).await? {
|
||||
EvictionCandidates::Cancelled => {
|
||||
return Ok(IterationOutcome::Cancelled);
|
||||
}
|
||||
@@ -307,16 +354,16 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
// Debug-log the list of candidates
|
||||
let now = SystemTime::now();
|
||||
for (i, (partition, candidate)) in candidates.iter().enumerate() {
|
||||
let nth = i + 1;
|
||||
let desc = candidate.layer.layer_desc();
|
||||
let total_candidates = candidates.len();
|
||||
let size = desc.file_size;
|
||||
let rel = candidate.relative_last_activity;
|
||||
debug!(
|
||||
"cand {}/{}: size={}, no_access_for={}us, partition={:?}, {}/{}/{}",
|
||||
i + 1,
|
||||
candidates.len(),
|
||||
desc.file_size,
|
||||
"cand {nth}/{total_candidates}: size={size}, rel_last_activity={rel}, no_access_for={}us, partition={partition:?}, {}/{}/{}",
|
||||
now.duration_since(candidate.last_activity_ts)
|
||||
.unwrap()
|
||||
.as_micros(),
|
||||
partition,
|
||||
desc.tenant_shard_id,
|
||||
desc.timeline_id,
|
||||
candidate.layer,
|
||||
@@ -459,6 +506,7 @@ struct EvictionCandidate {
|
||||
timeline: Arc<Timeline>,
|
||||
layer: Layer,
|
||||
last_activity_ts: SystemTime,
|
||||
relative_last_activity: finite_f32::FiniteF32,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
|
||||
@@ -478,24 +526,24 @@ enum EvictionCandidates {
|
||||
/// order. A caller that evicts in that order, until pressure is relieved, implements
|
||||
/// the eviction policy outlined in the module comment.
|
||||
///
|
||||
/// # Example
|
||||
/// # Example with EvictionOrder::AbsoluteAccessed
|
||||
///
|
||||
/// Imagine that there are two tenants, A and B, with five layers each, a-e.
|
||||
/// Each layer has size 100, and both tenant's min_resident_size is 150.
|
||||
/// The eviction order would be
|
||||
///
|
||||
/// ```text
|
||||
/// partition last_activity_ts tenant/layer
|
||||
/// Above 18:30 A/c
|
||||
/// Above 19:00 A/b
|
||||
/// Above 18:29 B/c
|
||||
/// Above 19:05 B/b
|
||||
/// Above 20:00 B/a
|
||||
/// Above 20:03 A/a
|
||||
/// Below 20:30 A/d
|
||||
/// Below 20:40 B/d
|
||||
/// Below 20:45 B/e
|
||||
/// Below 20:58 A/e
|
||||
/// partition last_activity_ts tenant/layer
|
||||
/// Above 18:30 A/c
|
||||
/// Above 19:00 A/b
|
||||
/// Above 18:29 B/c
|
||||
/// Above 19:05 B/b
|
||||
/// Above 20:00 B/a
|
||||
/// Above 20:03 A/a
|
||||
/// Below 20:30 A/d
|
||||
/// Below 20:40 B/d
|
||||
/// Below 20:45 B/e
|
||||
/// Below 20:58 A/e
|
||||
/// ```
|
||||
///
|
||||
/// Now, if we need to evict 300 bytes to relieve pressure, we'd evict `A/c, A/b, B/c`.
|
||||
@@ -505,7 +553,77 @@ enum EvictionCandidates {
|
||||
/// `A/c, A/b, B/c, B/b, B/a, A/a, A/d, B/d, B/e`, reaching into the `Below` partition
|
||||
/// after exhausting the `Above` partition.
|
||||
/// So, we did not respect each tenant's min_resident_size.
|
||||
///
|
||||
/// # Example with EvictionOrder::RelativeAccessed
|
||||
///
|
||||
/// ```text
|
||||
/// partition relative_age last_activity_ts tenant/layer
|
||||
/// Above 0/4 18:30 A/c
|
||||
/// Above 0/4 18:29 B/c
|
||||
/// Above 1/4 19:00 A/b
|
||||
/// Above 1/4 19:05 B/b
|
||||
/// Above 2/4 20:00 B/a
|
||||
/// Above 2/4 20:03 A/a
|
||||
/// Below 3/4 20:30 A/d
|
||||
/// Below 3/4 20:40 B/d
|
||||
/// Below 4/4 20:45 B/e
|
||||
/// Below 4/4 20:58 A/e
|
||||
/// ```
|
||||
///
|
||||
/// With tenants having the same number of layers the picture does not change much. The same with
|
||||
/// A having many more layers **resident** (not all of them listed):
|
||||
///
|
||||
/// ```text
|
||||
/// Above 0/100 18:30 A/c
|
||||
/// Above 0/4 18:29 B/c
|
||||
/// Above 1/100 19:00 A/b
|
||||
/// Above 2/100 20:03 A/a
|
||||
/// Above 3/100 20:03 A/nth_3
|
||||
/// Above 4/100 20:03 A/nth_4
|
||||
/// ...
|
||||
/// Above 1/4 19:05 B/b
|
||||
/// Above 25/100 20:04 A/nth_25
|
||||
/// ...
|
||||
/// Above 2/4 20:00 B/a
|
||||
/// Above 50/100 20:10 A/nth_50
|
||||
/// ...
|
||||
/// Below 3/4 20:40 B/d
|
||||
/// Below 99/100 20:30 A/nth_99
|
||||
/// Below 4/4 20:45 B/e
|
||||
/// Below 100/100 20:58 A/nth_100
|
||||
/// ```
|
||||
///
|
||||
/// Now it's easier to see that because A has grown fast it has more layers to get evicted. What is
|
||||
/// difficult to see is what happens on the next round assuming the evicting 23 from the above list
|
||||
/// relieves the pressure (22 A layers gone, 1 B layers gone) but a new fast growing tenant C has
|
||||
/// appeared:
|
||||
///
|
||||
/// ```text
|
||||
/// Above 0/87 20:04 A/nth_23
|
||||
/// Above 0/3 19:05 B/b
|
||||
/// Above 0/50 20:59 C/nth_0
|
||||
/// Above 1/87 20:04 A/nth_24
|
||||
/// Above 1/50 21:00 C/nth_1
|
||||
/// Above 2/87 20:04 A/nth_25
|
||||
/// ...
|
||||
/// Above 16/50 21:02 C/nth_16
|
||||
/// Above 1/3 20:00 B/a
|
||||
/// Above 27/87 20:10 A/nth_50
|
||||
/// ...
|
||||
/// Below 2/3 20:40 B/d
|
||||
/// Below 49/50 21:05 C/nth_49
|
||||
/// Below 86/87 20:30 A/nth_99
|
||||
/// Below 3/3 20:45 B/e
|
||||
/// Below 50/50 21:05 C/nth_50
|
||||
/// Below 87/87 20:58 A/nth_100
|
||||
/// ```
|
||||
///
|
||||
/// Now relieving pressure with 23 layers would cost:
|
||||
/// - tenant A 14 layers
|
||||
/// - tenant B 1 layer
|
||||
/// - tenant C 8 layers
|
||||
async fn collect_eviction_candidates(
|
||||
eviction_order: EvictionOrder,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<EvictionCandidates> {
|
||||
// get a snapshot of the list of tenants
|
||||
@@ -591,12 +709,63 @@ async fn collect_eviction_candidates(
|
||||
tenant_candidates
|
||||
.sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts));
|
||||
let mut cumsum: i128 = 0;
|
||||
for (timeline, layer_info) in tenant_candidates.into_iter() {
|
||||
|
||||
// keeping the -1 or not decides whether every tenant should lose its least recently accessed
|
||||
// layer, OR whether this should happen in order of highest layer count first:
|
||||
let fudge = if eviction_order.highest_layer_count_loses_first() {
|
||||
// relative_age vs. tenant layer count:
|
||||
// - 0.1..=1.0 (10 layers)
|
||||
// - 0.01..=1.0 (100 layers)
|
||||
// - 0.001..=1.0 (1000 layers)
|
||||
//
|
||||
// leading to evicting less of the smallest tenants.
|
||||
0
|
||||
} else {
|
||||
// use full 0.0..=1.0 range, which means even the smallest tenants could always lose a
|
||||
// layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could
|
||||
// be that less than 10k layer evictions is enough, so we would not need to evict from
|
||||
// all tenants.
|
||||
//
|
||||
// as the tenant ordering is now deterministic this could hit the same tenants
|
||||
// disproportionately on multiple invocations. an alternative could be to remember how many
|
||||
// layers we evicted last time from this tenant, and inject that as an additional
|
||||
// fudge here.
|
||||
1
|
||||
};
|
||||
|
||||
let total = tenant_candidates
|
||||
.len()
|
||||
.checked_sub(fudge)
|
||||
.filter(|&x| x > 0)
|
||||
// support 0 or 1 resident layer tenants as well
|
||||
.unwrap_or(1);
|
||||
let divider = total as f32;
|
||||
|
||||
for (i, (timeline, layer_info)) in tenant_candidates.into_iter().enumerate() {
|
||||
let file_size = layer_info.file_size();
|
||||
|
||||
// as we iterate this reverse sorted list, the most recently accessed layer will always
|
||||
// be 1.0, so that it is evicted last.
|
||||
let relative_last_activity = if matches!(
|
||||
eviction_order,
|
||||
EvictionOrder::RelativeAccessed { .. }
|
||||
) {
|
||||
// another possibility: use buckets, like (256.0 * relative_last_activity) as u8 or
|
||||
// similarly for u16. unsure how it would help.
|
||||
finite_f32::FiniteF32::try_from_normalized((total - i) as f32 / divider)
|
||||
.unwrap_or_else(|val| {
|
||||
tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={i}, total={total}: {val}");
|
||||
finite_f32::FiniteF32::ZERO
|
||||
})
|
||||
} else {
|
||||
finite_f32::FiniteF32::ZERO
|
||||
};
|
||||
|
||||
let candidate = EvictionCandidate {
|
||||
timeline,
|
||||
last_activity_ts: layer_info.last_activity_ts,
|
||||
layer: layer_info.layer,
|
||||
relative_last_activity,
|
||||
};
|
||||
let partition = if cumsum > min_resident_size as i128 {
|
||||
MinResidentSizePartition::Above
|
||||
@@ -610,8 +779,19 @@ async fn collect_eviction_candidates(
|
||||
|
||||
debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
|
||||
"as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
|
||||
candidates
|
||||
.sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts));
|
||||
|
||||
match eviction_order {
|
||||
EvictionOrder::AbsoluteAccessed => {
|
||||
candidates.sort_unstable_by_key(|(partition, candidate)| {
|
||||
(*partition, candidate.last_activity_ts)
|
||||
});
|
||||
}
|
||||
EvictionOrder::RelativeAccessed { .. } => {
|
||||
candidates.sort_unstable_by_key(|(partition, candidate)| {
|
||||
(*partition, candidate.relative_last_activity)
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
Ok(EvictionCandidates::Finished(candidates))
|
||||
}
|
||||
@@ -640,6 +820,66 @@ impl std::ops::Deref for TimelineKey {
|
||||
}
|
||||
}
|
||||
|
||||
/// A totally ordered f32 subset we can use with sorting functions.
|
||||
mod finite_f32 {
|
||||
|
||||
/// A totally ordered f32 subset we can use with sorting functions.
|
||||
#[derive(Clone, Copy, PartialEq)]
|
||||
pub struct FiniteF32(f32);
|
||||
|
||||
impl std::fmt::Debug for FiniteF32 {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
std::fmt::Debug::fmt(&self.0, f)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for FiniteF32 {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
std::fmt::Display::fmt(&self.0, f)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::cmp::Eq for FiniteF32 {}
|
||||
|
||||
impl std::cmp::PartialOrd for FiniteF32 {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl std::cmp::Ord for FiniteF32 {
|
||||
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
|
||||
self.0.total_cmp(&other.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<f32> for FiniteF32 {
|
||||
type Error = f32;
|
||||
|
||||
fn try_from(value: f32) -> Result<Self, Self::Error> {
|
||||
if value.is_finite() {
|
||||
Ok(FiniteF32(value))
|
||||
} else {
|
||||
Err(value)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FiniteF32 {
|
||||
pub const ZERO: FiniteF32 = FiniteF32(0.0);
|
||||
|
||||
pub fn try_from_normalized(value: f32) -> Result<Self, f32> {
|
||||
if (0.0..=1.0).contains(&value) {
|
||||
// -0.0 is within the range; normalize it to 0.0 so the stored value is always in 0.0..=1.0
|
||||
let value = value.abs();
|
||||
Ok(FiniteF32(value))
|
||||
} else {
|
||||
Err(value)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
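A brief usage sketch of the `finite_f32` module above; the test module and assertions are illustrative, not from the source, but they only exercise the public items shown (`try_from_normalized`, `ZERO`, and the `Ord` impl built on `f32::total_cmp`).

```rust
#[cfg(test)]
mod finite_f32_usage_sketch {
    use super::finite_f32::FiniteF32;

    #[test]
    fn sorts_and_normalizes() {
        // Values outside 0.0..=1.0 are rejected.
        assert!(FiniteF32::try_from_normalized(1.5).is_err());
        // -0.0 is accepted and normalized, so it compares equal to ZERO.
        assert_eq!(FiniteF32::try_from_normalized(-0.0).unwrap(), FiniteF32::ZERO);

        // Eq + Ord allow the plain, non-partial sort functions.
        let mut xs = vec![
            FiniteF32::try_from_normalized(0.75).unwrap(),
            FiniteF32::ZERO,
            FiniteF32::try_from_normalized(0.25).unwrap(),
        ];
        xs.sort_unstable();
        assert_eq!(xs.first(), Some(&FiniteF32::ZERO));
    }
}
```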
|
||||
mod filesystem_level_usage {
|
||||
use anyhow::Context;
|
||||
use camino::Utf8Path;
|
||||
@@ -721,6 +961,7 @@ mod filesystem_level_usage {
|
||||
|
||||
#[test]
|
||||
fn max_usage_pct_pressure() {
|
||||
use super::EvictionOrder;
|
||||
use super::Usage as _;
|
||||
use std::time::Duration;
|
||||
use utils::serde_percent::Percent;
|
||||
@@ -732,6 +973,7 @@ mod filesystem_level_usage {
|
||||
period: Duration::MAX,
|
||||
#[cfg(feature = "testing")]
|
||||
mock_statvfs: None,
|
||||
eviction_order: EvictionOrder::default(),
|
||||
},
|
||||
total_bytes: 100_000,
|
||||
avail_bytes: 0,
|
||||
|
||||
@@ -159,6 +159,12 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ConflictError"
|
||||
"412":
|
||||
description: Deletion may not proceed, tenant is not in Active state
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/PreconditionFailedError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
|
||||
@@ -308,6 +308,7 @@ impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
|
||||
SlotUpsertError(e) => e.into(),
|
||||
Other(o) => ApiError::InternalServerError(o),
|
||||
e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()),
|
||||
Cancelled => ApiError::ShuttingDown,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -886,7 +887,9 @@ async fn tenant_delete_handler(
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_shard_id)
|
||||
state
|
||||
.tenant_manager
|
||||
.delete_tenant(tenant_shard_id, ACTIVE_TENANT_TIMEOUT)
|
||||
.instrument(info_span!("tenant_delete_handler",
|
||||
tenant_id = %tenant_shard_id.tenant_id,
|
||||
shard = %tenant_shard_id.shard_slug()
|
||||
@@ -1566,19 +1569,22 @@ async fn disk_usage_eviction_run(
|
||||
struct Config {
|
||||
/// How many bytes to evict before reporting that pressure is relieved.
|
||||
evict_bytes: u64,
|
||||
|
||||
#[serde(default)]
|
||||
eviction_order: crate::disk_usage_eviction_task::EvictionOrder,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, serde::Serialize)]
|
||||
struct Usage {
|
||||
// remains unchanged after instantiation of the struct
|
||||
config: Config,
|
||||
evict_bytes: u64,
|
||||
// updated by `add_available_bytes`
|
||||
freed_bytes: u64,
|
||||
}
|
||||
|
||||
impl crate::disk_usage_eviction_task::Usage for Usage {
|
||||
fn has_pressure(&self) -> bool {
|
||||
self.config.evict_bytes > self.freed_bytes
|
||||
self.evict_bytes > self.freed_bytes
|
||||
}
|
||||
|
||||
fn add_available_bytes(&mut self, bytes: u64) {
|
||||
@@ -1589,7 +1595,7 @@ async fn disk_usage_eviction_run(
|
||||
let config = json_request::<Config>(&mut r).await?;
|
||||
|
||||
let usage = Usage {
|
||||
config,
|
||||
evict_bytes: config.evict_bytes,
|
||||
freed_bytes: 0,
|
||||
};
|
||||
|
||||
@@ -1604,7 +1610,11 @@ async fn disk_usage_eviction_run(
|
||||
let state = state.disk_usage_eviction_state.clone();
|
||||
|
||||
let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
|
||||
&state, storage, usage, &cancel,
|
||||
&state,
|
||||
storage,
|
||||
usage,
|
||||
config.eviction_order,
|
||||
&cancel,
|
||||
)
|
||||
.await;
|
||||
|
||||
|
||||
@@ -522,14 +522,18 @@ pub(crate) mod initial_logical_size {
|
||||
impl StartCalculation {
|
||||
pub(crate) fn first(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
|
||||
let circumstances_label: &'static str = circumstances.into();
|
||||
self.0.with_label_values(&["first", circumstances_label]);
|
||||
self.0
|
||||
.with_label_values(&["first", circumstances_label])
|
||||
.inc();
|
||||
OngoingCalculationGuard {
|
||||
inc_drop_calculation: Some(DROP_CALCULATION.first.clone()),
|
||||
}
|
||||
}
|
||||
pub(crate) fn retry(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
|
||||
let circumstances_label: &'static str = circumstances.into();
|
||||
self.0.with_label_values(&["retry", circumstances_label]);
|
||||
self.0
|
||||
.with_label_values(&["retry", circumstances_label])
|
||||
.inc();
|
||||
OngoingCalculationGuard {
|
||||
inc_drop_calculation: Some(DROP_CALCULATION.retry.clone()),
|
||||
}
|
||||
|
||||
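The hunk above adds the missing `.inc()`: calling `with_label_values` by itself only resolves (and registers) the labelled child counter, it does not record anything. A minimal, hypothetical illustration with the prometheus crate, using an invented metric name:

```rust
use prometheus::{register_int_counter_vec, IntCounterVec};

fn sketch() {
    // Hypothetical counter, for illustration only.
    let starts: IntCounterVec = register_int_counter_vec!(
        "example_calculation_starts_total",
        "calculation starts by kind and circumstance",
        &["kind", "circumstances"]
    )
    .unwrap();

    starts.with_label_values(&["first", "initial"]);        // resolves the child, no increment
    starts.with_label_values(&["first", "initial"]).inc();  // actually increments the counter
}
```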
@@ -1776,6 +1776,7 @@ pub fn is_inherited_key(key: Key) -> bool {
|
||||
key != AUX_FILES_KEY
|
||||
}
|
||||
|
||||
/// Guaranteed to return `Ok()` if [[is_rel_block_key]] returns `true` for `key`.
|
||||
pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
|
||||
Ok(match key.field1 {
|
||||
0x00 => (
|
||||
@@ -1790,7 +1791,6 @@ pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
|
||||
_ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn is_rel_fsm_block_key(key: Key) -> bool {
|
||||
key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
|
||||
}
|
||||
|
||||
@@ -3134,6 +3134,7 @@ impl Tenant {
|
||||
|
||||
/// For unit tests, make this visible so that other modules can directly create timelines
|
||||
#[cfg(test)]
|
||||
#[tracing::instrument(fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))]
|
||||
pub(crate) async fn bootstrap_timeline_test(
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
|
||||
@@ -48,6 +48,9 @@ pub(crate) enum DeleteTenantError {
|
||||
#[error("Timeline {0}")]
|
||||
Timeline(#[from] DeleteTimelineError),
|
||||
|
||||
#[error("Cancelled")]
|
||||
Cancelled,
|
||||
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
@@ -514,10 +514,7 @@ pub async fn init_tenant_mgr(
|
||||
&ctx,
|
||||
) {
|
||||
Ok(tenant) => {
|
||||
tenants.insert(
|
||||
TenantShardId::unsharded(tenant.tenant_id()),
|
||||
TenantSlot::Attached(tenant),
|
||||
);
|
||||
tenants.insert(tenant_shard_id, TenantSlot::Attached(tenant));
|
||||
}
|
||||
Err(e) => {
|
||||
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}");
|
||||
@@ -962,35 +959,27 @@ impl TenantManager {
|
||||
}
|
||||
|
||||
let tenant_path = self.conf.tenant_path(&tenant_shard_id);
|
||||
let timelines_path = self.conf.timelines_path(&tenant_shard_id);
|
||||
|
||||
// Directory structure is the same for attached and secondary modes:
|
||||
// create it if it doesn't exist. Timeline load/creation expects the
|
||||
// timelines/ subdir to already exist.
|
||||
//
|
||||
// Does not need to be fsync'd because local storage is just a cache.
|
||||
tokio::fs::create_dir_all(&timelines_path)
|
||||
.await
|
||||
.with_context(|| format!("Creating {timelines_path}"))?;
|
||||
|
||||
// Before activating either secondary or attached mode, persist the
|
||||
// configuration, so that on restart we will re-attach (or re-start
|
||||
// secondary) on the tenant.
|
||||
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
|
||||
.await
|
||||
.map_err(SetNewTenantConfigError::Persist)?;
|
||||
|
||||
let new_slot = match &new_location_config.mode {
|
||||
LocationMode::Secondary(_) => {
|
||||
// Directory doesn't need to be fsync'd because if we crash it can
|
||||
// safely be recreated next time this tenant location is configured.
|
||||
tokio::fs::create_dir_all(&tenant_path)
|
||||
.await
|
||||
.with_context(|| format!("Creating {tenant_path}"))?;
|
||||
|
||||
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
|
||||
.await
|
||||
.map_err(SetNewTenantConfigError::Persist)?;
|
||||
|
||||
TenantSlot::Secondary
|
||||
}
|
||||
LocationMode::Secondary(_) => TenantSlot::Secondary,
|
||||
LocationMode::Attached(_attach_config) => {
|
||||
let timelines_path = self.conf.timelines_path(&tenant_shard_id);
|
||||
|
||||
// Directory doesn't need to be fsync'd because we do not depend on
|
||||
// it to exist after crashes: it may be recreated when tenant is
|
||||
// re-attached, see https://github.com/neondatabase/neon/issues/5550
|
||||
tokio::fs::create_dir_all(&tenant_path)
|
||||
.await
|
||||
.with_context(|| format!("Creating {timelines_path}"))?;
|
||||
|
||||
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
|
||||
.await
|
||||
.map_err(SetNewTenantConfigError::Persist)?;
|
||||
|
||||
let shard_identity = new_location_config.shard;
|
||||
let tenant = tenant_spawn(
|
||||
self.conf,
|
||||
@@ -1102,6 +1091,71 @@ impl TenantManager {
|
||||
.collect(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn delete_tenant(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
activation_timeout: Duration,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
// We acquire a SlotGuard during this function to protect against concurrent
|
||||
// changes while the ::prepare phase of DeleteTenantFlow executes, but then
|
||||
// have to return the Tenant to the map while the background deletion runs.
|
||||
//
|
||||
// TODO: refactor deletion to happen outside the lifetime of a Tenant.
|
||||
// Currently, deletion requires a reference to the tenants map in order to
|
||||
// keep the Tenant in the map until deletion is complete, and then remove
|
||||
// it at the end.
|
||||
//
|
||||
// See https://github.com/neondatabase/neon/issues/5080
|
||||
|
||||
let slot_guard =
|
||||
tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;
|
||||
|
||||
// unwrap is safe because we used MustExist mode when acquiring
|
||||
let tenant = match slot_guard.get_old_value().as_ref().unwrap() {
|
||||
TenantSlot::Attached(tenant) => tenant.clone(),
|
||||
_ => {
|
||||
// Express "not attached" as equivalent to "not found"
|
||||
return Err(DeleteTenantError::NotAttached);
|
||||
}
|
||||
};
|
||||
|
||||
match tenant.current_state() {
|
||||
TenantState::Broken { .. } | TenantState::Stopping { .. } => {
|
||||
// If a tenant is broken or stopping, DeleteTenantFlow can
|
||||
// handle it: broken tenants proceed to delete, stopping tenants
|
||||
// are checked for deletion already in progress.
|
||||
}
|
||||
_ => {
|
||||
tenant
|
||||
.wait_to_become_active(activation_timeout)
|
||||
.await
|
||||
.map_err(|e| match e {
|
||||
GetActiveTenantError::WillNotBecomeActive(_) => {
|
||||
DeleteTenantError::InvalidState(tenant.current_state())
|
||||
}
|
||||
GetActiveTenantError::Cancelled => DeleteTenantError::Cancelled,
|
||||
GetActiveTenantError::NotFound(_) => DeleteTenantError::NotAttached,
|
||||
GetActiveTenantError::WaitForActiveTimeout {
|
||||
latest_state: _latest_state,
|
||||
wait_time: _wait_time,
|
||||
} => DeleteTenantError::InvalidState(tenant.current_state()),
|
||||
})?;
|
||||
}
|
||||
}
|
||||
|
||||
let result = DeleteTenantFlow::run(
|
||||
self.conf,
|
||||
self.resources.remote_storage.clone(),
|
||||
&TENANTS,
|
||||
tenant,
|
||||
)
|
||||
.await;
|
||||
|
||||
// The Tenant goes back into the map in Stopping state; it will eventually be removed by DeleteTenantFlow
|
||||
slot_guard.revert();
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
@@ -1279,41 +1333,6 @@ pub(crate) async fn get_active_tenant_with_timeout(
|
||||
Ok(tenant)
|
||||
}
|
||||
|
||||
pub(crate) async fn delete_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
tenant_shard_id: TenantShardId,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
// We acquire a SlotGuard during this function to protect against concurrent
|
||||
// changes while the ::prepare phase of DeleteTenantFlow executes, but then
|
||||
// have to return the Tenant to the map while the background deletion runs.
|
||||
//
|
||||
// TODO: refactor deletion to happen outside the lifetime of a Tenant.
|
||||
// Currently, deletion requires a reference to the tenants map in order to
|
||||
// keep the Tenant in the map until deletion is complete, and then remove
|
||||
// it at the end.
|
||||
//
|
||||
// See https://github.com/neondatabase/neon/issues/5080
|
||||
|
||||
// TODO(sharding): make delete API sharding-aware
|
||||
let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;
|
||||
|
||||
// unwrap is safe because we used MustExist mode when acquiring
|
||||
let tenant = match slot_guard.get_old_value().as_ref().unwrap() {
|
||||
TenantSlot::Attached(tenant) => tenant.clone(),
|
||||
_ => {
|
||||
// Express "not attached" as equivalent to "not found"
|
||||
return Err(DeleteTenantError::NotAttached);
|
||||
}
|
||||
};
|
||||
|
||||
let result = DeleteTenantFlow::run(conf, remote_storage, &TENANTS, tenant).await;
|
||||
|
||||
// The Tenant goes back into the map in Stopping state; it will eventually be removed by DeleteTenantFlow
|
||||
slot_guard.revert();
|
||||
result
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub(crate) enum DeleteTimelineError {
|
||||
#[error("Tenant {0}")]
|
||||
|
||||
@@ -2192,15 +2192,6 @@ mod tests {
|
||||
|
||||
let index_part_bytes = serde_json::to_vec(&example_index_part).unwrap();
|
||||
|
||||
let timeline_path = test_state.harness.timeline_path(&TIMELINE_ID);
|
||||
let remote_timeline_dir = test_state.harness.remote_fs_dir.join(
|
||||
timeline_path
|
||||
.strip_prefix(&test_state.harness.conf.workdir)
|
||||
.unwrap(),
|
||||
);
|
||||
|
||||
std::fs::create_dir_all(remote_timeline_dir).expect("creating test dir should work");
|
||||
|
||||
let index_path = test_state.harness.remote_fs_dir.join(
|
||||
remote_index_path(
|
||||
&test_state.harness.tenant_shard_id,
|
||||
@@ -2209,6 +2200,10 @@ mod tests {
|
||||
)
|
||||
.get_path(),
|
||||
);
|
||||
|
||||
std::fs::create_dir_all(index_path.parent().unwrap())
|
||||
.expect("creating test dir should work");
|
||||
|
||||
eprintln!("Writing {index_path}");
|
||||
std::fs::write(&index_path, index_part_bytes).unwrap();
|
||||
example_index_part
|
||||
|
||||
@@ -878,6 +878,23 @@ impl LayerInner {
|
||||
Ok(())
|
||||
}
|
||||
Err(e) => {
|
||||
let consecutive_failures =
|
||||
this.consecutive_failures.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
let backoff = utils::backoff::exponential_backoff_duration_seconds(
|
||||
consecutive_failures.min(u32::MAX as usize) as u32,
|
||||
1.5,
|
||||
60.0,
|
||||
);
|
||||
|
||||
let backoff = std::time::Duration::from_secs_f64(backoff);
|
||||
|
||||
tokio::select! {
|
||||
_ = tokio::time::sleep(backoff) => {},
|
||||
_ = crate::task_mgr::shutdown_token().cancelled_owned() => {},
|
||||
_ = timeline.cancel.cancelled() => {},
|
||||
};
|
||||
|
||||
Err(e)
|
||||
}
|
||||
};
|
||||
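The retry path above sleeps for a capped, growing backoff while staying cancellation-aware. A minimal sketch of that pattern, assuming a simple `base^n` curve capped at a maximum; the exact curve of `utils::backoff::exponential_backoff_duration_seconds` may differ:

```rust
use std::time::Duration;
use tokio_util::sync::CancellationToken;

// Assumed backoff curve for illustration: base^failures, capped at max_seconds.
fn backoff_duration(consecutive_failures: u32, base: f64, max_seconds: f64) -> Duration {
    let seconds = base.powi(consecutive_failures as i32).min(max_seconds);
    Duration::from_secs_f64(seconds)
}

async fn sleep_before_retry(consecutive_failures: u32, cancel: &CancellationToken) {
    let backoff = backoff_duration(consecutive_failures, 1.5, 60.0);
    // Racing the sleep against cancellation keeps shutdown responsive.
    tokio::select! {
        _ = tokio::time::sleep(backoff) => {}
        _ = cancel.cancelled() => {}
    }
}
```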
@@ -926,21 +943,9 @@ impl LayerInner {
|
||||
Ok(permit)
|
||||
}
|
||||
Ok((Err(e), _permit)) => {
|
||||
// FIXME: this should be with the spawned task and be cancellation sensitive
|
||||
//
|
||||
// while we should not need this, this backoff has turned out to be useful with
|
||||
// a bug of unexpectedly deleted remote layer file (#5787).
|
||||
let consecutive_failures =
|
||||
self.consecutive_failures.fetch_add(1, Ordering::Relaxed);
|
||||
// sleep already happened in the spawned task, if it was not cancelled
|
||||
let consecutive_failures = self.consecutive_failures.load(Ordering::Relaxed);
|
||||
tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
|
||||
let backoff = utils::backoff::exponential_backoff_duration_seconds(
|
||||
consecutive_failures.min(u32::MAX as usize) as u32,
|
||||
1.5,
|
||||
60.0,
|
||||
);
|
||||
let backoff = std::time::Duration::from_secs_f64(backoff);
|
||||
|
||||
tokio::time::sleep(backoff).await;
|
||||
Err(DownloadError::DownloadFailed)
|
||||
}
|
||||
Err(_gone) => Err(DownloadError::DownloadCancelled),
|
||||
|
||||
@@ -1612,6 +1612,7 @@ impl<'a> WalIngest<'a> {
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::tenant::harness::*;
|
||||
use crate::tenant::remote_timeline_client::{remote_initdb_archive_path, INITDB_PATH};
|
||||
use crate::tenant::Timeline;
|
||||
use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT;
|
||||
use postgres_ffi::RELSEG_SIZE;
|
||||
@@ -2177,21 +2178,25 @@ mod tests {
|
||||
let pg_version = 15; // The test data was generated by pg15
|
||||
let path = "test_data/sk_wal_segment_from_pgbench";
|
||||
let wal_segment_path = format!("{path}/000000010000000000000001.zst");
|
||||
let source_initdb_path = format!("{path}/{INITDB_PATH}");
|
||||
let startpoint = Lsn::from_hex("14AEC08").unwrap();
|
||||
let endpoint = Lsn::from_hex("1FFFF98").unwrap();
|
||||
|
||||
let harness = TenantHarness::create("test_ingest_real_wal").unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let remote_initdb_path = remote_initdb_archive_path(&tenant.tenant_id(), &TIMELINE_ID);
|
||||
let initdb_path = harness.remote_fs_dir.join(remote_initdb_path.get_path());
|
||||
|
||||
std::fs::create_dir_all(initdb_path.parent().unwrap())
|
||||
.expect("creating test dir should work");
|
||||
std::fs::copy(source_initdb_path, initdb_path).expect("copying the initdb.tar.zst works");
|
||||
|
||||
// Bootstrap a real timeline. We can't use create_test_timeline because
|
||||
// it doesn't create a real checkpoint, and Walingest::new tries to parse
|
||||
// the garbage data.
|
||||
//
|
||||
// TODO use the initdb.tar.zst file stored with the test data to avoid
|
||||
// problems with inconsistent initdb results after pg minor version bumps.
|
||||
let (tenant, ctx) = TenantHarness::create("test_ingest_real_wal")
|
||||
.unwrap()
|
||||
.load()
|
||||
.await;
|
||||
let tline = tenant
|
||||
.bootstrap_timeline_test(TIMELINE_ID, pg_version, None, &ctx)
|
||||
.bootstrap_timeline_test(TIMELINE_ID, pg_version, Some(TIMELINE_ID), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ OBJS = \
|
||||
libpagestore.o \
|
||||
neon.o \
|
||||
neon_utils.o \
|
||||
neon_walreader.o \
|
||||
pagestore_smgr.o \
|
||||
relsize_cache.o \
|
||||
walproposer.o \
|
||||
|
||||
96
pgxn/neon/libpqwalproposer.h
Normal file
@@ -0,0 +1,96 @@
|
||||
/*
|
||||
* Interface to set of libpq wrappers walproposer and neon_walreader need.
|
||||
* Similar to libpqwalreceiver, but it has blocking connection establishment and
|
||||
 * PQexec, which don't fit our needs. The implementation is in walproposer_pg.c.
|
||||
*/
|
||||
#ifndef ___LIBPQWALPROPOSER_H__
|
||||
#define ___LIBPQWALPROPOSER_H__
|
||||
|
||||
/* Re-exported and modified ExecStatusType */
|
||||
typedef enum
|
||||
{
|
||||
/* We received a single CopyBoth result */
|
||||
WP_EXEC_SUCCESS_COPYBOTH,
|
||||
|
||||
/*
|
||||
* Any success result other than a single CopyBoth was received. The
|
||||
* specifics of the result were already logged, but it may be useful to
|
||||
* provide an error message indicating which safekeeper messed up.
|
||||
*
|
||||
* Do not expect PQerrorMessage to be appropriately set.
|
||||
*/
|
||||
WP_EXEC_UNEXPECTED_SUCCESS,
|
||||
|
||||
/*
|
||||
* No result available at this time. Wait until read-ready, then call
|
||||
* again. Internally, this is returned when PQisBusy indicates that
|
||||
* PQgetResult would block.
|
||||
*/
|
||||
WP_EXEC_NEEDS_INPUT,
|
||||
/* Catch-all failure. Check PQerrorMessage. */
|
||||
WP_EXEC_FAILED,
|
||||
} WalProposerExecStatusType;
|
||||
|
||||
/* Possible return values from walprop_async_read */
|
||||
typedef enum
|
||||
{
|
||||
/* The full read was successful. buf now points to the data */
|
||||
PG_ASYNC_READ_SUCCESS,
|
||||
|
||||
/*
|
||||
* The read is ongoing. Wait until the connection is read-ready, then try
|
||||
* again.
|
||||
*/
|
||||
PG_ASYNC_READ_TRY_AGAIN,
|
||||
/* Reading failed. Check PQerrorMessage(conn) */
|
||||
PG_ASYNC_READ_FAIL,
|
||||
} PGAsyncReadResult;
|
||||
|
||||
/* Possible return values from walprop_async_write */
|
||||
typedef enum
|
||||
{
|
||||
/* The write fully completed */
|
||||
PG_ASYNC_WRITE_SUCCESS,
|
||||
|
||||
/*
|
||||
* The write started, but you'll need to call PQflush some more times to
|
||||
* finish it off. We just tried, so it's best to wait until the connection
|
||||
* is read- or write-ready to try again.
|
||||
*
|
||||
* If it becomes read-ready, call PQconsumeInput and flush again. If it
|
||||
* becomes write-ready, just call PQflush.
|
||||
*/
|
||||
PG_ASYNC_WRITE_TRY_FLUSH,
|
||||
/* Writing failed. Check PQerrorMessage(conn) */
|
||||
PG_ASYNC_WRITE_FAIL,
|
||||
} PGAsyncWriteResult;
|
||||
|
||||
/*
|
||||
* This header is included by walproposer.h to define walproposer_api; if we're
|
||||
* building walproposer without pg, ignore libpq part, leaving only interface
|
||||
* types.
|
||||
*/
|
||||
#ifndef WALPROPOSER_LIB
|
||||
|
||||
#include "libpq-fe.h"
|
||||
|
||||
/*
|
||||
* Sometimes working directly with underlying PGconn is simpler, export the
|
||||
* whole thing for simplicity.
|
||||
*/
|
||||
typedef struct WalProposerConn
|
||||
{
|
||||
PGconn *pg_conn;
|
||||
bool is_nonblocking; /* whether the connection is non-blocking */
|
||||
char *recvbuf; /* last received CopyData message from
|
||||
* walprop_async_read */
|
||||
} WalProposerConn;
|
||||
|
||||
extern WalProposerConn *libpqwp_connect_start(char *conninfo);
|
||||
extern bool libpqwp_send_query(WalProposerConn *conn, char *query);
|
||||
extern WalProposerExecStatusType libpqwp_get_query_result(WalProposerConn *conn);
|
||||
extern PGAsyncReadResult libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount);
|
||||
extern void libpqwp_disconnect(WalProposerConn *conn);
|
||||
|
||||
#endif /* WALPROPOSER_LIB */
|
||||
#endif /* ___LIBPQWALPROPOSER_H__ */
|
||||
742
pgxn/neon/neon_walreader.c
Normal file
@@ -0,0 +1,742 @@
|
||||
/*
|
||||
 * Like WALRead, but when a WAL segment doesn't exist locally, instead of throwing an
|
||||
 * ERROR it asynchronously tries to fetch the segment from the most advanced safekeeper.
|
||||
*
|
||||
* We can't use libpqwalreceiver as it blocks during connection establishment
|
||||
* (and waiting for PQExec result), so use libpqwalproposer instead.
|
||||
*
|
||||
* TODO: keepalives are currently never sent, so the other side can close the
|
||||
* connection prematurely.
|
||||
*
|
||||
* TODO: close conn if reading takes too long to prevent stuck connections.
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "access/xlog_internal.h"
|
||||
#include "access/xlogdefs.h"
|
||||
#include "access/xlogreader.h"
|
||||
#include "libpq/pqformat.h"
|
||||
#include "storage/fd.h"
|
||||
#include "utils/wait_event.h"
|
||||
|
||||
#include "libpq-fe.h"
|
||||
|
||||
#include "neon_walreader.h"
|
||||
#include "walproposer.h"
|
||||
|
||||
#define NEON_WALREADER_ERR_MSG_LEN 512
|
||||
|
||||
/*
|
||||
* Can be called where NeonWALReader *state is available in the context, adds log_prefix.
|
||||
*/
|
||||
#define nwr_log(elevel, fmt, ...) elog(elevel, "%s" fmt, state->log_prefix, ## __VA_ARGS__)
|
||||
|
||||
static NeonWALReadResult NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
|
||||
static NeonWALReadResult NeonWALReaderReadMsg(NeonWALReader *state);
|
||||
static void NeonWALReaderResetRemote(NeonWALReader *state);
|
||||
static bool NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
|
||||
static bool neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, TimeLineID *tli_p);
|
||||
static void neon_wal_segment_close(NeonWALReader *state);
|
||||
static bool is_wal_segment_exists(XLogSegNo segno, int segsize,
|
||||
TimeLineID tli);
|
||||
|
||||
/*
|
||||
* State of connection to donor safekeeper.
|
||||
*/
|
||||
typedef enum
|
||||
{
|
||||
/* no remote connection */
|
||||
RS_NONE,
|
||||
/* doing PQconnectPoll, need readable socket */
|
||||
RS_CONNECTING_READ,
|
||||
/* doing PQconnectPoll, need writable socket */
|
||||
RS_CONNECTING_WRITE,
|
||||
/* Waiting for START_REPLICATION result */
|
||||
RS_WAIT_EXEC_RESULT,
|
||||
/* replication stream established */
|
||||
RS_ESTABLISHED,
|
||||
} NeonWALReaderRemoteState;
|
||||
|
||||
struct NeonWALReader
|
||||
{
|
||||
/*
|
||||
* LSN before which we assume WAL is not available locally. Exists because
|
||||
* though first segment after startup always exists, part before
|
||||
* basebackup LSN is filled with zeros.
|
||||
*/
|
||||
XLogRecPtr available_lsn;
|
||||
WALSegmentContext segcxt;
|
||||
WALOpenSegment seg;
|
||||
int wre_errno;
|
||||
/* Explains failure to read, static for simplicity. */
|
||||
char err_msg[NEON_WALREADER_ERR_MSG_LEN];
|
||||
|
||||
/*
|
||||
* Saved info about request in progress, used to check validity of
|
||||
* arguments after resume and remember how far we accomplished it. req_lsn
|
||||
* is 0 if there is no request in progress.
|
||||
*/
|
||||
XLogRecPtr req_lsn;
|
||||
Size req_len;
|
||||
Size req_progress;
|
||||
WalProposer *wp; /* we learn donor through walproposer */
|
||||
char donor_name[64]; /* saved donor safekeeper name for logging */
|
||||
/* state of connection to safekeeper */
|
||||
NeonWALReaderRemoteState rem_state;
|
||||
WalProposerConn *wp_conn;
|
||||
|
||||
/*
|
||||
* position in wp_conn recvbuf from which we'll copy WAL next time, or
|
||||
* NULL if there is no unprocessed message
|
||||
*/
|
||||
char *wal_ptr;
|
||||
Size wal_rem_len; /* how many unprocessed bytes left in recvbuf */
|
||||
|
||||
/*
|
||||
* LSN of wal_ptr position according to walsender to cross check against
|
||||
* read request
|
||||
*/
|
||||
XLogRecPtr rem_lsn;
|
||||
|
||||
/* prepended to lines logged by neon_walreader, if provided */
|
||||
char log_prefix[64];
|
||||
};
|
||||
|
||||
/* palloc and initialize NeonWALReader */
|
||||
NeonWALReader *
|
||||
NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix)
|
||||
{
|
||||
NeonWALReader *reader;
|
||||
|
||||
reader = (NeonWALReader *)
|
||||
palloc_extended(sizeof(NeonWALReader),
|
||||
MCXT_ALLOC_NO_OOM | MCXT_ALLOC_ZERO);
|
||||
if (!reader)
|
||||
return NULL;
|
||||
|
||||
reader->available_lsn = available_lsn;
|
||||
reader->seg.ws_file = -1;
|
||||
reader->seg.ws_segno = 0;
|
||||
reader->seg.ws_tli = 0;
|
||||
reader->segcxt.ws_segsize = wal_segment_size;
|
||||
|
||||
reader->wp = wp;
|
||||
|
||||
reader->rem_state = RS_NONE;
|
||||
|
||||
if (log_prefix)
|
||||
strlcpy(reader->log_prefix, log_prefix, sizeof(reader->log_prefix));
|
||||
|
||||
return reader;
|
||||
}
|
||||
|
||||
void
|
||||
NeonWALReaderFree(NeonWALReader *state)
|
||||
{
|
||||
if (state->seg.ws_file != -1)
|
||||
neon_wal_segment_close(state);
|
||||
if (state->wp_conn)
|
||||
libpqwp_disconnect(state->wp_conn);
|
||||
pfree(state);
|
||||
}
|
||||
|
||||
/*
|
||||
* Like vanilla WALRead, but if requested position is before available_lsn or
|
||||
 * WAL segment doesn't exist on disk, it tries to fetch the needed segment from the most
|
||||
* advanced safekeeper.
|
||||
*
|
||||
* Read 'count' bytes into 'buf', starting at location 'startptr', from WAL
|
||||
* fetched from timeline 'tli'.
|
||||
*
|
||||
* Returns NEON_WALREAD_SUCCESS if succeeded, NEON_WALREAD_ERROR if an error
|
||||
 * occurs, in which case NeonWALReaderErrMsg provides the description. An error always closes the remote
|
||||
* connection, if there was any, so socket subscription should be removed.
|
||||
*
|
||||
* NEON_WALREAD_WOULDBLOCK means caller should obtain socket to wait for with
|
||||
* NeonWALReaderSocket and call NeonWALRead again with exactly the same
|
||||
* arguments when NeonWALReaderEvents happen on the socket. Note that per libpq
|
||||
* docs during connection establishment (before first successful read) socket
|
||||
* underneath might change.
|
||||
*
|
||||
* Also, eventually walreader should switch from remote to local read; caller
|
||||
* should remove subscription to socket then by checking NeonWALReaderEvents
|
||||
* after successful read (otherwise next read might reopen the connection with
|
||||
* different socket).
|
||||
*
|
||||
 * Non-monotonic reads are not supported and will result in an error.
|
||||
*
|
||||
* Caller should be sure that WAL up to requested LSN exists, otherwise
|
||||
* NEON_WALREAD_WOULDBLOCK might be always returned.
|
||||
*/
|
||||
NeonWALReadResult
|
||||
NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli)
|
||||
{
|
||||
/*
|
||||
* If requested data is before known available basebackup lsn or there is
|
||||
* already active remote state, do remote read.
|
||||
*/
|
||||
if (startptr < state->available_lsn || state->rem_state != RS_NONE)
|
||||
{
|
||||
return NeonWALReadRemote(state, buf, startptr, count, tli);
|
||||
}
|
||||
if (NeonWALReadLocal(state, buf, startptr, count, tli))
|
||||
{
|
||||
return NEON_WALREAD_SUCCESS;
|
||||
}
|
||||
else if (state->wre_errno == ENOENT)
|
||||
{
|
||||
nwr_log(LOG, "local read failed as segment at %X/%X doesn't exist, attempting remote",
|
||||
LSN_FORMAT_ARGS(startptr));
|
||||
return NeonWALReadRemote(state, buf, startptr, count, tli);
|
||||
}
|
||||
else
|
||||
{
|
||||
return NEON_WALREAD_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
/* Do the read from remote safekeeper. */
|
||||
static NeonWALReadResult
|
||||
NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli)
|
||||
{
|
||||
if (state->rem_state == RS_NONE)
|
||||
{
|
||||
XLogRecPtr donor_lsn;
|
||||
|
||||
/* no connection yet; start one */
|
||||
Safekeeper *donor = GetDonor(state->wp, &donor_lsn);
|
||||
|
||||
if (donor == NULL)
|
||||
{
|
||||
snprintf(state->err_msg, sizeof(state->err_msg),
|
||||
"failed to establish remote connection to fetch WAL: no donor available");
|
||||
return NEON_WALREAD_ERROR;
|
||||
}
|
||||
snprintf(state->donor_name, sizeof(state->donor_name), "%s:%s", donor->host, donor->port);
|
||||
nwr_log(LOG, "establishing connection to %s, flush_lsn %X/%X to fetch WAL",
|
||||
state->donor_name, LSN_FORMAT_ARGS(donor_lsn));
|
||||
state->wp_conn = libpqwp_connect_start(donor->conninfo);
|
||||
if (PQstatus(state->wp_conn->pg_conn) == CONNECTION_BAD)
|
||||
{
|
||||
snprintf(state->err_msg, sizeof(state->err_msg),
|
||||
"failed to connect to %s to fetch WAL: immediately failed with %s",
|
||||
state->donor_name, PQerrorMessage(state->wp_conn->pg_conn));
|
||||
NeonWALReaderResetRemote(state);
|
||||
return NEON_WALREAD_ERROR;
|
||||
}
|
||||
/* we'll poll immediately */
|
||||
state->rem_state = RS_CONNECTING_READ;
|
||||
}
|
||||
|
||||
if (state->rem_state == RS_CONNECTING_READ || state->rem_state == RS_CONNECTING_WRITE)
|
||||
{
|
||||
switch (PQconnectPoll(state->wp_conn->pg_conn))
|
||||
{
|
||||
case PGRES_POLLING_FAILED:
|
||||
snprintf(state->err_msg, sizeof(state->err_msg),
|
||||
"failed to connect to %s to fetch WAL: poll error: %s",
|
||||
state->donor_name, PQerrorMessage(state->wp_conn->pg_conn));
|
||||
NeonWALReaderResetRemote(state);
|
||||
return NEON_WALREAD_ERROR;
|
||||
case PGRES_POLLING_READING:
|
||||
state->rem_state = RS_CONNECTING_READ;
|
||||
return NEON_WALREAD_WOULDBLOCK;
|
||||
case PGRES_POLLING_WRITING:
|
||||
state->rem_state = RS_CONNECTING_WRITE;
|
||||
return NEON_WALREAD_WOULDBLOCK;
|
||||
case PGRES_POLLING_OK:
|
||||
{
|
||||
/* connection successfully established */
|
||||
char start_repl_query[128];
|
||||
|
||||
snprintf(start_repl_query, sizeof(start_repl_query),
|
||||
"START_REPLICATION PHYSICAL %X/%X (term='" UINT64_FORMAT "')",
|
||||
LSN_FORMAT_ARGS(startptr), state->wp->propTerm);
|
||||
nwr_log(LOG, "connection to %s to fetch WAL succeeded, running %s",
|
||||
state->donor_name, start_repl_query);
|
||||
if (!libpqwp_send_query(state->wp_conn, start_repl_query))
|
||||
{
|
||||
snprintf(state->err_msg, sizeof(state->err_msg),
|
||||
"failed to send %s query to %s: %s",
|
||||
start_repl_query, state->donor_name, PQerrorMessage(state->wp_conn->pg_conn));
|
||||
NeonWALReaderResetRemote(state);
|
||||
return NEON_WALREAD_ERROR;
|
||||
}
|
||||
state->rem_state = RS_WAIT_EXEC_RESULT;
|
||||
break;
|
||||
}
|
||||
|
||||
default: /* there is unused PGRES_POLLING_ACTIVE */
|
||||
Assert(false);
|
||||
return NEON_WALREAD_ERROR; /* keep the compiler quiet */
|
||||
}
|
||||
}
|
||||
|
||||
if (state->rem_state == RS_WAIT_EXEC_RESULT)
|
||||
{
|
||||
switch (libpqwp_get_query_result(state->wp_conn))
|
||||
{
|
||||
case WP_EXEC_SUCCESS_COPYBOTH:
|
||||
state->rem_state = RS_ESTABLISHED;
|
||||
break;
|
||||
case WP_EXEC_NEEDS_INPUT:
|
||||
return NEON_WALREAD_WOULDBLOCK;
|
||||
case WP_EXEC_FAILED:
|
||||
snprintf(state->err_msg, sizeof(state->err_msg),
|
||||
"get START_REPLICATION result from %s failed: %s",
|
||||
state->donor_name, PQerrorMessage(state->wp_conn->pg_conn));
|
||||
NeonWALReaderResetRemote(state);
|
||||
return NEON_WALREAD_ERROR;
|
||||
default: /* can't happen */
|
||||
snprintf(state->err_msg, sizeof(state->err_msg),
|
||||
"get START_REPLICATION result from %s: unexpected result",
|
||||
state->donor_name);
|
||||
NeonWALReaderResetRemote(state);
|
||||
return NEON_WALREAD_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
Assert(state->rem_state == RS_ESTABLISHED);
|
||||
|
||||
/*
|
||||
* If we had the request before, verify args are the same and advance the
|
||||
* result ptr according to the progress; otherwise register the request.
|
||||
*/
|
||||
if (state->req_lsn != InvalidXLogRecPtr)
|
||||
{
|
||||
if (state->req_lsn != startptr || state->req_len != count)
|
||||
{
|
||||
snprintf(state->err_msg, sizeof(state->err_msg),
|
||||
"args changed during request, was %X/%X %zu, now %X/%X %zu",
|
||||
LSN_FORMAT_ARGS(state->req_lsn), state->req_len, LSN_FORMAT_ARGS(startptr), count);
|
||||
NeonWALReaderResetRemote(state);
|
||||
return NEON_WALREAD_ERROR;
|
||||
}
|
||||
nwr_log(DEBUG5, "continuing remote read at req_lsn=%X/%X len=%zu, req_progress=%zu",
|
||||
LSN_FORMAT_ARGS(startptr),
|
||||
count,
|
||||
state->req_progress);
|
||||
buf += state->req_progress;
|
||||
}
|
||||
else
|
||||
{
|
||||
state->req_lsn = startptr;
|
||||
state->req_len = count;
|
||||
state->req_progress = 0;
|
||||
nwr_log(DEBUG5, "starting remote read req_lsn=%X/%X len=%zu",
|
||||
LSN_FORMAT_ARGS(startptr),
|
||||
count);
|
||||
}
|
||||
|
||||
while (true)
|
||||
{
|
||||
Size to_copy;
|
||||
|
||||
/*
|
||||
* If we have no ready data, receive new message.
|
||||
*/
|
||||
if (state->wal_rem_len == 0 &&
|
||||
|
||||
/*
|
||||
* check for the sake of 0 length reads; walproposer does these for
|
||||
 * heartbeats, though generally they shouldn't hit the remote source.
|
||||
*/
|
||||
state->req_len - state->req_progress > 0)
|
||||
{
|
||||
NeonWALReadResult read_msg_res = NeonWALReaderReadMsg(state);
|
||||
|
||||
if (read_msg_res != NEON_WALREAD_SUCCESS)
|
||||
return read_msg_res;
|
||||
}
|
||||
|
||||
if (state->req_lsn + state->req_progress != state->rem_lsn)
|
||||
{
|
||||
snprintf(state->err_msg, sizeof(state->err_msg),
|
||||
"expected remote WAL at %X/%X but got %X/%X. Non monotonic read requests could have caused this. req_lsn=%X/%X len=%zu",
|
||||
LSN_FORMAT_ARGS(state->req_lsn + state->req_progress),
|
||||
LSN_FORMAT_ARGS(state->rem_lsn),
|
||||
LSN_FORMAT_ARGS(state->req_lsn),
|
||||
state->req_len);
|
||||
NeonWALReaderResetRemote(state);
|
||||
return NEON_WALREAD_ERROR;
|
||||
}
|
||||
|
||||
/* We can copy min of (available, requested) bytes. */
|
||||
to_copy =
|
||||
Min(state->req_len - state->req_progress, state->wal_rem_len);
|
||||
memcpy(buf, state->wal_ptr, to_copy);
|
||||
state->wal_ptr += to_copy;
|
||||
state->wal_rem_len -= to_copy;
|
||||
state->rem_lsn += to_copy;
|
||||
if (state->wal_rem_len == 0)
|
||||
state->wal_ptr = NULL; /* freed by libpqwalproposer */
|
||||
buf += to_copy;
|
||||
state->req_progress += to_copy;
|
||||
if (state->req_progress == state->req_len)
|
||||
{
|
||||
XLogSegNo next_segno;
|
||||
XLogSegNo req_segno;
|
||||
|
||||
XLByteToSeg(state->req_lsn, req_segno, state->segcxt.ws_segsize);
|
||||
XLByteToSeg(state->rem_lsn, next_segno, state->segcxt.ws_segsize);
|
||||
|
||||
/*
|
||||
* Request completed. If there is a chance of serving next one
|
||||
* locally, close the connection.
|
||||
*/
|
||||
if (state->req_lsn < state->available_lsn &&
|
||||
state->rem_lsn >= state->available_lsn)
|
||||
{
|
||||
nwr_log(LOG, "closing remote connection as available_lsn %X/%X crossed and next read at %X/%X is likely to be served locally",
|
||||
LSN_FORMAT_ARGS(state->available_lsn), LSN_FORMAT_ARGS(state->rem_lsn));
|
||||
NeonWALReaderResetRemote(state);
|
||||
}
|
||||
else if (state->rem_lsn >= state->available_lsn && next_segno > req_segno &&
|
||||
is_wal_segment_exists(next_segno, state->segcxt.ws_segsize, tli))
|
||||
{
|
||||
nwr_log(LOG, "closing remote connection as WAL file at next lsn %X/%X exists",
|
||||
LSN_FORMAT_ARGS(state->rem_lsn));
|
||||
NeonWALReaderResetRemote(state);
|
||||
}
|
||||
state->req_lsn = InvalidXLogRecPtr;
|
||||
state->req_len = 0;
|
||||
state->req_progress = 0;
|
||||
return NEON_WALREAD_SUCCESS;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Read one WAL message from the stream, sets state->wal_ptr in case of success.
|
||||
* Resets remote state in case of failure.
|
||||
*/
|
||||
static NeonWALReadResult
|
||||
NeonWALReaderReadMsg(NeonWALReader *state)
|
||||
{
|
||||
while (true) /* loop until we get 'w' */
|
||||
{
|
||||
char *copydata_ptr;
|
||||
int copydata_size;
|
||||
StringInfoData s;
|
||||
char msg_type;
|
||||
int hdrlen;
|
||||
|
||||
Assert(state->rem_state == RS_ESTABLISHED);
|
||||
Assert(state->wal_ptr == NULL && state->wal_rem_len == 0);
|
||||
|
||||
switch (libpqwp_async_read(state->wp_conn,
|
||||
©data_ptr,
|
||||
©data_size))
|
||||
{
|
||||
case PG_ASYNC_READ_SUCCESS:
|
||||
break;
|
||||
case PG_ASYNC_READ_TRY_AGAIN:
|
||||
return NEON_WALREAD_WOULDBLOCK;
|
||||
case PG_ASYNC_READ_FAIL:
|
||||
snprintf(state->err_msg,
|
||||
sizeof(state->err_msg),
|
||||
"req_lsn=%X/%X, req_len=%zu, req_progress=%zu, get copydata failed: %s",
|
||||
LSN_FORMAT_ARGS(state->req_lsn),
|
||||
state->req_len,
|
||||
state->req_progress,
|
||||
PQerrorMessage(state->wp_conn->pg_conn));
|
||||
goto err;
|
||||
}
|
||||
|
||||
/* put data on StringInfo to parse */
|
||||
s.data = copydata_ptr;
|
||||
s.len = copydata_size;
|
||||
s.cursor = 0;
|
||||
s.maxlen = -1;
|
||||
|
||||
if (copydata_size == 0)
|
||||
{
|
||||
snprintf(state->err_msg,
|
||||
sizeof(state->err_msg),
|
||||
"zero length copydata received");
|
||||
goto err;
|
||||
}
|
||||
msg_type = pq_getmsgbyte(&s);
|
||||
switch (msg_type)
|
||||
{
|
||||
case 'w':
|
||||
{
|
||||
XLogRecPtr start_lsn;
|
||||
|
||||
hdrlen = sizeof(int64) + sizeof(int64) + sizeof(int64);
|
||||
if (s.len - s.cursor < hdrlen)
|
||||
{
|
||||
snprintf(state->err_msg,
|
||||
sizeof(state->err_msg),
|
||||
"invalid WAL message received from primary");
|
||||
goto err;
|
||||
}
|
||||
|
||||
start_lsn = pq_getmsgint64(&s);
|
||||
pq_getmsgint64(&s); /* XLogRecPtr end_lsn; */
|
||||
pq_getmsgint64(&s); /* TimestampTz send_time */
|
||||
|
||||
state->rem_lsn = start_lsn;
|
||||
state->wal_rem_len = (Size) (s.len - s.cursor);
|
||||
state->wal_ptr = (char *) pq_getmsgbytes(&s, s.len - s.cursor);
|
||||
nwr_log(DEBUG5, "received WAL msg at %X/%X len %zu",
|
||||
LSN_FORMAT_ARGS(state->rem_lsn), state->wal_rem_len);
|
||||
|
||||
return NEON_WALREAD_SUCCESS;
|
||||
}
|
||||
case 'k':
|
||||
{
|
||||
XLogRecPtr end_lsn;
|
||||
bool reply_requested;
|
||||
|
||||
hdrlen = sizeof(int64) + sizeof(int64) + sizeof(char);
|
||||
if (s.len - s.cursor < hdrlen)
|
||||
{
|
||||
snprintf(state->err_msg, sizeof(state->err_msg),
|
||||
"invalid keepalive message received from primary");
|
||||
goto err;
|
||||
}
|
||||
|
||||
end_lsn = pq_getmsgint64(&s);
|
||||
pq_getmsgint64(&s); /* TimestampTz timestamp; */
|
||||
reply_requested = pq_getmsgbyte(&s);
|
||||
nwr_log(DEBUG5, "received keepalive end_lsn=%X/%X reply_requested=%d",
|
||||
LSN_FORMAT_ARGS(end_lsn),
|
||||
reply_requested);
|
||||
if (end_lsn < state->req_lsn + state->req_len)
|
||||
{
|
||||
snprintf(state->err_msg, sizeof(state->err_msg),
|
||||
"closing remote connection: requested WAL up to %X/%X, but current donor %s has only up to %X/%X",
|
||||
LSN_FORMAT_ARGS(state->req_lsn + state->req_len), state->donor_name, LSN_FORMAT_ARGS(end_lsn));
|
||||
goto err;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
default:
|
||||
nwr_log(WARNING, "invalid replication message type %d", msg_type);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
err:
|
||||
NeonWALReaderResetRemote(state);
|
||||
return NEON_WALREAD_ERROR;
|
||||
}
|
||||
|
||||
/* reset remote connection and request in progress */
|
||||
static void
|
||||
NeonWALReaderResetRemote(NeonWALReader *state)
|
||||
{
|
||||
state->req_lsn = InvalidXLogRecPtr;
|
||||
state->req_len = 0;
|
||||
state->req_progress = 0;
|
||||
state->rem_state = RS_NONE;
|
||||
if (state->wp_conn)
|
||||
{
|
||||
libpqwp_disconnect(state->wp_conn);
|
||||
state->wp_conn = NULL;
|
||||
}
|
||||
state->donor_name[0] = '\0';
|
||||
state->wal_ptr = NULL;
|
||||
state->wal_rem_len = 0;
|
||||
state->rem_lsn = InvalidXLogRecPtr;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return socket of connection to remote source. Must be called only when
|
||||
* connection exists (NeonWALReaderEvents returns non zero).
|
||||
*/
|
||||
pgsocket
|
||||
NeonWALReaderSocket(NeonWALReader *state)
|
||||
{
|
||||
if (!state->wp_conn)
|
||||
nwr_log(FATAL, "NeonWALReaderSocket is called without active remote connection");
|
||||
return PQsocket(state->wp_conn->pg_conn);
|
||||
}
|
||||
|
||||
/*
|
||||
 * Whether the remote connection is established. Once it is, the socket stays stable
|
||||
 * until a successful local read or an error, so the user can update socket events
|
||||
 * instead of re-adding the socket each time.
|
||||
*/
|
||||
bool
|
||||
NeonWALReaderIsRemConnEstablished(NeonWALReader *state)
|
||||
{
|
||||
return state->rem_state == RS_ESTABLISHED;
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns events user should wait on connection socket or 0 if remote
|
||||
* connection is not active.
|
||||
*/
|
||||
extern uint32
|
||||
NeonWALReaderEvents(NeonWALReader *state)
|
||||
{
|
||||
switch (state->rem_state)
|
||||
{
|
||||
case RS_NONE:
|
||||
return 0;
|
||||
case RS_CONNECTING_READ:
|
||||
return WL_SOCKET_READABLE;
|
||||
case RS_CONNECTING_WRITE:
|
||||
return WL_SOCKET_WRITEABLE;
|
||||
case RS_WAIT_EXEC_RESULT:
|
||||
case RS_ESTABLISHED:
|
||||
return WL_SOCKET_READABLE;
|
||||
default:
|
||||
Assert(false);
|
||||
return 0; /* make compiler happy */
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli)
|
||||
{
|
||||
char *p;
|
||||
XLogRecPtr recptr;
|
||||
Size nbytes;
|
||||
|
||||
p = buf;
|
||||
recptr = startptr;
|
||||
nbytes = count;
|
||||
|
||||
while (nbytes > 0)
|
||||
{
|
||||
uint32 startoff;
|
||||
int segbytes;
|
||||
int readbytes;
|
||||
|
||||
startoff = XLogSegmentOffset(recptr, state->segcxt.ws_segsize);
|
||||
|
||||
/*
|
||||
* If the data we want is not in a segment we have open, close what we
|
||||
* have (if anything) and open the next one, using the caller's
|
||||
* provided openSegment callback.
|
||||
*/
|
||||
if (state->seg.ws_file < 0 ||
|
||||
!XLByteInSeg(recptr, state->seg.ws_segno, state->segcxt.ws_segsize) ||
|
||||
tli != state->seg.ws_tli)
|
||||
{
|
||||
XLogSegNo nextSegNo;
|
||||
|
||||
neon_wal_segment_close(state);
|
||||
|
||||
XLByteToSeg(recptr, nextSegNo, state->segcxt.ws_segsize);
|
||||
if (!neon_wal_segment_open(state, nextSegNo, &tli))
|
||||
{
|
||||
char fname[MAXFNAMELEN];
|
||||
|
||||
state->wre_errno = errno;
|
||||
|
||||
XLogFileName(fname, tli, nextSegNo, state->segcxt.ws_segsize);
|
||||
snprintf(state->err_msg, sizeof(state->err_msg), "failed to open WAL segment %s while reading at %X/%X: %s",
|
||||
fname, LSN_FORMAT_ARGS(recptr), strerror(state->wre_errno));
|
||||
return false;
|
||||
}
|
||||
|
||||
/* This shouldn't happen -- indicates a bug in segment_open */
|
||||
Assert(state->seg.ws_file >= 0);
|
||||
|
||||
/* Update the current segment info. */
|
||||
state->seg.ws_tli = tli;
|
||||
state->seg.ws_segno = nextSegNo;
|
||||
}
|
||||
|
||||
/* How many bytes are within this segment? */
|
||||
if (nbytes > (state->segcxt.ws_segsize - startoff))
|
||||
segbytes = state->segcxt.ws_segsize - startoff;
|
||||
else
|
||||
segbytes = nbytes;
|
||||
|
||||
#ifndef FRONTEND
|
||||
pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
|
||||
#endif
|
||||
|
||||
/* Reset errno first; eases reporting non-errno-affecting errors */
|
||||
errno = 0;
|
||||
readbytes = pg_pread(state->seg.ws_file, p, segbytes, (off_t) startoff);
|
||||
|
||||
#ifndef FRONTEND
|
||||
pgstat_report_wait_end();
|
||||
#endif
|
||||
|
||||
if (readbytes <= 0)
|
||||
{
|
||||
char fname[MAXFNAMELEN];
|
||||
|
||||
XLogFileName(fname, state->seg.ws_tli, state->seg.ws_segno, state->segcxt.ws_segsize);
|
||||
|
||||
if (readbytes < 0)
|
||||
{
|
||||
state->wre_errno = errno;
|
||||
snprintf(state->err_msg, sizeof(state->err_msg), "could not read from log segment %s, offset %d: %m: %s",
|
||||
fname, startoff, strerror(state->wre_errno));
|
||||
}
|
||||
else
|
||||
{
|
||||
snprintf(state->err_msg, sizeof(state->err_msg), "could not read from log segment %s, offset %d: %m: unexpected EOF",
|
||||
fname, startoff);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Update state for read */
|
||||
recptr += readbytes;
|
||||
nbytes -= readbytes;
|
||||
p += readbytes;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Copy of vanilla wal_segment_open, but returns false in case of error instead
|
||||
* of ERROR, with errno set.
|
||||
*
|
||||
* XLogReaderRoutine->segment_open callback for local pg_wal files
|
||||
*/
|
||||
static bool
|
||||
neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo,
|
||||
TimeLineID *tli_p)
|
||||
{
|
||||
TimeLineID tli = *tli_p;
|
||||
char path[MAXPGPATH];
|
||||
|
||||
XLogFilePath(path, tli, nextSegNo, state->segcxt.ws_segsize);
|
||||
nwr_log(DEBUG5, "opening %s", path);
|
||||
state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY);
|
||||
if (state->seg.ws_file >= 0)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool
|
||||
is_wal_segment_exists(XLogSegNo segno, int segsize, TimeLineID tli)
|
||||
{
|
||||
struct stat stat_buffer;
|
||||
char path[MAXPGPATH];
|
||||
|
||||
XLogFilePath(path, tli, segno, segsize);
|
||||
return stat(path, &stat_buffer) == 0;
|
||||
}
|
||||
|
||||
/* copy of vanilla wal_segment_close with NeonWALReader */
|
||||
static void
|
||||
neon_wal_segment_close(NeonWALReader *state)
|
||||
{
|
||||
if (state->seg.ws_file >= 0)
|
||||
{
|
||||
close(state->seg.ws_file);
|
||||
/* need to check errno? */
|
||||
state->seg.ws_file = -1;
|
||||
}
|
||||
}
|
||||
|
||||
char *
|
||||
NeonWALReaderErrMsg(NeonWALReader *state)
|
||||
{
|
||||
return state->err_msg;
|
||||
}
|
||||
30
pgxn/neon/neon_walreader.h
Normal file
@@ -0,0 +1,30 @@
|
||||
#ifndef __NEON_WALREADER_H__
|
||||
#define __NEON_WALREADER_H__
|
||||
|
||||
#include "access/xlogdefs.h"
|
||||
|
||||
/* forward declare so we don't have to expose the struct to the public */
|
||||
struct NeonWALReader;
|
||||
typedef struct NeonWALReader NeonWALReader;
|
||||
|
||||
/* avoid including walproposer.h as it includes us */
|
||||
struct WalProposer;
|
||||
typedef struct WalProposer WalProposer;
|
||||
|
||||
/* NeonWALRead return value */
|
||||
typedef enum
|
||||
{
|
||||
NEON_WALREAD_SUCCESS,
|
||||
NEON_WALREAD_WOULDBLOCK,
|
||||
NEON_WALREAD_ERROR,
|
||||
} NeonWALReadResult;
|
||||
|
||||
extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix);
|
||||
extern void NeonWALReaderFree(NeonWALReader *state);
|
||||
extern NeonWALReadResult NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
|
||||
extern pgsocket NeonWALReaderSocket(NeonWALReader *state);
|
||||
extern uint32 NeonWALReaderEvents(NeonWALReader *state);
|
||||
extern bool NeonWALReaderIsRemConnEstablished(NeonWALReader *state);
|
||||
extern char *NeonWALReaderErrMsg(NeonWALReader *state);
|
||||
|
||||
#endif /* __NEON_WALREADER_H__ */
|
||||
@@ -45,7 +45,6 @@
|
||||
|
||||
/* Prototypes for private functions */
|
||||
static void WalProposerLoop(WalProposer *wp);
|
||||
static void HackyRemoveWalProposerEvent(Safekeeper *to_remove);
|
||||
static void ShutdownConnection(Safekeeper *sk);
|
||||
static void ResetConnection(Safekeeper *sk);
|
||||
static long TimeToReconnect(WalProposer *wp, TimestampTz now);
|
||||
@@ -78,11 +77,11 @@ static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, Safekeeper
|
||||
static bool AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state);
|
||||
static bool AsyncFlush(Safekeeper *sk);
|
||||
static int CompareLsn(const void *a, const void *b);
|
||||
static char *FormatSafekeeperState(SafekeeperState state);
|
||||
static char *FormatSafekeeperState(Safekeeper *sk);
|
||||
static void AssertEventsOkForState(uint32 events, Safekeeper *sk);
|
||||
static uint32 SafekeeperStateDesiredEvents(SafekeeperState state);
|
||||
static char *FormatEvents(WalProposer *wp, uint32 events);
|
||||
|
||||
|
||||
WalProposer *
|
||||
WalProposerCreate(WalProposerConfig *config, walproposer_api api)
|
||||
{
|
||||
@@ -113,6 +112,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
|
||||
wp->safekeeper[wp->n_safekeepers].host = host;
|
||||
wp->safekeeper[wp->n_safekeepers].port = port;
|
||||
wp->safekeeper[wp->n_safekeepers].state = SS_OFFLINE;
|
||||
wp->safekeeper[wp->n_safekeepers].active_state = SS_ACTIVE_SEND;
|
||||
wp->safekeeper[wp->n_safekeepers].wp = wp;
|
||||
|
||||
{
|
||||
@@ -127,8 +127,6 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
|
||||
}
|
||||
|
||||
initStringInfo(&wp->safekeeper[wp->n_safekeepers].outbuf);
|
||||
wp->api.wal_reader_allocate(&wp->safekeeper[wp->n_safekeepers]);
|
||||
wp->safekeeper[wp->n_safekeepers].flushWrite = false;
|
||||
wp->safekeeper[wp->n_safekeepers].startStreamingAt = InvalidXLogRecPtr;
|
||||
wp->safekeeper[wp->n_safekeepers].streamingAt = InvalidXLogRecPtr;
|
||||
wp->n_safekeepers += 1;
|
||||
@@ -277,7 +275,7 @@ WalProposerPoll(WalProposer *wp)
|
||||
wp->config->safekeeper_connection_timeout))
|
||||
{
|
||||
walprop_log(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that",
|
||||
sk->host, sk->port, FormatSafekeeperState(sk->state), wp->config->safekeeper_connection_timeout);
|
||||
sk->host, sk->port, FormatSafekeeperState(sk), wp->config->safekeeper_connection_timeout);
|
||||
ShutdownConnection(sk);
|
||||
}
|
||||
}
|
||||
@@ -305,58 +303,20 @@ WalProposerLoop(WalProposer *wp)
|
||||
WalProposerPoll(wp);
|
||||
}
|
||||
|
||||
/*
|
||||
* Hack: provides a way to remove the event corresponding to an individual walproposer from the set.
|
||||
*
|
||||
* Note: Internally, this completely reconstructs the event set. It should be avoided if possible.
|
||||
*/
|
||||
static void
|
||||
HackyRemoveWalProposerEvent(Safekeeper *to_remove)
|
||||
{
|
||||
WalProposer *wp = to_remove->wp;
|
||||
|
||||
/* Remove the existing event set, assign sk->eventPos = -1 */
|
||||
wp->api.free_event_set(wp);
|
||||
/* Re-initialize it without adding any safekeeper events */
|
||||
wp->api.init_event_set(wp);
|
||||
|
||||
/*
|
||||
* loop through the existing safekeepers. If they aren't the one we're
|
||||
* removing, and if they have a socket we can use, re-add the applicable
|
||||
* events.
|
||||
*/
|
||||
for (int i = 0; i < wp->n_safekeepers; i++)
|
||||
{
|
||||
uint32 desired_events = WL_NO_EVENTS;
|
||||
Safekeeper *sk = &wp->safekeeper[i];
|
||||
|
||||
if (sk == to_remove)
|
||||
continue;
|
||||
|
||||
/* If this safekeeper isn't offline, add an event for it! */
|
||||
if (sk->state != SS_OFFLINE)
|
||||
{
|
||||
desired_events = SafekeeperStateDesiredEvents(sk->state);
|
||||
/* will set sk->eventPos */
|
||||
wp->api.add_safekeeper_event_set(sk, desired_events);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Shuts down and cleans up the connection for a safekeeper. Sets its state to SS_OFFLINE */
|
||||
static void
|
||||
ShutdownConnection(Safekeeper *sk)
|
||||
{
|
||||
sk->wp->api.conn_finish(sk);
|
||||
sk->state = SS_OFFLINE;
|
||||
sk->flushWrite = false;
|
||||
sk->streamingAt = InvalidXLogRecPtr;
|
||||
|
||||
if (sk->voteResponse.termHistory.entries)
|
||||
pfree(sk->voteResponse.termHistory.entries);
|
||||
sk->voteResponse.termHistory.entries = NULL;
|
||||
|
||||
HackyRemoveWalProposerEvent(sk);
|
||||
sk->wp->api.conn_finish(sk);
|
||||
sk->wp->api.rm_safekeeper_event_set(sk);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -474,7 +434,9 @@ ReconnectSafekeepers(WalProposer *wp)
|
||||
static void
|
||||
AdvancePollState(Safekeeper *sk, uint32 events)
|
||||
{
|
||||
#ifdef WALPROPOSER_LIB /* walprop_log needs wp in lib build */
|
||||
WalProposer *wp = sk->wp;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Sanity check. We assume further down that the operations don't block
|
||||
@@ -527,7 +489,7 @@ AdvancePollState(Safekeeper *sk, uint32 events)
|
||||
*/
|
||||
case SS_VOTING:
|
||||
walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
|
||||
sk->port, FormatSafekeeperState(sk->state));
|
||||
sk->port, FormatSafekeeperState(sk));
|
||||
ResetConnection(sk);
|
||||
return;
|
||||
|
||||
@@ -556,7 +518,7 @@ AdvancePollState(Safekeeper *sk, uint32 events)
|
||||
*/
|
||||
case SS_IDLE:
|
||||
walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
|
||||
sk->port, FormatSafekeeperState(sk->state));
|
||||
sk->port, FormatSafekeeperState(sk));
|
||||
ResetConnection(sk);
|
||||
return;
|
||||
|
||||
@@ -622,7 +584,7 @@ HandleConnectionEvent(Safekeeper *sk)
|
||||
* Because PQconnectPoll can change the socket, we have to un-register the
|
||||
* old event and re-register an event on the new socket.
|
||||
*/
|
||||
HackyRemoveWalProposerEvent(sk);
|
||||
wp->api.rm_safekeeper_event_set(sk);
|
||||
wp->api.add_safekeeper_event_set(sk, new_events);
|
||||
|
||||
/* If we successfully connected, send START_WAL_PUSH query */
|
||||
@@ -847,7 +809,7 @@ RecvVoteResponse(Safekeeper *sk)
|
||||
}
|
||||
else if (wp->n_votes > wp->quorum)
|
||||
{
|
||||
/* recovery already performed, just start streaming */
|
||||
/* already elected, start streaming */
|
||||
SendProposerElected(sk);
|
||||
}
|
||||
else
|
||||
@@ -873,21 +835,16 @@ HandleElectedProposer(WalProposer *wp)
|
||||
DetermineEpochStartLsn(wp);
|
||||
|
||||
/*
|
||||
* Check if not all safekeepers are up-to-date, we need to download WAL
|
||||
* needed to synchronize them
|
||||
* Synchronously download WAL from the most advanced safekeeper. We do
|
||||
* that only for logical replication (and switching logical walsenders to
|
||||
* neon_walreader is a todo.)
|
||||
*/
|
||||
if (wp->truncateLsn < wp->propEpochStartLsn)
|
||||
if (!wp->api.recovery_download(wp, &wp->safekeeper[wp->donor]))
|
||||
{
|
||||
walprop_log(LOG,
|
||||
"start recovery because truncateLsn=%X/%X is not "
|
||||
"equal to epochStartLsn=%X/%X",
|
||||
LSN_FORMAT_ARGS(wp->truncateLsn),
|
||||
LSN_FORMAT_ARGS(wp->propEpochStartLsn));
|
||||
/* Perform recovery */
|
||||
if (!wp->api.recovery_download(&wp->safekeeper[wp->donor], wp->greetRequest.timeline, wp->truncateLsn, wp->propEpochStartLsn))
|
||||
walprop_log(FATAL, "Failed to recover state");
|
||||
walprop_log(FATAL, "failed to download WAL for logical replicaiton");
|
||||
}
|
||||
else if (wp->config->syncSafekeepers)
|
||||
|
||||
if (wp->truncateLsn == wp->propEpochStartLsn && wp->config->syncSafekeepers)
|
||||
{
|
||||
/* Sync is not needed: just exit */
|
||||
wp->api.finish_sync_safekeepers(wp, wp->propEpochStartLsn);
|
||||
@@ -1085,13 +1042,6 @@ DetermineEpochStartLsn(WalProposer *wp)
|
||||
}
|
||||
walprop_shared->mineLastElectedTerm = wp->propTerm;
|
||||
}
|
||||
|
||||
/*
|
||||
* WalProposer has just elected itself and initialized history, so we can
|
||||
* call election callback. Usually it updates truncateLsn to fetch WAL for
|
||||
* logical replication.
|
||||
*/
|
||||
wp->api.after_election(wp);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1112,6 +1062,9 @@ SendProposerElected(Safekeeper *sk)
|
||||
term_t lastCommonTerm;
|
||||
int i;
|
||||
|
||||
/* Now that we are ready to send it's a good moment to create WAL reader */
|
||||
wp->api.wal_reader_allocate(sk);
|
||||
|
||||
/*
|
||||
* Determine start LSN by comparing safekeeper's log term switch history
|
||||
* and proposer's, searching for the divergence point.
|
||||
@@ -1231,6 +1184,7 @@ StartStreaming(Safekeeper *sk)
|
||||
* once for a connection.
|
||||
*/
|
||||
sk->state = SS_ACTIVE;
|
||||
sk->active_state = SS_ACTIVE_SEND;
|
||||
sk->streamingAt = sk->startStreamingAt;
|
||||
|
||||
/* event set will be updated inside SendMessageToNode */
|
||||
@@ -1289,9 +1243,13 @@ HandleActiveState(Safekeeper *sk, uint32 events)
|
||||
{
|
||||
WalProposer *wp = sk->wp;
|
||||
|
||||
uint32 newEvents = WL_SOCKET_READABLE;
|
||||
|
||||
if (events & WL_SOCKET_WRITEABLE)
|
||||
/*
* Note: we don't know which socket awoke us (sk or nwr). However, as
* SendAppendRequests always tries to send at least one msg in
* SS_ACTIVE_SEND, be careful not to go there if we are only after an sk
* response; otherwise it'd create a busy loop of pings.
*/
|
||||
if (events & WL_SOCKET_WRITEABLE || sk->active_state == SS_ACTIVE_READ_WAL)
|
||||
if (!SendAppendRequests(sk))
|
||||
return;
|
||||
|
||||
@@ -1299,28 +1257,29 @@ HandleActiveState(Safekeeper *sk, uint32 events)
|
||||
if (!RecvAppendResponses(sk))
|
||||
return;
|
||||
|
||||
/*
|
||||
* We should wait for WL_SOCKET_WRITEABLE event if we have unflushed data
|
||||
* in the buffer.
|
||||
*
|
||||
* LSN comparison checks if we have pending unsent messages. This check
|
||||
* isn't necessary now, because we always send append messages immediately
|
||||
* after arrival. But it's good to have it here in case we change this
|
||||
* behavior in the future.
|
||||
*/
|
||||
if (sk->streamingAt != wp->availableLsn || sk->flushWrite)
|
||||
newEvents |= WL_SOCKET_WRITEABLE;
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
/* expected never to happen, c.f. walprop_pg_active_state_update_event_set */
|
||||
if (events & WL_SOCKET_CLOSED)
|
||||
{
|
||||
walprop_log(WARNING, "connection to %s:%s in active state failed, got WL_SOCKET_CLOSED on neon_walreader socket",
|
||||
sk->host, sk->port);
|
||||
ShutdownConnection(sk);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
wp->api.update_event_set(sk, newEvents);
|
||||
/* configures event set for yield whatever is the substate */
|
||||
wp->api.active_state_update_event_set(sk);
|
||||
}
|
||||
|
||||
/*
|
||||
* Send WAL messages starting from sk->streamingAt until the end or non-writable
|
||||
* socket, whichever comes first. Caller should take care of updating event set.
|
||||
* Even if no unsent WAL is available, at least one empty message will be sent
|
||||
* as a heartbeat, if socket is ready.
|
||||
* socket or neon_walreader blocks, whichever comes first; active_state is
|
||||
* updated accordingly. Caller should take care of updating event set. Even if
|
||||
* no unsent WAL is available, at least one empty message will be sent as a
|
||||
* heartbeat, if socket is ready.
|
||||
*
|
||||
* Can change state if Async* functions encounter errors and reset connection.
|
||||
* Resets state and kills the connections if any error on them is encountered.
|
||||
* Returns false in this case, true otherwise.
|
||||
*/
|
||||
static bool
|
||||
@@ -1328,11 +1287,11 @@ SendAppendRequests(Safekeeper *sk)
|
||||
{
|
||||
WalProposer *wp = sk->wp;
|
||||
XLogRecPtr endLsn;
|
||||
AppendRequestHeader *req;
|
||||
PGAsyncWriteResult writeResult;
|
||||
bool sentAnything = false;
|
||||
AppendRequestHeader *req;
|
||||
|
||||
if (sk->flushWrite)
|
||||
if (sk->active_state == SS_ACTIVE_FLUSH)
|
||||
{
|
||||
if (!AsyncFlush(sk))
|
||||
|
||||
@@ -1343,76 +1302,101 @@ SendAppendRequests(Safekeeper *sk)
|
||||
return sk->state == SS_ACTIVE;
|
||||
|
||||
/* Event set will be updated in the end of HandleActiveState */
|
||||
sk->flushWrite = false;
|
||||
sk->active_state = SS_ACTIVE_SEND;
|
||||
}
|
||||
|
||||
while (sk->streamingAt != wp->availableLsn || !sentAnything)
|
||||
{
|
||||
sentAnything = true;
|
||||
|
||||
endLsn = sk->streamingAt;
|
||||
endLsn += MAX_SEND_SIZE;
|
||||
|
||||
/* if we went beyond available WAL, back off */
|
||||
if (endLsn > wp->availableLsn)
|
||||
if (sk->active_state == SS_ACTIVE_SEND)
|
||||
{
|
||||
endLsn = wp->availableLsn;
|
||||
sentAnything = true;
|
||||
|
||||
endLsn = sk->streamingAt;
|
||||
endLsn += MAX_SEND_SIZE;
|
||||
|
||||
/* if we went beyond available WAL, back off */
|
||||
if (endLsn > wp->availableLsn)
|
||||
{
|
||||
endLsn = wp->availableLsn;
|
||||
}
|
||||
|
||||
req = &sk->appendRequest;
|
||||
PrepareAppendRequest(sk->wp, &sk->appendRequest, sk->streamingAt, endLsn);
|
||||
|
||||
walprop_log(DEBUG5, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s",
|
||||
req->endLsn - req->beginLsn,
|
||||
LSN_FORMAT_ARGS(req->beginLsn),
|
||||
LSN_FORMAT_ARGS(req->endLsn),
|
||||
LSN_FORMAT_ARGS(req->commitLsn),
|
||||
LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port);
|
||||
|
||||
resetStringInfo(&sk->outbuf);
|
||||
|
||||
/* write AppendRequest header */
|
||||
appendBinaryStringInfo(&sk->outbuf, (char *) req, sizeof(AppendRequestHeader));
|
||||
enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn);
|
||||
sk->active_state = SS_ACTIVE_READ_WAL;
|
||||
}
|
||||
|
||||
req = &sk->appendRequest;
|
||||
PrepareAppendRequest(sk->wp, &sk->appendRequest, sk->streamingAt, endLsn);
|
||||
|
||||
walprop_log(DEBUG2, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s",
|
||||
req->endLsn - req->beginLsn,
|
||||
LSN_FORMAT_ARGS(req->beginLsn),
|
||||
LSN_FORMAT_ARGS(req->endLsn),
|
||||
LSN_FORMAT_ARGS(req->commitLsn),
|
||||
LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port);
|
||||
|
||||
resetStringInfo(&sk->outbuf);
|
||||
|
||||
/* write AppendRequest header */
|
||||
appendBinaryStringInfo(&sk->outbuf, (char *) req, sizeof(AppendRequestHeader));
|
||||
|
||||
/* write the WAL itself */
|
||||
enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn);
|
||||
/* wal_read will raise error on failure */
|
||||
wp->api.wal_read(sk,
|
||||
&sk->outbuf.data[sk->outbuf.len],
|
||||
req->beginLsn,
|
||||
req->endLsn - req->beginLsn);
|
||||
sk->outbuf.len += req->endLsn - req->beginLsn;
|
||||
|
||||
writeResult = wp->api.conn_async_write(sk, sk->outbuf.data, sk->outbuf.len);
|
||||
|
||||
/* Mark current message as sent, whatever the result is */
|
||||
sk->streamingAt = endLsn;
|
||||
|
||||
switch (writeResult)
|
||||
if (sk->active_state == SS_ACTIVE_READ_WAL)
|
||||
{
|
||||
case PG_ASYNC_WRITE_SUCCESS:
|
||||
/* Continue writing the next message */
|
||||
break;
|
||||
char *errmsg;
|
||||
|
||||
case PG_ASYNC_WRITE_TRY_FLUSH:
|
||||
req = &sk->appendRequest;
|
||||
|
||||
/*
|
||||
* We still need to call PQflush some more to finish the
* job. Caller function will handle this by setting right
* event set.
|
||||
*/
|
||||
sk->flushWrite = true;
|
||||
return true;
|
||||
switch (wp->api.wal_read(sk,
|
||||
&sk->outbuf.data[sk->outbuf.len],
|
||||
req->beginLsn,
|
||||
req->endLsn - req->beginLsn,
|
||||
&errmsg))
|
||||
{
|
||||
case NEON_WALREAD_SUCCESS:
|
||||
break;
|
||||
case NEON_WALREAD_WOULDBLOCK:
|
||||
return true;
|
||||
case NEON_WALREAD_ERROR:
|
||||
walprop_log(WARNING, "WAL reading for node %s:%s failed: %s",
|
||||
sk->host, sk->port, errmsg);
|
||||
ShutdownConnection(sk);
|
||||
return false;
|
||||
default:
|
||||
Assert(false);
|
||||
}
|
||||
|
||||
case PG_ASYNC_WRITE_FAIL:
|
||||
walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
|
||||
sk->host, sk->port, FormatSafekeeperState(sk->state),
|
||||
wp->api.conn_error_message(sk));
|
||||
ShutdownConnection(sk);
|
||||
return false;
|
||||
default:
|
||||
Assert(false);
|
||||
return false;
|
||||
sk->outbuf.len += req->endLsn - req->beginLsn;
|
||||
|
||||
writeResult = wp->api.conn_async_write(sk, sk->outbuf.data, sk->outbuf.len);
|
||||
|
||||
/* Mark current message as sent, whatever the result is */
|
||||
sk->streamingAt = req->endLsn;
|
||||
|
||||
switch (writeResult)
|
||||
{
|
||||
case PG_ASYNC_WRITE_SUCCESS:
|
||||
/* Continue writing the next message */
|
||||
sk->active_state = SS_ACTIVE_SEND;
|
||||
break;
|
||||
|
||||
case PG_ASYNC_WRITE_TRY_FLUSH:
|
||||
|
||||
/*
|
||||
* We still need to call PQflush some more to finish the
|
||||
* job. Caller function will handle this by setting right
|
||||
* event set.
|
||||
*/
|
||||
sk->active_state = SS_ACTIVE_FLUSH;
|
||||
return true;
|
||||
|
||||
case PG_ASYNC_WRITE_FAIL:
|
||||
walprop_log(WARNING, "failed to send to node %s:%s in %s state: %s",
|
||||
sk->host, sk->port, FormatSafekeeperState(sk),
|
||||
wp->api.conn_error_message(sk));
|
||||
ShutdownConnection(sk);
|
||||
return false;
|
||||
default:
|
||||
Assert(false);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1422,7 +1406,7 @@ SendAppendRequests(Safekeeper *sk)
|
||||
/*
|
||||
* Receive and process all available feedback.
|
||||
*
|
||||
* Can change state if Async* functions encounter errors and reset connection.
|
||||
* Resets state and kills the connection if any error on it is encountered.
|
||||
* Returns false in this case, true otherwise.
|
||||
*
|
||||
* NB: This function can call SendMessageToNode and produce new messages.
|
||||
@@ -1608,39 +1592,77 @@ GetAcknowledgedByQuorumWALPosition(WalProposer *wp)
|
||||
return responses[wp->n_safekeepers - wp->quorum];
|
||||
}
|
||||
|
||||
/*
|
||||
* Return safekeeper with active connection from which WAL can be downloaded, or
|
||||
* none if it doesn't exist. donor_lsn is set to end position of the donor to
|
||||
* the best of our knowledge.
|
||||
*/
|
||||
Safekeeper *
|
||||
GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn)
|
||||
{
|
||||
*donor_lsn = InvalidXLogRecPtr;
|
||||
Safekeeper *donor = NULL;
|
||||
int i;
|
||||
|
||||
if (wp->n_votes < wp->quorum)
|
||||
{
|
||||
walprop_log(WARNING, "GetDonor called before elections are won");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* First, consider the node which determined our term start LSN, as we know
* its position immediately after election, before any feedback is
* sent.
|
||||
*/
|
||||
if (wp->safekeeper[wp->donor].state >= SS_IDLE)
|
||||
{
|
||||
donor = &wp->safekeeper[wp->donor];
|
||||
*donor_lsn = wp->propEpochStartLsn;
|
||||
}
|
||||
|
||||
/*
|
||||
* But also check feedbacks from all nodes with live connections and take
|
||||
* the highest one. Note: if a node sends feedback, it has already processed
* the elected message, so its term is fine.
|
||||
*/
|
||||
for (i = 0; i < wp->n_safekeepers; i++)
|
||||
{
|
||||
Safekeeper *sk = &wp->safekeeper[i];
|
||||
|
||||
if (sk->state == SS_ACTIVE && sk->appendResponse.flushLsn > *donor_lsn)
|
||||
{
|
||||
donor = sk;
|
||||
*donor_lsn = sk->appendResponse.flushLsn;
|
||||
}
|
||||
}
|
||||
return donor;
|
||||
}
|
||||
|
||||
static void
|
||||
HandleSafekeeperResponse(WalProposer *wp)
|
||||
{
|
||||
XLogRecPtr minQuorumLsn;
|
||||
XLogRecPtr minFlushLsn;
|
||||
XLogRecPtr candidateTruncateLsn;
|
||||
|
||||
minQuorumLsn = GetAcknowledgedByQuorumWALPosition(wp);
|
||||
wp->api.process_safekeeper_feedback(wp, minQuorumLsn);
|
||||
|
||||
/*
|
||||
* Try to advance truncateLsn to minFlushLsn, which is the last record
|
||||
* flushed to all safekeepers. We must always start streaming from the
|
||||
* beginning of the record, which simplifies decoding on the far end.
|
||||
* Try to advance truncateLsn -- the last record flushed to all
|
||||
* safekeepers.
|
||||
*
|
||||
* Advanced truncateLsn should be not further than nearest commitLsn. This
|
||||
* prevents surprising violation of truncateLsn <= commitLsn invariant
|
||||
* which might occur because 1) truncateLsn can be advanced immediately
|
||||
* once chunk is broadcast to all safekeepers, and commitLsn generally
|
||||
* can't be advanced based on feedback from safekeeper who is still in the
|
||||
* previous epoch (similar to 'leader can't commit entries from previous
|
||||
* term' in Raft); 2) chunks we read from WAL and send are plain sheets of
|
||||
* bytes, but safekeepers ack only on record boundaries.
|
||||
* Advanced truncateLsn should be not higher than commitLsn. This prevents
|
||||
* surprising violation of truncateLsn <= commitLsn invariant which might
|
||||
* occur because commitLsn generally can't be advanced based on feedback
|
||||
* from safekeeper who is still in the previous epoch (similar to 'leader
|
||||
* can't commit entries from previous term' in Raft).
|
||||
*/
|
||||
minFlushLsn = CalculateMinFlushLsn(wp);
|
||||
if (minFlushLsn > wp->truncateLsn)
|
||||
candidateTruncateLsn = CalculateMinFlushLsn(wp);
|
||||
candidateTruncateLsn = Min(candidateTruncateLsn, minQuorumLsn);
|
||||
if (candidateTruncateLsn > wp->truncateLsn)
|
||||
{
|
||||
wp->truncateLsn = minFlushLsn;
|
||||
|
||||
/*
|
||||
* Advance the replication slot to free up old WAL files. Note that
|
||||
* slot doesn't exist if we are in syncSafekeepers mode.
|
||||
*/
|
||||
wp->api.confirm_wal_streamed(wp, wp->truncateLsn);
|
||||
wp->truncateLsn = candidateTruncateLsn;
|
||||
}
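	/*
	 * Illustrative example (hypothetical LSNs, editor's note): if
	 * CalculateMinFlushLsn() returns 0/2000 but minQuorumLsn is 0/1800,
	 * candidateTruncateLsn is capped to 0/1800, so truncateLsn never
	 * overtakes the quorum commit position even though every safekeeper
	 * has flushed further.
	 */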
|
||||
|
||||
/*
|
||||
@@ -1713,7 +1735,7 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size)
|
||||
|
||||
case PG_ASYNC_READ_FAIL:
|
||||
walprop_log(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host,
|
||||
sk->port, FormatSafekeeperState(sk->state),
|
||||
sk->port, FormatSafekeeperState(sk),
|
||||
wp->api.conn_error_message(sk));
|
||||
ShutdownConnection(sk);
|
||||
return false;
|
||||
@@ -1753,7 +1775,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg)
|
||||
if (tag != anymsg->tag)
|
||||
{
|
||||
walprop_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
|
||||
sk->port, FormatSafekeeperState(sk->state));
|
||||
sk->port, FormatSafekeeperState(sk));
|
||||
ResetConnection(sk);
|
||||
return false;
|
||||
}
|
||||
@@ -1824,12 +1846,13 @@ static bool
|
||||
BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state)
|
||||
{
|
||||
WalProposer *wp = sk->wp;
|
||||
uint32 events;
|
||||
uint32 sk_events;
|
||||
uint32 nwr_events;
|
||||
|
||||
if (!wp->api.conn_blocking_write(sk, msg, msg_size))
|
||||
{
|
||||
walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
|
||||
sk->host, sk->port, FormatSafekeeperState(sk->state),
|
||||
sk->host, sk->port, FormatSafekeeperState(sk),
|
||||
wp->api.conn_error_message(sk));
|
||||
ShutdownConnection(sk);
|
||||
return false;
|
||||
@@ -1841,9 +1864,15 @@ BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState succes
|
||||
* If the new state will be waiting for events to happen, update the event
|
||||
* set to wait for those
|
||||
*/
|
||||
events = SafekeeperStateDesiredEvents(success_state);
|
||||
if (events)
|
||||
wp->api.update_event_set(sk, events);
|
||||
SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events);
|
||||
|
||||
/*
|
||||
* nwr_events is relevant only during SS_ACTIVE which doesn't use
|
||||
* BlockingWrite
|
||||
*/
|
||||
Assert(!nwr_events);
|
||||
if (sk_events)
|
||||
wp->api.update_event_set(sk, sk_events);
|
||||
|
||||
return true;
|
||||
}
|
||||
@@ -1876,7 +1905,7 @@ AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_sta
|
||||
return false;
|
||||
case PG_ASYNC_WRITE_FAIL:
|
||||
walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
|
||||
sk->host, sk->port, FormatSafekeeperState(sk->state),
|
||||
sk->host, sk->port, FormatSafekeeperState(sk),
|
||||
wp->api.conn_error_message(sk));
|
||||
ShutdownConnection(sk);
|
||||
return false;
|
||||
@@ -1915,7 +1944,7 @@ AsyncFlush(Safekeeper *sk)
|
||||
return false;
|
||||
case -1:
|
||||
walprop_log(WARNING, "Failed to flush write to node %s:%s in %s state: %s",
|
||||
sk->host, sk->port, FormatSafekeeperState(sk->state),
|
||||
sk->host, sk->port, FormatSafekeeperState(sk),
|
||||
wp->api.conn_error_message(sk));
|
||||
ResetConnection(sk);
|
||||
return false;
|
||||
@@ -1945,18 +1974,18 @@ CompareLsn(const void *a, const void *b)
|
||||
*
|
||||
* The strings are intended to be used as a prefix to "state", e.g.:
|
||||
*
|
||||
* walprop_log(LOG, "currently in %s state", FormatSafekeeperState(sk->state));
|
||||
* walprop_log(LOG, "currently in %s state", FormatSafekeeperState(sk));
|
||||
*
|
||||
* If this sort of phrasing doesn't fit the message, instead use something like:
|
||||
*
|
||||
* walprop_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state));
|
||||
* walprop_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk));
|
||||
*/
|
||||
static char *
|
||||
FormatSafekeeperState(SafekeeperState state)
|
||||
FormatSafekeeperState(Safekeeper *sk)
|
||||
{
|
||||
char *return_val = NULL;
|
||||
|
||||
switch (state)
|
||||
switch (sk->state)
|
||||
{
|
||||
case SS_OFFLINE:
|
||||
return_val = "offline";
|
||||
@@ -1984,7 +2013,18 @@ FormatSafekeeperState(SafekeeperState state)
|
||||
return_val = "idle";
|
||||
break;
|
||||
case SS_ACTIVE:
|
||||
return_val = "active";
|
||||
switch (sk->active_state)
|
||||
{
|
||||
case SS_ACTIVE_SEND:
|
||||
return_val = "active send";
|
||||
break;
|
||||
case SS_ACTIVE_READ_WAL:
|
||||
return_val = "active read WAL";
|
||||
break;
|
||||
case SS_ACTIVE_FLUSH:
|
||||
return_val = "active flush";
|
||||
break;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -1997,22 +2037,21 @@ FormatSafekeeperState(SafekeeperState state)
|
||||
static void
|
||||
AssertEventsOkForState(uint32 events, Safekeeper *sk)
|
||||
{
|
||||
WalProposer *wp = sk->wp;
|
||||
uint32 expected = SafekeeperStateDesiredEvents(sk->state);
|
||||
|
||||
/*
|
||||
* The events are in-line with what we're expecting, under two conditions:
|
||||
* (a) if we aren't expecting anything, `events` has no read- or
|
||||
* write-ready component. (b) if we are expecting something, there's
|
||||
* overlap (i.e. `events & expected != 0`)
|
||||
*/
|
||||
uint32 sk_events;
|
||||
uint32 nwr_events;
|
||||
uint32 expected;
|
||||
bool events_ok_for_state; /* long name so the `Assert` is more
|
||||
* clear later */
|
||||
WalProposer *wp = sk->wp;
|
||||
|
||||
if (expected == WL_NO_EVENTS)
|
||||
events_ok_for_state = ((events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) == 0);
|
||||
else
|
||||
events_ok_for_state = ((events & expected) != 0);
|
||||
SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events);
|
||||
|
||||
/*
|
||||
* Without one more level of notify target indirection we have no way to
|
||||
* distinguish which socket woke us up, so just union the expected events.
|
||||
*/
|
||||
expected = sk_events | nwr_events;
|
||||
events_ok_for_state = ((events & expected) != 0);
|
||||
|
||||
if (!events_ok_for_state)
|
||||
{
|
||||
@@ -2021,36 +2060,39 @@ AssertEventsOkForState(uint32 events, Safekeeper *sk)
|
||||
* and then an assertion that's guaranteed to fail.
|
||||
*/
|
||||
walprop_log(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]",
|
||||
FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk->state));
|
||||
FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk));
|
||||
Assert(events_ok_for_state);
|
||||
}
|
||||
}
|
||||
|
||||
/* Returns the set of events a safekeeper in this state should be waiting on
|
||||
/* Returns the set of events for both safekeeper (sk_events) and neon_walreader
|
||||
* (nwr_events) sockets a safekeeper in this state should be waiting on.
|
||||
*
|
||||
* This will return WL_NO_EVENTS (= 0) for some events. */
|
||||
static uint32
|
||||
SafekeeperStateDesiredEvents(SafekeeperState state)
|
||||
void
|
||||
SafekeeperStateDesiredEvents(Safekeeper *sk, uint32 *sk_events, uint32 *nwr_events)
|
||||
{
|
||||
uint32 result = WL_NO_EVENTS;
|
||||
WalProposer *wp = sk->wp;
|
||||
|
||||
*nwr_events = 0; /* nwr_events is empty for most states */
|
||||
|
||||
/* If the state doesn't have a modifier, we can check the base state */
|
||||
switch (state)
|
||||
switch (sk->state)
|
||||
{
|
||||
/* Connecting states say what they want in the name */
|
||||
case SS_CONNECTING_READ:
|
||||
result = WL_SOCKET_READABLE;
|
||||
break;
|
||||
*sk_events = WL_SOCKET_READABLE;
|
||||
return;
|
||||
case SS_CONNECTING_WRITE:
|
||||
result = WL_SOCKET_WRITEABLE;
|
||||
break;
|
||||
*sk_events = WL_SOCKET_WRITEABLE;
|
||||
return;
|
||||
|
||||
/* Reading states need the socket to be read-ready to continue */
|
||||
case SS_WAIT_EXEC_RESULT:
|
||||
case SS_HANDSHAKE_RECV:
|
||||
case SS_WAIT_VERDICT:
|
||||
result = WL_SOCKET_READABLE;
|
||||
break;
|
||||
*sk_events = WL_SOCKET_READABLE;
|
||||
return;
|
||||
|
||||
/*
|
||||
* Idle states use read-readiness as a sign that the connection
|
||||
@@ -2058,32 +2100,66 @@ SafekeeperStateDesiredEvents(SafekeeperState state)
|
||||
*/
|
||||
case SS_VOTING:
|
||||
case SS_IDLE:
|
||||
result = WL_SOCKET_READABLE;
|
||||
break;
|
||||
*sk_events = WL_SOCKET_READABLE;
|
||||
return;
|
||||
|
||||
/*
|
||||
* Flush states require write-ready for flushing. Active state
|
||||
* does both reading and writing.
|
||||
*
|
||||
* TODO: SS_ACTIVE sometimes doesn't need to be write-ready. We
|
||||
* should check sk->flushWrite here to set WL_SOCKET_WRITEABLE.
|
||||
*/
|
||||
case SS_SEND_ELECTED_FLUSH:
|
||||
*sk_events = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE;
|
||||
return;
|
||||
|
||||
case SS_ACTIVE:
|
||||
result = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE;
|
||||
break;
|
||||
switch (sk->active_state)
|
||||
{
|
||||
/*
|
||||
* Everything is sent; we just wait for sk responses and
|
||||
* latch.
|
||||
*
|
||||
* Note: this assumes we send all available WAL to
|
||||
* safekeeper in one wakeup (unless it blocks). Otherwise
|
||||
* we would want WL_SOCKET_WRITEABLE here to finish the
|
||||
* work.
|
||||
*/
|
||||
case SS_ACTIVE_SEND:
|
||||
*sk_events = WL_SOCKET_READABLE;
|
||||
/* c.f. walprop_pg_active_state_update_event_set */
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
if (wp->api.wal_reader_events(sk))
|
||||
*nwr_events = WL_SOCKET_CLOSED;
|
||||
#endif /* on PG 14 nwr_events remains 0 */
|
||||
return;
|
||||
|
||||
/*
|
||||
* Waiting for neon_walreader socket, but we still read
|
||||
* responses from sk socket.
|
||||
*/
|
||||
case SS_ACTIVE_READ_WAL:
|
||||
*sk_events = WL_SOCKET_READABLE;
|
||||
*nwr_events = wp->api.wal_reader_events(sk);
|
||||
return;
|
||||
|
||||
/*
|
||||
* Need to flush the sk socket, so ignore neon_walreader
|
||||
* one and set write interest on sk.
|
||||
*/
|
||||
case SS_ACTIVE_FLUSH:
|
||||
*sk_events = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE;
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
/* c.f. walprop_pg_active_state_update_event_set */
|
||||
if (wp->api.wal_reader_events(sk))
|
||||
*nwr_events = WL_SOCKET_CLOSED;
|
||||
#endif /* on PG 14 nwr_events remains 0 */
|
||||
return;
|
||||
}
|
||||
return;
|
||||
|
||||
/* The offline state expects no events. */
|
||||
case SS_OFFLINE:
|
||||
result = WL_NO_EVENTS;
|
||||
break;
|
||||
*sk_events = 0;
|
||||
return;
|
||||
|
||||
default:
|
||||
Assert(false);
|
||||
break;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Returns a human-readable string corresponding to the event set
|
||||
|
||||
@@ -8,6 +8,9 @@
|
||||
#include "replication/walreceiver.h"
|
||||
#include "utils/uuid.h"
|
||||
|
||||
#include "libpqwalproposer.h"
|
||||
#include "neon_walreader.h"
|
||||
|
||||
#define SK_MAGIC 0xCafeCeefu
|
||||
#define SK_PROTOCOL_VERSION 2
|
||||
|
||||
@@ -20,43 +23,9 @@
|
||||
*/
|
||||
#define WL_NO_EVENTS 0
|
||||
|
||||
struct WalProposerConn; /* Defined in implementation (walprop_pg.c) */
|
||||
struct WalProposerConn; /* Defined in libpqwalproposer.h */
|
||||
typedef struct WalProposerConn WalProposerConn;
|
||||
|
||||
/* Possible return values from ReadPGAsync */
|
||||
typedef enum
|
||||
{
|
||||
/* The full read was successful. buf now points to the data */
|
||||
PG_ASYNC_READ_SUCCESS,
|
||||
|
||||
/*
|
||||
* The read is ongoing. Wait until the connection is read-ready, then try
|
||||
* again.
|
||||
*/
|
||||
PG_ASYNC_READ_TRY_AGAIN,
|
||||
/* Reading failed. Check PQerrorMessage(conn) */
|
||||
PG_ASYNC_READ_FAIL,
|
||||
} PGAsyncReadResult;
|
||||
|
||||
/* Possible return values from WritePGAsync */
|
||||
typedef enum
|
||||
{
|
||||
/* The write fully completed */
|
||||
PG_ASYNC_WRITE_SUCCESS,
|
||||
|
||||
/*
|
||||
* The write started, but you'll need to call PQflush some more times to
|
||||
* finish it off. We just tried, so it's best to wait until the connection
|
||||
* is read- or write-ready to try again.
|
||||
*
|
||||
* If it becomes read-ready, call PQconsumeInput and flush again. If it
|
||||
* becomes write-ready, just call PQflush.
|
||||
*/
|
||||
PG_ASYNC_WRITE_TRY_FLUSH,
|
||||
/* Writing failed. Check PQerrorMessage(conn) */
|
||||
PG_ASYNC_WRITE_FAIL,
|
||||
} PGAsyncWriteResult;
|
||||
|
||||
/*
|
||||
* WAL safekeeper state, which is used to wait for some event.
|
||||
*
|
||||
@@ -133,6 +102,40 @@ typedef enum
|
||||
SS_ACTIVE,
|
||||
} SafekeeperState;
|
||||
|
||||
/*
|
||||
* Sending WAL substates of SS_ACTIVE.
|
||||
*/
|
||||
typedef enum
|
||||
{
|
||||
/*
|
||||
* We are ready to send more WAL, waiting for latch set to learn about
|
||||
* more WAL becoming available (or just a timeout to send heartbeat).
|
||||
*/
|
||||
SS_ACTIVE_SEND,
|
||||
|
||||
/*
|
||||
* Polling neon_walreader to receive chunk of WAL (probably remotely) to
|
||||
* send to this safekeeper.
|
||||
*
|
||||
* Note: socket management is done completely inside walproposer_pg for
|
||||
* simplicity, and thus simulation doesn't test it. Which is fine as
|
||||
* simulation is mainly aimed at consensus checks, not waiteventset
|
||||
* management.
|
||||
*
|
||||
* Also, while in this state we don't touch safekeeper socket, so in
|
||||
* theory it might close connection as inactive. This can be addressed if
|
||||
* needed; however, while fetching WAL we should regularly send it, so the
|
||||
* problem is unlikely. Vice versa is also true (SS_ACTIVE doesn't handle
|
||||
* walreader socket), but similarly shouldn't be a problem.
|
||||
*/
|
||||
SS_ACTIVE_READ_WAL,
|
||||
|
||||
/*
|
||||
* Waiting for write readiness to flush the socket.
|
||||
*/
|
||||
SS_ACTIVE_FLUSH,
|
||||
} SafekeeperActiveState;
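
/*
 * Informal transition sketch (editor's summary of SendAppendRequests() and
 * AsyncFlush() above, not part of the original header):
 *
 *   SS_ACTIVE_SEND     -> SS_ACTIVE_READ_WAL  once the AppendRequest header is buffered
 *   SS_ACTIVE_READ_WAL -> SS_ACTIVE_READ_WAL  on NEON_WALREAD_WOULDBLOCK (wait on nwr socket)
 *   SS_ACTIVE_READ_WAL -> SS_ACTIVE_SEND      on read success + PG_ASYNC_WRITE_SUCCESS
 *   SS_ACTIVE_READ_WAL -> SS_ACTIVE_FLUSH     on read success + PG_ASYNC_WRITE_TRY_FLUSH
 *   SS_ACTIVE_FLUSH    -> SS_ACTIVE_SEND      once AsyncFlush() drains the output buffer
 */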
|
||||
|
||||
/* Consensus logical timestamp. */
|
||||
typedef uint64 term_t;
|
||||
|
||||
@@ -341,12 +344,11 @@ typedef struct Safekeeper
|
||||
*/
|
||||
XLogRecPtr startStreamingAt;
|
||||
|
||||
bool flushWrite; /* set to true if we need to call AsyncFlush,*
|
||||
* to flush pending messages */
|
||||
XLogRecPtr streamingAt; /* current streaming position */
|
||||
AppendRequestHeader appendRequest; /* request for sending to safekeeper */
|
||||
|
||||
SafekeeperState state; /* safekeeper state machine state */
|
||||
SafekeeperActiveState active_state;
|
||||
TimestampTz latestMsgReceivedAt; /* when latest msg is received */
|
||||
AcceptorGreeting greetResponse; /* acceptor greeting */
|
||||
VoteResponse voteResponse; /* the vote */
|
||||
@@ -367,12 +369,27 @@ typedef struct Safekeeper
|
||||
/*
|
||||
* WAL reader, allocated for each safekeeper.
|
||||
*/
|
||||
XLogReaderState *xlogreader;
|
||||
NeonWALReader *xlogreader;
|
||||
|
||||
/*
|
||||
* Position in wait event set. Equal to -1 if no event
|
||||
*/
|
||||
int eventPos;
|
||||
|
||||
/*
|
||||
* Neon WAL reader position in wait event set, or -1 if no socket. Note
|
||||
* that event must be removed not only on error/failure, but also on
|
||||
* successful *local* read, as next read might again be remote, but with
|
||||
* different socket.
|
||||
*/
|
||||
int nwrEventPos;
|
||||
|
||||
/*
|
||||
* Per libpq docs, during connection establishment socket might change,
|
||||
* remember here if it is stable to avoid readding to the event set if
|
||||
* possible. Must be reset whenever nwr event is deleted.
|
||||
*/
|
||||
bool nwrConnEstablished;
|
||||
#endif
|
||||
|
||||
|
||||
@@ -401,31 +418,6 @@ typedef enum
|
||||
*/
|
||||
} WalProposerConnectPollStatusType;
|
||||
|
||||
/* Re-exported and modified ExecStatusType */
|
||||
typedef enum
|
||||
{
|
||||
/* We received a single CopyBoth result */
|
||||
WP_EXEC_SUCCESS_COPYBOTH,
|
||||
|
||||
/*
|
||||
* Any success result other than a single CopyBoth was received. The
|
||||
* specifics of the result were already logged, but it may be useful to
|
||||
* provide an error message indicating which safekeeper messed up.
|
||||
*
|
||||
* Do not expect PQerrorMessage to be appropriately set.
|
||||
*/
|
||||
WP_EXEC_UNEXPECTED_SUCCESS,
|
||||
|
||||
/*
|
||||
* No result available at this time. Wait until read-ready, then call
|
||||
* again. Internally, this is returned when PQisBusy indicates that
|
||||
* PQgetResult would block.
|
||||
*/
|
||||
WP_EXEC_NEEDS_INPUT,
|
||||
/* Catch-all failure. Check PQerrorMessage. */
|
||||
WP_EXEC_FAILED,
|
||||
} WalProposerExecStatusType;
|
||||
|
||||
/* Re-exported ConnStatusType */
|
||||
typedef enum
|
||||
{
|
||||
@@ -486,7 +478,7 @@ typedef struct walproposer_api
|
||||
/* Flush buffer to the network, aka PQflush. */
|
||||
int (*conn_flush) (Safekeeper *sk);
|
||||
|
||||
/* Close the connection, aka PQfinish. */
|
||||
/* Reset sk state: close pq connection, deallocate xlogreader. */
|
||||
void (*conn_finish) (Safekeeper *sk);
|
||||
|
||||
/*
|
||||
@@ -503,17 +495,20 @@ typedef struct walproposer_api
|
||||
/* Blocking CopyData write, aka PQputCopyData + PQflush. */
|
||||
bool (*conn_blocking_write) (Safekeeper *sk, void const *buf, size_t size);
|
||||
|
||||
/* Download WAL from startpos to endpos and make it available locally. */
|
||||
bool (*recovery_download) (Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos);
|
||||
|
||||
/* Read WAL from disk to buf. */
|
||||
void (*wal_read) (Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count);
|
||||
/*
|
||||
* Download WAL before basebackup for logical walsenders from sk, if
|
||||
* needed
|
||||
*/
|
||||
bool (*recovery_download) (WalProposer *wp, Safekeeper *sk);
|
||||
|
||||
/* Allocate WAL reader. */
|
||||
void (*wal_reader_allocate) (Safekeeper *sk);
|
||||
|
||||
/* Deallocate event set. */
|
||||
void (*free_event_set) (WalProposer *wp);
|
||||
/* Read WAL from disk to buf. */
|
||||
NeonWALReadResult (*wal_read) (Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count, char **errmsg);
|
||||
|
||||
/* Returns events to be awaited on WAL reader, if any. */
|
||||
uint32 (*wal_reader_events) (Safekeeper *sk);
|
||||
|
||||
/* Initialize event set. */
|
||||
void (*init_event_set) (WalProposer *wp);
|
||||
@@ -521,9 +516,15 @@ typedef struct walproposer_api
|
||||
/* Update events for an existing safekeeper connection. */
|
||||
void (*update_event_set) (Safekeeper *sk, uint32 events);
|
||||
|
||||
/* Configure wait event set for yield in SS_ACTIVE. */
|
||||
void (*active_state_update_event_set) (Safekeeper *sk);
|
||||
|
||||
/* Add a new safekeeper connection to the event set. */
|
||||
void (*add_safekeeper_event_set) (Safekeeper *sk, uint32 events);
|
||||
|
||||
/* Remove safekeeper connection from event set */
|
||||
void (*rm_safekeeper_event_set) (Safekeeper *sk);
|
||||
|
||||
/*
|
||||
* Wait until some event happens: - timeout is reached - socket event for
|
||||
* safekeeper connection - new WAL is available
|
||||
@@ -556,26 +557,12 @@ typedef struct walproposer_api
|
||||
*/
|
||||
void (*process_safekeeper_feedback) (WalProposer *wp, XLogRecPtr commitLsn);
|
||||
|
||||
/*
|
||||
* Called on peer_horizon_lsn updates. Used to advance replication slot
|
||||
* and to free up disk space by deleting unnecessary WAL.
|
||||
*/
|
||||
void (*confirm_wal_streamed) (WalProposer *wp, XLogRecPtr lsn);
|
||||
|
||||
/*
|
||||
* Write a log message to the internal log processor. This is used only
|
||||
* when walproposer is compiled as a library. Otherwise, all logging is
|
||||
* handled by elog().
|
||||
*/
|
||||
void (*log_internal) (WalProposer *wp, int level, const char *line);
|
||||
|
||||
/*
|
||||
* Called right after the proposer was elected, but before it started
|
||||
* recovery and sent ProposerElected message to the safekeepers.
|
||||
*
|
||||
* Used by logical replication to update truncateLsn.
|
||||
*/
|
||||
void (*after_election) (WalProposer *wp);
|
||||
} walproposer_api;
|
||||
|
||||
/*
|
||||
@@ -709,6 +696,13 @@ extern void WalProposerBroadcast(WalProposer *wp, XLogRecPtr startpos, XLogRecPt
|
||||
extern void WalProposerPoll(WalProposer *wp);
|
||||
extern void WalProposerFree(WalProposer *wp);
|
||||
|
||||
/*
|
||||
* WaitEventSet API doesn't allow removing a socket, so walproposer_pg uses this to
* recreate the set from scratch, hence the export.
|
||||
*/
|
||||
extern void SafekeeperStateDesiredEvents(Safekeeper *sk, uint32 *sk_events, uint32 *nwr_events);
|
||||
extern Safekeeper *GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn);
|
||||
|
||||
|
||||
#define WPEVENT 1337 /* special log level for walproposer internal
|
||||
* events */
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
#include <unistd.h>
|
||||
#include <sys/stat.h>
|
||||
#include "access/xact.h"
|
||||
#include "access/xlog.h"
|
||||
#include "access/xlogdefs.h"
|
||||
#include "access/xlogutils.h"
|
||||
#include "access/xloginsert.h"
|
||||
@@ -43,14 +44,19 @@
|
||||
#include "utils/ps_status.h"
|
||||
#include "utils/timestamp.h"
|
||||
|
||||
#include "neon.h"
|
||||
#include "walproposer.h"
|
||||
#include "libpq-fe.h"
|
||||
|
||||
#include "libpqwalproposer.h"
|
||||
#include "neon.h"
|
||||
#include "neon_walreader.h"
|
||||
#include "walproposer.h"
|
||||
|
||||
#define XLOG_HDR_SIZE (1 + 8 * 3) /* 'w' + startPos + walEnd + timestamp */
|
||||
#define XLOG_HDR_START_POS 1 /* offset of start position in wal sender*
|
||||
* message header */
|
||||
|
||||
#define MB ((XLogRecPtr)1024 * 1024)
|
||||
|
||||
#define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot"
|
||||
|
||||
char *wal_acceptors_list = "";
|
||||
@@ -91,6 +97,12 @@ static void XLogBroadcastWalProposer(WalProposer *wp);
|
||||
static void XLogWalPropWrite(WalProposer *wp, char *buf, Size nbytes, XLogRecPtr recptr);
|
||||
static void XLogWalPropClose(XLogRecPtr recptr);
|
||||
|
||||
static void add_nwr_event_set(Safekeeper *sk, uint32 events);
|
||||
static void update_nwr_event_set(Safekeeper *sk, uint32 events);
|
||||
static void rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk);
|
||||
|
||||
static XLogRecPtr GetLogRepRestartLSN(WalProposer *wp);
|
||||
|
||||
static void
|
||||
init_walprop_config(bool syncSafekeepers)
|
||||
{
|
||||
@@ -214,7 +226,6 @@ backpressure_lag_impl(void)
|
||||
XLogRecPtr myFlushLsn = GetFlushRecPtr();
|
||||
#endif
|
||||
replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr);
|
||||
#define MB ((XLogRecPtr)1024 * 1024)
|
||||
|
||||
elog(DEBUG2, "current flushLsn %X/%X PageserverFeedback: write %X/%X flush %X/%X apply %X/%X",
|
||||
LSN_FORMAT_ARGS(myFlushLsn),
|
||||
@@ -541,14 +552,6 @@ walprop_pg_load_libpqwalreceiver(void)
|
||||
elog(ERROR, "libpqwalreceiver didn't initialize correctly");
|
||||
}
|
||||
|
||||
/* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */
|
||||
struct WalProposerConn
|
||||
{
|
||||
PGconn *pg_conn;
|
||||
bool is_nonblocking; /* whether the connection is non-blocking */
|
||||
char *recvbuf; /* last received data from walprop_async_read */
|
||||
};
|
||||
|
||||
/* Helper function */
|
||||
static bool
|
||||
ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking)
|
||||
@@ -586,16 +589,17 @@ walprop_status(Safekeeper *sk)
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
walprop_connect_start(Safekeeper *sk)
|
||||
WalProposerConn *
|
||||
libpqwp_connect_start(char *conninfo)
|
||||
{
|
||||
|
||||
PGconn *pg_conn;
|
||||
WalProposerConn *conn;
|
||||
const char *keywords[3];
|
||||
const char *values[3];
|
||||
int n;
|
||||
char *password = neon_auth_token;
|
||||
|
||||
Assert(sk->conn == NULL);
|
||||
|
||||
/*
|
||||
* Connect using the given connection string. If the NEON_AUTH_TOKEN
|
||||
@@ -614,7 +618,7 @@ walprop_connect_start(Safekeeper *sk)
|
||||
n++;
|
||||
}
|
||||
keywords[n] = "dbname";
|
||||
values[n] = sk->conninfo;
|
||||
values[n] = conninfo;
|
||||
n++;
|
||||
keywords[n] = NULL;
|
||||
values[n] = NULL;
|
||||
@@ -635,11 +639,20 @@ walprop_connect_start(Safekeeper *sk)
|
||||
* palloc will exit on failure though, so there's not much we could do if
|
||||
* it *did* fail.
|
||||
*/
|
||||
sk->conn = palloc(sizeof(WalProposerConn));
|
||||
sk->conn->pg_conn = pg_conn;
|
||||
sk->conn->is_nonblocking = false; /* connections always start in
|
||||
* blocking mode */
|
||||
sk->conn->recvbuf = NULL;
|
||||
conn = palloc(sizeof(WalProposerConn));
|
||||
conn->pg_conn = pg_conn;
|
||||
conn->is_nonblocking = false; /* connections always start in blocking
|
||||
* mode */
|
||||
conn->recvbuf = NULL;
|
||||
return conn;
|
||||
}
|
||||
|
||||
static void
|
||||
walprop_connect_start(Safekeeper *sk)
|
||||
{
|
||||
Assert(sk->conn == NULL);
|
||||
sk->conn = libpqwp_connect_start(sk->conninfo);
|
||||
|
||||
}
|
||||
|
||||
static WalProposerConnectPollStatusType
|
||||
@@ -683,26 +696,33 @@ walprop_connect_poll(Safekeeper *sk)
|
||||
return return_val;
|
||||
}
|
||||
|
||||
static bool
|
||||
walprop_send_query(Safekeeper *sk, char *query)
|
||||
extern bool
|
||||
libpqwp_send_query(WalProposerConn *conn, char *query)
|
||||
{
|
||||
/*
|
||||
* We need to be in blocking mode for sending the query to run without
|
||||
* requiring a call to PQflush
|
||||
*/
|
||||
if (!ensure_nonblocking_status(sk->conn, false))
|
||||
if (!ensure_nonblocking_status(conn, false))
|
||||
return false;
|
||||
|
||||
/* PQsendQuery returns 1 on success, 0 on failure */
|
||||
if (!PQsendQuery(sk->conn->pg_conn, query))
|
||||
if (!PQsendQuery(conn->pg_conn, query))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static WalProposerExecStatusType
|
||||
walprop_get_query_result(Safekeeper *sk)
|
||||
static bool
|
||||
walprop_send_query(Safekeeper *sk, char *query)
|
||||
{
|
||||
return libpqwp_send_query(sk->conn, query);
|
||||
}
|
||||
|
||||
WalProposerExecStatusType
|
||||
libpqwp_get_query_result(WalProposerConn *conn)
|
||||
{
|
||||
|
||||
PGresult *result;
|
||||
WalProposerExecStatusType return_val;
|
||||
|
||||
@@ -710,14 +730,14 @@ walprop_get_query_result(Safekeeper *sk)
|
||||
char *unexpected_success = NULL;
|
||||
|
||||
/* Consume any input that we might be missing */
|
||||
if (!PQconsumeInput(sk->conn->pg_conn))
|
||||
if (!PQconsumeInput(conn->pg_conn))
|
||||
return WP_EXEC_FAILED;
|
||||
|
||||
if (PQisBusy(sk->conn->pg_conn))
|
||||
if (PQisBusy(conn->pg_conn))
|
||||
return WP_EXEC_NEEDS_INPUT;
|
||||
|
||||
|
||||
result = PQgetResult(sk->conn->pg_conn);
|
||||
result = PQgetResult(conn->pg_conn);
|
||||
|
||||
/*
|
||||
* PQgetResult returns NULL only if getting the result was successful &
|
||||
@@ -778,6 +798,12 @@ walprop_get_query_result(Safekeeper *sk)
|
||||
return return_val;
|
||||
}
|
||||
|
||||
static WalProposerExecStatusType
|
||||
walprop_get_query_result(Safekeeper *sk)
|
||||
{
|
||||
return libpqwp_get_query_result(sk->conn);
|
||||
}
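
/*
 * Hypothetical usage sketch (editor's addition, not part of the original
 * file): the exported libpqwp_* helpers can also drive a standalone libpq
 * connection outside of a Safekeeper. Waiting between steps is elided;
 * establish_connection() stands in for the connect-polling loop and is an
 * assumption.
 */
#ifdef LIBPQWP_USAGE_SKETCH
static void
libpqwp_usage_sketch(char *conninfo, char *query)
{
	WalProposerConn *conn = libpqwp_connect_start(conninfo);
	char	   *buf;
	int			amount;

	establish_connection(conn);	/* poll until connected */

	if (libpqwp_send_query(conn, query))
	{
		while (libpqwp_get_query_result(conn) == WP_EXEC_NEEDS_INPUT)
		{
			/* wait until the socket is read-ready, then call again */
		}

		/* for CopyBoth traffic, messages are then pulled one by one */
		if (libpqwp_async_read(conn, &buf, &amount) == PG_ASYNC_READ_SUCCESS)
		{
			/* buf/amount hold one CopyData message until the next call */
		}
	}
	libpqwp_disconnect(conn);
}
#endif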
|
||||
|
||||
static pgsocket
|
||||
walprop_socket(Safekeeper *sk)
|
||||
{
|
||||
@@ -790,42 +816,31 @@ walprop_flush(Safekeeper *sk)
|
||||
return (PQflush(sk->conn->pg_conn));
|
||||
}
|
||||
|
||||
static void
|
||||
walprop_finish(Safekeeper *sk)
|
||||
/* Like libpqrcv_receive. *buf is valid until the next call. */
|
||||
PGAsyncReadResult
|
||||
libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount)
|
||||
{
|
||||
if (!sk->conn)
|
||||
return;
|
||||
int rawlen;
|
||||
|
||||
if (sk->conn->recvbuf != NULL)
|
||||
PQfreemem(sk->conn->recvbuf);
|
||||
PQfinish(sk->conn->pg_conn);
|
||||
pfree(sk->conn);
|
||||
sk->conn = NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Receive a message from the safekeeper.
|
||||
*
|
||||
* On success, the data is placed in *buf. It is valid until the next call
|
||||
* to this function.
|
||||
*/
|
||||
static PGAsyncReadResult
|
||||
walprop_async_read(Safekeeper *sk, char **buf, int *amount)
|
||||
{
|
||||
int result;
|
||||
|
||||
if (sk->conn->recvbuf != NULL)
|
||||
if (conn->recvbuf != NULL)
|
||||
{
|
||||
PQfreemem(sk->conn->recvbuf);
|
||||
sk->conn->recvbuf = NULL;
|
||||
PQfreemem(conn->recvbuf);
|
||||
conn->recvbuf = NULL;
|
||||
}
|
||||
|
||||
/* Call PQconsumeInput so that we have the data we need */
|
||||
if (!PQconsumeInput(sk->conn->pg_conn))
|
||||
/* Try to receive a CopyData message */
|
||||
rawlen = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true);
|
||||
if (rawlen == 0)
|
||||
{
|
||||
*amount = 0;
|
||||
*buf = NULL;
|
||||
return PG_ASYNC_READ_FAIL;
|
||||
/* Try consuming some data. */
|
||||
if (!PQconsumeInput(conn->pg_conn))
|
||||
{
|
||||
*amount = 0;
|
||||
*buf = NULL;
|
||||
return PG_ASYNC_READ_FAIL;
|
||||
}
|
||||
/* Now that we've consumed some input, try again */
|
||||
rawlen = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -839,7 +854,7 @@ walprop_async_read(Safekeeper *sk, char **buf, int *amount)
|
||||
* sometimes be triggered by the server returning an ErrorResponse (which
|
||||
* also happens to have the effect that the copy is done).
|
||||
*/
|
||||
switch (result = PQgetCopyData(sk->conn->pg_conn, &sk->conn->recvbuf, true))
|
||||
switch (rawlen)
|
||||
{
|
||||
case 0:
|
||||
*amount = 0;
|
||||
@@ -854,7 +869,7 @@ walprop_async_read(Safekeeper *sk, char **buf, int *amount)
|
||||
* We can check PQgetResult to make sure that the server
|
||||
* failed; it'll always result in PGRES_FATAL_ERROR
|
||||
*/
|
||||
ExecStatusType status = PQresultStatus(PQgetResult(sk->conn->pg_conn));
|
||||
ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn));
|
||||
|
||||
if (status != PGRES_FATAL_ERROR)
|
||||
elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status);
|
||||
@@ -874,12 +889,24 @@ walprop_async_read(Safekeeper *sk, char **buf, int *amount)
|
||||
return PG_ASYNC_READ_FAIL;
|
||||
default:
|
||||
/* Positive values indicate the size of the returned result */
|
||||
*amount = result;
|
||||
*buf = sk->conn->recvbuf;
|
||||
*amount = rawlen;
|
||||
*buf = conn->recvbuf;
|
||||
return PG_ASYNC_READ_SUCCESS;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Receive a message from the safekeeper.
|
||||
*
|
||||
* On success, the data is placed in *buf. It is valid until the next call
|
||||
* to this function.
|
||||
*/
|
||||
static PGAsyncReadResult
|
||||
walprop_async_read(Safekeeper *sk, char **buf, int *amount)
|
||||
{
|
||||
return libpqwp_async_read(sk->conn, buf, amount);
|
||||
}
|
||||
|
||||
static PGAsyncWriteResult
|
||||
walprop_async_write(Safekeeper *sk, void const *buf, size_t size)
|
||||
{
|
||||
@@ -962,6 +989,33 @@ walprop_blocking_write(Safekeeper *sk, void const *buf, size_t size)
|
||||
return true;
|
||||
}
|
||||
|
||||
void
|
||||
libpqwp_disconnect(WalProposerConn *conn)
|
||||
{
|
||||
if (conn->recvbuf != NULL)
|
||||
PQfreemem(conn->recvbuf);
|
||||
PQfinish(conn->pg_conn);
|
||||
pfree(conn);
|
||||
}
|
||||
|
||||
static void
|
||||
walprop_finish(Safekeeper *sk)
|
||||
{
|
||||
if (sk->conn)
|
||||
{
|
||||
libpqwp_disconnect(sk->conn);
|
||||
sk->conn = NULL;
|
||||
}
|
||||
|
||||
/* free xlogreader */
|
||||
if (sk->xlogreader)
|
||||
{
|
||||
NeonWALReaderFree(sk->xlogreader);
|
||||
sk->xlogreader = NULL;
|
||||
}
|
||||
rm_safekeeper_event_set(sk, false);
|
||||
}
|
||||
|
||||
/*
|
||||
* Subscribe for new WAL and stream it in the loop to safekeepers.
|
||||
*
|
||||
@@ -1165,16 +1219,38 @@ XLogBroadcastWalProposer(WalProposer *wp)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Receive WAL from most advanced safekeeper
|
||||
*/
|
||||
/* Download WAL before basebackup for logical walsenders from sk, if needed */
|
||||
static bool
|
||||
WalProposerRecovery(Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos)
|
||||
WalProposerRecovery(WalProposer *wp, Safekeeper *sk)
|
||||
{
|
||||
char *err;
|
||||
WalReceiverConn *wrconn;
|
||||
WalRcvStreamOptions options;
|
||||
char conninfo[MAXCONNINFO];
|
||||
TimeLineID timeline;
|
||||
XLogRecPtr startpos;
|
||||
XLogRecPtr endpos;
|
||||
uint64 download_range_mb;
|
||||
|
||||
startpos = GetLogRepRestartLSN(wp);
|
||||
if (startpos == InvalidXLogRecPtr)
|
||||
return true; /* recovery not needed */
|
||||
endpos = wp->propEpochStartLsn;
|
||||
|
||||
/*
|
||||
* If we need to download more than a max_slot_wal_keep_size, cap to it to
|
||||
* avoid risk of exploding pg_wal. Logical replication won't work until
|
||||
* recreated, but at least compute would start; this also follows
|
||||
* max_slot_wal_keep_size semantics.
|
||||
*/
|
||||
download_range_mb = (endpos - startpos) / 1024 / 1024;
|
||||
if (max_slot_wal_keep_size_mb > 0 && download_range_mb >= max_slot_wal_keep_size_mb)
|
||||
{
|
||||
startpos = endpos - max_slot_wal_keep_size_mb * 1024 * 1024;
|
||||
walprop_log(WARNING, "capped WAL download for logical replication to %X/%X as max_slot_wal_keep_size=%dMB",
|
||||
LSN_FORMAT_ARGS(startpos), max_slot_wal_keep_size_mb);
|
||||
}
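	/*
	 * Worked example (illustrative numbers, editor's note): with
	 * endpos - startpos ~= 3072 MB and max_slot_wal_keep_size = 1024 MB,
	 * download_range_mb = 3072 >= 1024, so startpos is moved up to
	 * endpos - 1 GiB; only the newest 1 GiB is downloaded and older logical
	 * slots have to be recreated.
	 */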
|
||||
timeline = wp->greetRequest.timeline;
|
||||
|
||||
if (!neon_auth_token)
|
||||
{
|
||||
@@ -1204,7 +1280,7 @@ WalProposerRecovery(Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XL
|
||||
return false;
|
||||
}
|
||||
elog(LOG,
|
||||
"start recovery from %s:%s starting from %X/%08X till %X/%08X timeline "
|
||||
"start recovery for logical replication from %s:%s starting from %X/%08X till %X/%08X timeline "
|
||||
"%d",
|
||||
sk->host, sk->port, (uint32) (startpos >> 32),
|
||||
(uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline);
|
||||
@@ -1400,30 +1476,56 @@ XLogWalPropClose(XLogRecPtr recptr)
|
||||
walpropFile = -1;
|
||||
}
|
||||
|
||||
static void
|
||||
walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count)
|
||||
{
|
||||
WALReadError errinfo;
|
||||
|
||||
if (!WALRead(sk->xlogreader,
|
||||
buf,
|
||||
startptr,
|
||||
count,
|
||||
walprop_pg_get_timeline_id(),
|
||||
&errinfo))
|
||||
{
|
||||
WALReadRaiseError(&errinfo);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
walprop_pg_wal_reader_allocate(Safekeeper *sk)
|
||||
{
|
||||
sk->xlogreader = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.segment_open = wal_segment_open,.segment_close = wal_segment_close), NULL);
|
||||
char log_prefix[64];
|
||||
|
||||
snprintf(log_prefix, sizeof(log_prefix), "sk %s:%s nwr: ", sk->host, sk->port);
|
||||
Assert(!sk->xlogreader);
|
||||
sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, sk->wp, log_prefix);
|
||||
if (sk->xlogreader == NULL)
|
||||
elog(FATAL, "Failed to allocate xlog reader");
|
||||
}
|
||||
|
||||
static NeonWALReadResult
|
||||
walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count, char **errmsg)
|
||||
{
|
||||
NeonWALReadResult res;
|
||||
|
||||
res = NeonWALRead(sk->xlogreader,
|
||||
buf,
|
||||
startptr,
|
||||
count,
|
||||
walprop_pg_get_timeline_id());
|
||||
|
||||
if (res == NEON_WALREAD_SUCCESS)
|
||||
{
|
||||
/*
|
||||
* If we have the socket subscribed, but walreader doesn't need any
|
||||
* events, it must mean that remote connection just closed hoping to
|
||||
* do next read locally. Remove the socket then. It is important to do
|
||||
* so, as otherwise the next read might open another connection and we won't
|
||||
* be able to distinguish whether we have correct socket added in wait
|
||||
* event set.
|
||||
*/
|
||||
if (NeonWALReaderEvents(sk->xlogreader) == 0)
|
||||
rm_safekeeper_event_set(sk, false);
|
||||
}
|
||||
else if (res == NEON_WALREAD_ERROR)
|
||||
{
|
||||
*errmsg = NeonWALReaderErrMsg(sk->xlogreader);
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
static uint32
|
||||
walprop_pg_wal_reader_events(Safekeeper *sk)
|
||||
{
|
||||
return NeonWALReaderEvents(sk->xlogreader);
|
||||
}
|
||||
|
||||
static WaitEventSet *waitEvents;
|
||||
|
||||
static void
|
||||
@@ -1438,6 +1540,8 @@ walprop_pg_free_event_set(WalProposer *wp)
|
||||
for (int i = 0; i < wp->n_safekeepers; i++)
|
||||
{
|
||||
wp->safekeeper[i].eventPos = -1;
|
||||
wp->safekeeper[i].nwrEventPos = -1;
|
||||
wp->safekeeper[i].nwrConnEstablished = false;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1447,11 +1551,37 @@ walprop_pg_init_event_set(WalProposer *wp)
|
||||
if (waitEvents)
|
||||
elog(FATAL, "double-initialization of event set");
|
||||
|
||||
waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + wp->n_safekeepers);
|
||||
/* for each sk, we have socket plus potentially socket for neon walreader */
|
||||
waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + 2 * wp->n_safekeepers);
|
||||
AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET,
|
||||
MyLatch, NULL);
|
||||
AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
|
||||
NULL, NULL);
|
||||
|
||||
for (int i = 0; i < wp->n_safekeepers; i++)
|
||||
{
|
||||
wp->safekeeper[i].eventPos = -1;
|
||||
wp->safekeeper[i].nwrEventPos = -1;
|
||||
wp->safekeeper[i].nwrConnEstablished = false;
|
||||
}
|
||||
}
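
/*
 * Sizing note for walprop_pg_init_event_set() above (illustrative, editor's
 * note): with 3 safekeepers the set can hold up to 2 + 2 * 3 = 8 events --
 * the latch, postmaster death, one socket per safekeeper connection and,
 * potentially, one neon_walreader socket per safekeeper.
 */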
|
||||
|
||||
/* add safekeeper socket to wait event set */
|
||||
static void
|
||||
walprop_pg_add_safekeeper_event_set(Safekeeper *sk, uint32 events)
|
||||
{
|
||||
Assert(sk->eventPos == -1);
|
||||
sk->eventPos = AddWaitEventToSet(waitEvents, events, walprop_socket(sk), NULL, sk);
|
||||
}
|
||||
|
||||
/* add neon wal reader socket to wait event set */
|
||||
static void
|
||||
add_nwr_event_set(Safekeeper *sk, uint32 events)
|
||||
{
|
||||
Assert(sk->nwrEventPos == -1);
|
||||
sk->nwrEventPos = AddWaitEventToSet(waitEvents, events, NeonWALReaderSocket(sk->xlogreader), NULL, sk);
|
||||
sk->nwrConnEstablished = NeonWALReaderIsRemConnEstablished(sk->xlogreader);
|
||||
elog(DEBUG5, "sk %s:%s: added nwr socket events %d", sk->host, sk->port, events);
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -1463,10 +1593,144 @@ walprop_pg_update_event_set(Safekeeper *sk, uint32 events)
|
||||
ModifyWaitEvent(waitEvents, sk->eventPos, events, NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
* Update neon_walreader event.
|
||||
* Can be called when nwr socket doesn't exist, does nothing in this case.
|
||||
*/
|
||||
static void
|
||||
walprop_pg_add_safekeeper_event_set(Safekeeper *sk, uint32 events)
|
||||
update_nwr_event_set(Safekeeper *sk, uint32 events)
|
||||
{
|
||||
sk->eventPos = AddWaitEventToSet(waitEvents, events, walprop_socket(sk), NULL, sk);
|
||||
/* eventPos = -1 when we don't have an event */
|
||||
if (sk->nwrEventPos != -1)
|
||||
ModifyWaitEvent(waitEvents, sk->nwrEventPos, events, NULL);
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
walprop_pg_active_state_update_event_set(Safekeeper *sk)
|
||||
{
|
||||
uint32 sk_events;
|
||||
uint32 nwr_events;
|
||||
|
||||
Assert(sk->state == SS_ACTIVE);
|
||||
SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events);
|
||||
|
||||
/*
|
||||
* If we need to wait for neon_walreader, ensure we have up to date socket
|
||||
* in the wait event set.
|
||||
*/
|
||||
if (sk->active_state == SS_ACTIVE_READ_WAL)
|
||||
{
|
||||
/*
|
||||
* If conn is established and socket is thus stable, update the event
|
||||
* directly; otherwise re-add it.
|
||||
*/
|
||||
if (sk->nwrConnEstablished)
|
||||
{
|
||||
Assert(sk->nwrEventPos != -1);
|
||||
update_nwr_event_set(sk, nwr_events);
|
||||
}
|
||||
else
|
||||
{
|
||||
rm_safekeeper_event_set(sk, false);
|
||||
add_nwr_event_set(sk, nwr_events);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* Hack: we should always set 0 here, but for random reasons
|
||||
* WaitEventSet (WaitEventAdjustEpoll) asserts that there is at least
|
||||
* some event. Since there is also no way to remove socket except
|
||||
* reconstructing the whole set, SafekeeperStateDesiredEvents instead
|
||||
* gives WL_SOCKET_CLOSED if socket exists. We never expect it to
|
||||
* trigger.
|
||||
*
|
||||
* On PG 14 which doesn't have WL_SOCKET_CLOSED resort to event
|
||||
* removal.
|
||||
*/
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
Assert(nwr_events == WL_SOCKET_CLOSED || nwr_events == 0);
|
||||
update_nwr_event_set(sk, WL_SOCKET_CLOSED);
|
||||
#else /* pg 14 */
|
||||
rm_safekeeper_event_set(sk, false);
|
||||
#endif
|
||||
}
|
||||
walprop_pg_update_event_set(sk, sk_events);
|
||||
}
|
||||
|
||||
static void
|
||||
walprop_pg_rm_safekeeper_event_set(Safekeeper *to_remove)
|
||||
{
|
||||
rm_safekeeper_event_set(to_remove, true);
|
||||
}
|
||||
|
||||
/*
|
||||
* A hacky way to remove single event from the event set. Can be called if event
|
||||
* doesn't exist, does nothing in this case.
|
||||
*
|
||||
* Note: Internally, this completely reconstructs the event set. It should be
|
||||
* avoided if possible.
|
||||
*
|
||||
* If is_sk is true, socket of connection to safekeeper is removed; otherwise
|
||||
* socket of neon_walreader.
|
||||
*/
|
||||
static void
|
||||
rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk)
|
||||
{
|
||||
WalProposer *wp = to_remove->wp;
|
||||
|
||||
elog(DEBUG5, "sk %s:%s: removing event, is_sk %d",
|
||||
to_remove->host, to_remove->port, is_sk);
|
||||
|
||||
/*
|
||||
* Shortpath for exiting if have nothing to do. We never call this
|
||||
* function with safekeeper socket not existing, but do that with neon
|
||||
* walreader socket.
|
||||
*/
|
||||
if ((is_sk && to_remove->eventPos == -1) ||
|
||||
(!is_sk && to_remove->nwrEventPos == -1))
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
/* Remove the existing event set, assign sk->eventPos = -1 */
|
||||
walprop_pg_free_event_set(wp);
|
||||
|
||||
/* Re-initialize it without adding any safekeeper events */
|
||||
wp->api.init_event_set(wp);
|
||||
|
||||
/*
|
||||
* loop through the existing safekeepers. If they aren't the one we're
|
||||
* removing, and if they have a socket we can use, re-add the applicable
|
||||
* events.
|
||||
*/
|
||||
for (int i = 0; i < wp->n_safekeepers; i++)
|
||||
{
|
||||
Safekeeper *sk = &wp->safekeeper[i];
|
||||
|
||||
/*
|
||||
* If this safekeeper isn't offline, add events for it, except for the
|
||||
* event requested to remove.
|
||||
*/
|
||||
if (sk->state != SS_OFFLINE)
|
||||
{
|
||||
uint32 sk_events;
|
||||
uint32 nwr_events;
|
||||
|
||||
SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events);
|
||||
|
||||
if (sk != to_remove || !is_sk)
|
||||
{
|
||||
/* will set sk->eventPos */
|
||||
wp->api.add_safekeeper_event_set(sk, sk_events);
|
||||
}
|
||||
if ((sk != to_remove || is_sk) && nwr_events)
|
||||
{
|
||||
add_nwr_event_set(sk, nwr_events);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static int
|
||||
@@ -1484,8 +1748,8 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32
|
||||
ConditionVariablePrepareToSleep(&WalSndCtl->wal_flush_cv);
|
||||
|
||||
/*
|
||||
* Now that we prepared the condvar, check flush ptr again -- it might have
|
||||
* changed before we subscribed to cv so we missed the wakeup.
|
||||
* Now that we prepared the condvar, check flush ptr again -- it might
|
||||
* have changed before we subscribed to cv so we missed the wakeup.
|
||||
*
|
||||
* Do that only when we're interested in new WAL: without sync-safekeepers
|
||||
* and if election already passed.
|
||||
@@ -1548,7 +1812,7 @@ walprop_pg_finish_sync_safekeepers(WalProposer *wp, XLogRecPtr lsn)
|
||||
}
|
||||
|
||||
/*
|
||||
* Get PageserverFeedback fields from the most advanced safekeeper
|
||||
* Choose most advanced PageserverFeedback and set it to *rf.
|
||||
*/
|
||||
static void
|
||||
GetLatestNeonFeedback(PageserverFeedback *rf, WalProposer *wp)
|
||||
@@ -1578,8 +1842,6 @@ GetLatestNeonFeedback(PageserverFeedback *rf, WalProposer *wp)
|
||||
LSN_FORMAT_ARGS(rf->disk_consistent_lsn),
|
||||
LSN_FORMAT_ARGS(rf->remote_consistent_lsn),
|
||||
rf->replytime);
|
||||
|
||||
replication_feedback_set(rf);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1619,63 +1881,69 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp)
|
||||
hs->catalog_xmin = InvalidFullTransactionId;
|
||||
}
|
||||
|
||||
/*
|
||||
* Based on commitLsn and safekeeper responses including pageserver feedback,
|
||||
* 1) Propagate cluster size received from ps to ensure the limit.
|
||||
* 2) Propagate pageserver LSN positions to ensure backpressure limits.
|
||||
* 3) Advance walproposer slot to commitLsn (releasing WAL & waking up waiters).
|
||||
* 4) Propagate hot standby feedback.
|
||||
*
|
||||
* None of that is functional in sync-safekeepers.
|
||||
*/
|
||||
static void
|
||||
walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn)
|
||||
{
|
||||
HotStandbyFeedback hsFeedback;
|
||||
XLogRecPtr diskConsistentLsn;
|
||||
XLogRecPtr oldDiskConsistentLsn;
|
||||
|
||||
diskConsistentLsn = quorumFeedback.rf.disk_consistent_lsn;
|
||||
if (wp->config->syncSafekeepers)
|
||||
return;
|
||||
|
||||
if (!wp->config->syncSafekeepers)
|
||||
oldDiskConsistentLsn = quorumFeedback.rf.disk_consistent_lsn;
|
||||
|
||||
/* Get PageserverFeedback fields from the most advanced safekeeper */
|
||||
GetLatestNeonFeedback(&quorumFeedback.rf, wp);
|
||||
replication_feedback_set(&quorumFeedback.rf);
|
||||
SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize);
|
||||
|
||||
if (commitLsn > quorumFeedback.flushLsn || oldDiskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn)
|
||||
{
|
||||
/* Get PageserverFeedback fields from the most advanced safekeeper */
|
||||
GetLatestNeonFeedback(&quorumFeedback.rf, wp);
|
||||
SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize);
|
||||
}
|
||||
|
||||
if (commitLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn)
|
||||
{
|
||||
|
||||
if (commitLsn > quorumFeedback.flushLsn)
|
||||
quorumFeedback.flushLsn = commitLsn;
|
||||
|
||||
/* advance the replication slot */
|
||||
if (!wp->config->syncSafekeepers)
|
||||
ProcessStandbyReply(
|
||||
/* write_lsn - This is what durably stored in WAL service. */
|
||||
quorumFeedback.flushLsn,
|
||||
/* flush_lsn - This is what durably stored in WAL service. */
|
||||
quorumFeedback.flushLsn,
|
||||
/*
|
||||
* Advance the replication slot to commitLsn. WAL before it is
|
||||
* hardened and will be fetched from one of safekeepers by
|
||||
* neon_walreader if needed.
|
||||
*
|
||||
* Also wakes up syncrep waiters.
|
||||
*/
|
||||
ProcessStandbyReply(
|
||||
/* write_lsn - This is what durably stored in WAL service. */
|
||||
quorumFeedback.flushLsn,
|
||||
/* flush_lsn - This is what durably stored in WAL service. */
|
||||
quorumFeedback.flushLsn,
|
||||
|
||||
/*
|
||||
* apply_lsn - This is what processed and durably saved at*
|
||||
* pageserver.
|
||||
*/
|
||||
quorumFeedback.rf.disk_consistent_lsn,
|
||||
walprop_pg_get_current_timestamp(wp), false);
|
||||
/*
|
||||
* apply_lsn - This is what processed and durably saved at*
|
||||
* pageserver.
|
||||
*/
|
||||
quorumFeedback.rf.disk_consistent_lsn,
|
||||
walprop_pg_get_current_timestamp(wp), false);
|
||||
}
|
||||
|
||||
CombineHotStanbyFeedbacks(&hsFeedback, wp);
|
||||
if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &quorumFeedback.hs, sizeof hsFeedback) != 0)
|
||||
{
|
||||
quorumFeedback.hs = hsFeedback;
|
||||
if (!wp->config->syncSafekeepers)
|
||||
ProcessStandbyHSFeedback(hsFeedback.ts,
|
||||
XidFromFullTransactionId(hsFeedback.xmin),
|
||||
EpochFromFullTransactionId(hsFeedback.xmin),
|
||||
XidFromFullTransactionId(hsFeedback.catalog_xmin),
|
||||
EpochFromFullTransactionId(hsFeedback.catalog_xmin));
|
||||
ProcessStandbyHSFeedback(hsFeedback.ts,
|
||||
XidFromFullTransactionId(hsFeedback.xmin),
|
||||
EpochFromFullTransactionId(hsFeedback.xmin),
|
||||
XidFromFullTransactionId(hsFeedback.catalog_xmin),
|
||||
EpochFromFullTransactionId(hsFeedback.catalog_xmin));
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
walprop_pg_confirm_wal_streamed(WalProposer *wp, XLogRecPtr lsn)
|
||||
{
|
||||
if (MyReplicationSlot)
|
||||
PhysicalConfirmReceivedLocation(lsn);
|
||||
}
|
||||
|
||||
static XLogRecPtr
|
||||
walprop_pg_get_redo_start_lsn(WalProposer *wp)
|
||||
{
|
||||
@@ -1694,15 +1962,15 @@ walprop_pg_log_internal(WalProposer *wp, int level, const char *line)
|
||||
elog(FATAL, "unexpected log_internal message at level %d: %s", level, line);
|
||||
}
|
||||
|
||||
static void
|
||||
walprop_pg_after_election(WalProposer *wp)
|
||||
static XLogRecPtr
|
||||
GetLogRepRestartLSN(WalProposer *wp)
|
||||
{
|
||||
FILE *f;
|
||||
XLogRecPtr lrRestartLsn;
|
||||
XLogRecPtr lrRestartLsn = InvalidXLogRecPtr;
|
||||
|
||||
/* We don't need to do anything in syncSafekeepers mode. */
|
||||
if (wp->config->syncSafekeepers)
|
||||
return;
|
||||
return InvalidXLogRecPtr;
|
||||
|
||||
/*
|
||||
* If there are active logical replication subscription we need to provide
|
||||
@@ -1710,22 +1978,40 @@ walprop_pg_after_election(WalProposer *wp)
|
||||
* replication slots.
|
||||
*/
|
||||
f = fopen("restart.lsn", "rb");
|
||||
if (f != NULL && !wp->config->syncSafekeepers)
|
||||
if (f != NULL)
|
||||
{
|
||||
fread(&lrRestartLsn, sizeof(lrRestartLsn), 1, f);
|
||||
size_t rc = fread(&lrRestartLsn, sizeof(lrRestartLsn), 1, f);
|
||||
|
||||
fclose(f);
|
||||
if (lrRestartLsn != InvalidXLogRecPtr)
|
||||
if (rc == 1 && lrRestartLsn != InvalidXLogRecPtr)
|
||||
{
|
||||
elog(LOG, "Logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn));
|
||||
uint64 download_range_mb;
|
||||
|
||||
elog(LOG, "logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn));
|
||||
|
||||
/*
|
||||
* If we need to download more than a max_slot_wal_keep_size,
|
||||
* don't do it to avoid risk of exploding pg_wal. Logical
|
||||
* replication won't work until recreated, but at least compute
|
||||
* would start; this also follows max_slot_wal_keep_size
|
||||
* semantics.
|
||||
*/
|
||||
download_range_mb = (wp->propEpochStartLsn - lrRestartLsn) / MB;
|
||||
if (max_slot_wal_keep_size_mb > 0 && download_range_mb >= max_slot_wal_keep_size_mb)
|
||||
{
|
||||
walprop_log(WARNING, "not downloading WAL for logical replication since %X/%X as max_slot_wal_keep_size=%dMB",
|
||||
LSN_FORMAT_ARGS(lrRestartLsn), max_slot_wal_keep_size_mb);
|
||||
return InvalidXLogRecPtr;
|
||||
}
|
||||
|
||||
/*
|
||||
* start from the beginning of the segment to fetch page headers
|
||||
* verified by XLogReader
|
||||
*/
|
||||
lrRestartLsn = lrRestartLsn - XLogSegmentOffset(lrRestartLsn, wal_segment_size);
|
||||
wp->truncateLsn = Min(wp->truncateLsn, lrRestartLsn);
|
||||
}
|
||||
}
|
||||
return lrRestartLsn;
|
||||
}
|
||||
|
||||
static const walproposer_api walprop_pg = {
|
||||
@@ -1745,18 +2031,18 @@ static const walproposer_api walprop_pg = {
|
||||
.conn_async_write = walprop_async_write,
|
||||
.conn_blocking_write = walprop_blocking_write,
|
||||
.recovery_download = WalProposerRecovery,
|
||||
.wal_read = walprop_pg_wal_read,
|
||||
.wal_reader_allocate = walprop_pg_wal_reader_allocate,
|
||||
.free_event_set = walprop_pg_free_event_set,
|
||||
.wal_read = walprop_pg_wal_read,
|
||||
.wal_reader_events = walprop_pg_wal_reader_events,
|
||||
.init_event_set = walprop_pg_init_event_set,
|
||||
.update_event_set = walprop_pg_update_event_set,
|
||||
.active_state_update_event_set = walprop_pg_active_state_update_event_set,
|
||||
.add_safekeeper_event_set = walprop_pg_add_safekeeper_event_set,
|
||||
.rm_safekeeper_event_set = walprop_pg_rm_safekeeper_event_set,
|
||||
.wait_event_set = walprop_pg_wait_event_set,
|
||||
.strong_random = walprop_pg_strong_random,
|
||||
.get_redo_start_lsn = walprop_pg_get_redo_start_lsn,
|
||||
.finish_sync_safekeepers = walprop_pg_finish_sync_safekeepers,
|
||||
.process_safekeeper_feedback = walprop_pg_process_safekeeper_feedback,
|
||||
.confirm_wal_streamed = walprop_pg_confirm_wal_streamed,
|
||||
.log_internal = walprop_pg_log_internal,
|
||||
.after_election = walprop_pg_after_election,
|
||||
};
|
||||
|
||||
@@ -11,6 +11,7 @@ use proxy::http;
|
||||
use proxy::rate_limiter::EndpointRateLimiter;
|
||||
use proxy::rate_limiter::RateBucketInfo;
|
||||
use proxy::rate_limiter::RateLimiterConfig;
|
||||
use proxy::serverless::GlobalConnPoolOptions;
|
||||
use proxy::usage_metrics;
|
||||
|
||||
use anyhow::bail;
|
||||
@@ -95,12 +96,8 @@ struct ProxyCliArgs {
|
||||
/// Allow self-signed certificates for compute nodes (for testing)
|
||||
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
|
||||
allow_self_signed_compute: bool,
|
||||
/// timeout for http connections
|
||||
#[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
|
||||
sql_over_http_timeout: tokio::time::Duration,
|
||||
/// Whether the SQL over http pool is opt-in
|
||||
#[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
|
||||
sql_over_http_pool_opt_in: bool,
|
||||
#[clap(flatten)]
|
||||
sql_over_http: SqlOverHttpArgs,
|
||||
/// timeout for scram authentication protocol
|
||||
#[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
|
||||
scram_protocol_timeout: tokio::time::Duration,
|
||||
@@ -138,6 +135,36 @@ struct ProxyCliArgs {
|
||||
disable_ip_check_for_http: bool,
|
||||
}
|
||||
|
||||
#[derive(clap::Args, Clone, Copy, Debug)]
|
||||
struct SqlOverHttpArgs {
|
||||
/// timeout for http connection requests
|
||||
#[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
|
||||
sql_over_http_timeout: tokio::time::Duration,
|
||||
|
||||
/// Whether the SQL over http pool is opt-in
|
||||
#[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
|
||||
sql_over_http_pool_opt_in: bool,
|
||||
|
||||
/// How many connections to pool for each endpoint. Excess connections are discarded
|
||||
#[clap(long, default_value_t = 20)]
|
||||
sql_over_http_pool_max_conns_per_endpoint: usize,
|
||||
|
||||
/// How long pooled connections should remain idle for before closing
|
||||
#[clap(long, default_value = "5m", value_parser = humantime::parse_duration)]
|
||||
sql_over_http_idle_timeout: tokio::time::Duration,
|
||||
|
||||
/// Duration each shard will wait on average before a GC sweep.
|
||||
/// A longer time will causes sweeps to take longer but will interfere less frequently.
|
||||
#[clap(long, default_value = "10m", value_parser = humantime::parse_duration)]
|
||||
sql_over_http_pool_gc_epoch: tokio::time::Duration,
|
||||
|
||||
/// How many shards should the global pool have. Must be a power of two.
|
||||
/// More shards will introduce less contention for pool operations, but can
|
||||
/// increase memory used by the pool
|
||||
#[clap(long, default_value_t = 128)]
|
||||
sql_over_http_pool_shards: usize,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let _logging_guard = proxy::logging::init().await?;
|
||||
@@ -327,8 +354,14 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
}
|
||||
};
|
||||
let http_config = HttpConfig {
|
||||
timeout: args.sql_over_http_timeout,
|
||||
pool_opt_in: args.sql_over_http_pool_opt_in,
|
||||
request_timeout: args.sql_over_http.sql_over_http_timeout,
|
||||
pool_options: GlobalConnPoolOptions {
|
||||
max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint,
|
||||
gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch,
|
||||
pool_shards: args.sql_over_http.sql_over_http_pool_shards,
|
||||
idle_timeout: args.sql_over_http.sql_over_http_idle_timeout,
|
||||
opt_in: args.sql_over_http.sql_over_http_pool_opt_in,
|
||||
},
|
||||
};
|
||||
let authentication_config = AuthenticationConfig {
|
||||
scram_protocol_timeout: args.scram_protocol_timeout,
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use crate::{auth, rate_limiter::RateBucketInfo};
|
||||
use crate::{auth, rate_limiter::RateBucketInfo, serverless::GlobalConnPoolOptions};
|
||||
use anyhow::{bail, ensure, Context, Ok};
|
||||
use rustls::{sign, Certificate, PrivateKey};
|
||||
use sha2::{Digest, Sha256};
|
||||
@@ -36,8 +36,8 @@ pub struct TlsConfig {
|
||||
}
|
||||
|
||||
pub struct HttpConfig {
|
||||
pub timeout: tokio::time::Duration,
|
||||
pub pool_opt_in: bool,
|
||||
pub request_timeout: tokio::time::Duration,
|
||||
pub pool_options: GlobalConnPoolOptions,
|
||||
}
|
||||
|
||||
pub struct AuthenticationConfig {
|
||||
|
||||
@@ -11,7 +11,7 @@ use crate::{auth::backend::ComputeUserInfo, compute, http, scram};
|
||||
use async_trait::async_trait;
|
||||
use futures::TryFutureExt;
|
||||
use itertools::Itertools;
|
||||
use std::{net::SocketAddr, sync::Arc};
|
||||
use std::sync::Arc;
|
||||
use tokio::time::Instant;
|
||||
use tokio_postgres::config::SslMode;
|
||||
use tracing::{error, info, info_span, warn, Instrument};
|
||||
@@ -141,7 +141,7 @@ impl Api {
|
||||
// We'll set username and such later using the startup message.
|
||||
// TODO: add more type safety (in progress).
|
||||
let mut config = compute::ConnCfg::new();
|
||||
config.host(&host).port(port).ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes.
|
||||
config.host(host).port(port).ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes.
|
||||
|
||||
let node = NodeInfo {
|
||||
config,
|
||||
@@ -269,9 +269,10 @@ async fn parse_body<T: for<'a> serde::Deserialize<'a>>(
|
||||
Err(ApiError::Console { status, text })
|
||||
}
|
||||
|
||||
fn parse_host_port(input: &str) -> Option<(String, u16)> {
let parsed: SocketAddr = input.parse().ok()?;
Some((parsed.ip().to_string(), parsed.port()))
fn parse_host_port(input: &str) -> Option<(&str, u16)> {
let (host, port) = input.rsplit_once(':')?;
let ipv6_brackets: &[_] = &['[', ']'];
Some((host.trim_matches(ipv6_brackets), port.parse().ok()?))
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -279,9 +280,24 @@ mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_parse_host_port() {
|
||||
fn test_parse_host_port_v4() {
|
||||
let (host, port) = parse_host_port("127.0.0.1:5432").expect("failed to parse");
|
||||
assert_eq!(host, "127.0.0.1");
|
||||
assert_eq!(port, 5432);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_host_port_v6() {
|
||||
let (host, port) = parse_host_port("[2001:db8::1]:5432").expect("failed to parse");
|
||||
assert_eq!(host, "2001:db8::1");
|
||||
assert_eq!(port, 5432);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_host_port_url() {
|
||||
let (host, port) = parse_host_port("compute-foo-bar-1234.default.svc.cluster.local:5432")
|
||||
.expect("failed to parse");
|
||||
assert_eq!(host, "compute-foo-bar-1234.default.svc.cluster.local");
|
||||
assert_eq!(port, 5432);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,9 +6,13 @@ mod conn_pool;
|
||||
mod sql_over_http;
|
||||
mod websocket;
|
||||
|
||||
pub use conn_pool::GlobalConnPoolOptions;
|
||||
|
||||
use anyhow::bail;
|
||||
use hyper::StatusCode;
|
||||
use metrics::IntCounterPairGuard;
|
||||
use rand::rngs::StdRng;
|
||||
use rand::SeedableRng;
|
||||
pub use reqwest_middleware::{ClientWithMiddleware, Error};
|
||||
pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
|
||||
use tokio_util::task::TaskTracker;
|
||||
@@ -47,6 +51,11 @@ pub async fn task_main(
|
||||
|
||||
let conn_pool = conn_pool::GlobalConnPool::new(config);
|
||||
|
||||
let conn_pool2 = Arc::clone(&conn_pool);
|
||||
tokio::spawn(async move {
|
||||
conn_pool2.gc_worker(StdRng::from_entropy()).await;
|
||||
});
|
||||
|
||||
// shutdown the connection pool
|
||||
tokio::spawn({
|
||||
let cancellation_token = cancellation_token.clone();
|
||||
|
||||
@@ -1,15 +1,19 @@
|
||||
use anyhow::{anyhow, Context};
|
||||
use async_trait::async_trait;
|
||||
use dashmap::DashMap;
|
||||
use futures::future::poll_fn;
|
||||
use futures::{future::poll_fn, Future};
|
||||
use metrics::{register_int_counter_pair, IntCounterPair, IntCounterPairGuard};
|
||||
use once_cell::sync::Lazy;
|
||||
use parking_lot::RwLock;
|
||||
use pbkdf2::{
|
||||
password_hash::{PasswordHashString, PasswordHasher, PasswordVerifier, SaltString},
|
||||
Params, Pbkdf2,
|
||||
};
|
||||
use pq_proto::StartupMessageParams;
|
||||
use prometheus::{exponential_buckets, register_histogram, Histogram};
|
||||
use rand::Rng;
|
||||
use smol_str::SmolStr;
|
||||
use std::{collections::HashMap, net::IpAddr, sync::Arc};
|
||||
use std::{collections::HashMap, net::IpAddr, pin::pin, sync::Arc, sync::Weak, time::Duration};
|
||||
use std::{
|
||||
fmt,
|
||||
task::{ready, Poll},
|
||||
@@ -18,7 +22,7 @@ use std::{
|
||||
ops::Deref,
|
||||
sync::atomic::{self, AtomicUsize},
|
||||
};
|
||||
use tokio::time;
|
||||
use tokio::time::{self, Instant};
|
||||
use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};
|
||||
|
||||
use crate::{
|
||||
@@ -30,11 +34,10 @@ use crate::{
|
||||
};
|
||||
use crate::{compute, config};
|
||||
|
||||
use tracing::{error, warn, Span};
|
||||
use tracing::{debug, error, warn, Span};
|
||||
use tracing::{info, info_span, Instrument};
|
||||
|
||||
pub const APP_NAME: &str = "/sql_over_http";
|
||||
const MAX_CONNS_PER_ENDPOINT: usize = 20;
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ConnInfo {
|
||||
@@ -69,6 +72,77 @@ struct ConnPoolEntry {
|
||||
pub struct EndpointConnPool {
|
||||
pools: HashMap<(SmolStr, SmolStr), DbUserConnPool>,
|
||||
total_conns: usize,
|
||||
max_conns: usize,
|
||||
_guard: IntCounterPairGuard,
|
||||
}
|
||||
|
||||
impl EndpointConnPool {
|
||||
fn get_conn_entry(&mut self, db_user: (SmolStr, SmolStr)) -> Option<ConnPoolEntry> {
|
||||
let Self {
|
||||
pools, total_conns, ..
|
||||
} = self;
|
||||
pools
|
||||
.get_mut(&db_user)
|
||||
.and_then(|pool_entries| pool_entries.get_conn_entry(total_conns))
|
||||
}
|
||||
|
||||
fn remove_client(&mut self, db_user: (SmolStr, SmolStr), conn_id: uuid::Uuid) -> bool {
|
||||
let Self {
|
||||
pools, total_conns, ..
|
||||
} = self;
|
||||
if let Some(pool) = pools.get_mut(&db_user) {
|
||||
let old_len = pool.conns.len();
|
||||
pool.conns.retain(|conn| conn.conn.conn_id != conn_id);
|
||||
let new_len = pool.conns.len();
|
||||
let removed = old_len - new_len;
|
||||
*total_conns -= removed;
|
||||
removed > 0
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
fn put(pool: &RwLock<Self>, conn_info: &ConnInfo, client: ClientInner) -> anyhow::Result<()> {
|
||||
let conn_id = client.conn_id;
|
||||
|
||||
if client.inner.is_closed() {
|
||||
info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// return connection to the pool
|
||||
let mut returned = false;
|
||||
let mut per_db_size = 0;
|
||||
let total_conns = {
|
||||
let mut pool = pool.write();
|
||||
|
||||
if pool.total_conns < pool.max_conns {
|
||||
// we create this db-user entry in get, so it should not be None
|
||||
if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) {
|
||||
pool_entries.conns.push(ConnPoolEntry {
|
||||
conn: client,
|
||||
_last_access: std::time::Instant::now(),
|
||||
});
|
||||
|
||||
returned = true;
|
||||
per_db_size = pool_entries.conns.len();
|
||||
|
||||
pool.total_conns += 1;
|
||||
}
|
||||
}
|
||||
|
||||
pool.total_conns
|
||||
};
|
||||
|
||||
// do logging outside of the mutex
|
||||
if returned {
|
||||
info!(%conn_id, "pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}");
|
||||
} else {
|
||||
info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// 4096 is the number of rounds that SCRAM-SHA-256 recommends.
|
||||
@@ -87,6 +161,27 @@ pub struct DbUserConnPool {
|
||||
password_hash: Option<PasswordHashString>,
|
||||
}
|
||||
|
||||
impl DbUserConnPool {
|
||||
fn clear_closed_clients(&mut self, conns: &mut usize) {
|
||||
let old_len = self.conns.len();
|
||||
|
||||
self.conns.retain(|conn| !conn.conn.inner.is_closed());
|
||||
|
||||
let new_len = self.conns.len();
|
||||
let removed = old_len - new_len;
|
||||
*conns -= removed;
|
||||
}
|
||||
|
||||
fn get_conn_entry(&mut self, conns: &mut usize) -> Option<ConnPoolEntry> {
|
||||
self.clear_closed_clients(conns);
|
||||
let conn = self.conns.pop();
|
||||
if conn.is_some() {
|
||||
*conns -= 1;
|
||||
}
|
||||
conn
|
||||
}
|
||||
}
|
||||
|
||||
pub struct GlobalConnPool {
|
||||
// endpoint -> per-endpoint connection pool
|
||||
//
|
||||
@@ -94,52 +189,127 @@ pub struct GlobalConnPool {
|
||||
// pool as early as possible and release the lock.
|
||||
global_pool: DashMap<SmolStr, Arc<RwLock<EndpointConnPool>>>,
|
||||
|
||||
/// Number of endpoint-connection pools
|
||||
///
|
||||
/// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each.
|
||||
/// That seems like far too much effort, so we're using a relaxed increment counter instead.
|
||||
/// It's only used for diagnostics.
|
||||
global_pool_size: AtomicUsize,
|
||||
|
||||
proxy_config: &'static crate::config::ProxyConfig,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct GlobalConnPoolOptions {
|
||||
// Maximum number of connections per one endpoint.
|
||||
// Can mix different (dbname, username) connections.
|
||||
// When running out of free slots for a particular endpoint,
|
||||
// falls back to opening a new connection for each request.
|
||||
max_conns_per_endpoint: usize,
|
||||
pub max_conns_per_endpoint: usize,
|
||||
|
||||
proxy_config: &'static crate::config::ProxyConfig,
|
||||
pub gc_epoch: Duration,
|
||||
|
||||
// Using a lock to remove any race conditions.
|
||||
// Eg cleaning up connections while a new connection is returned
|
||||
closed: RwLock<bool>,
|
||||
pub pool_shards: usize,
|
||||
|
||||
pub idle_timeout: Duration,
|
||||
|
||||
pub opt_in: bool,
|
||||
}
|
||||
|
||||
pub static GC_LATENCY: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"proxy_http_pool_reclaimation_lag_seconds",
|
||||
"Time it takes to reclaim unused connection pools",
|
||||
// 1us -> 65ms
|
||||
exponential_buckets(1e-6, 2.0, 16).unwrap(),
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub static ENDPOINT_POOLS: Lazy<IntCounterPair> = Lazy::new(|| {
|
||||
register_int_counter_pair!(
|
||||
"proxy_http_pool_endpoints_registered_total",
|
||||
"Number of endpoints we have registered pools for",
|
||||
"proxy_http_pool_endpoints_unregistered_total",
|
||||
"Number of endpoints we have unregistered pools for",
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
impl GlobalConnPool {
|
||||
pub fn new(config: &'static crate::config::ProxyConfig) -> Arc<Self> {
|
||||
let shards = config.http_config.pool_options.pool_shards;
|
||||
Arc::new(Self {
|
||||
global_pool: DashMap::new(),
|
||||
global_pool: DashMap::with_shard_amount(shards),
|
||||
global_pool_size: AtomicUsize::new(0),
|
||||
max_conns_per_endpoint: MAX_CONNS_PER_ENDPOINT,
|
||||
proxy_config: config,
|
||||
closed: RwLock::new(false),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn shutdown(&self) {
|
||||
*self.closed.write() = true;
|
||||
// drops all strong references to endpoint-pools
|
||||
self.global_pool.clear();
|
||||
}
|
||||
|
||||
self.global_pool.retain(|_, endpoint_pool| {
|
||||
let mut pool = endpoint_pool.write();
|
||||
// by clearing this hashmap, we remove the slots that a connection can be returned to.
|
||||
// when returning, it drops the connection if the slot doesn't exist
|
||||
pool.pools.clear();
|
||||
pool.total_conns = 0;
|
||||
pub async fn gc_worker(&self, mut rng: impl Rng) {
let epoch = self.proxy_config.http_config.pool_options.gc_epoch;
let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32);
loop {
interval.tick().await;

false
let shard = rng.gen_range(0..self.global_pool.shards().len());
self.gc(shard);
}
}

fn gc(&self, shard: usize) {
debug!(shard, "pool: performing epoch reclamation");

// acquire a random shard lock
let mut shard = self.global_pool.shards()[shard].write();

let timer = GC_LATENCY.start_timer();
let current_len = shard.len();
shard.retain(|endpoint, x| {
// if the current endpoint pool is unique (no other strong or weak references)
// then it is currently not in use by any connections.
if let Some(pool) = Arc::get_mut(x.get_mut()) {
let EndpointConnPool {
pools, total_conns, ..
} = pool.get_mut();

// ensure that closed clients are removed
pools
.iter_mut()
.for_each(|(_, db_pool)| db_pool.clear_closed_clients(total_conns));

// we only remove this pool if it has no active connections
if *total_conns == 0 {
info!("pool: discarding pool for endpoint {endpoint}");
return false;
}
}

true
});
let new_len = shard.len();
drop(shard);
timer.observe_duration();

let removed = current_len - new_len;

if removed > 0 {
let global_pool_size = self
.global_pool_size
.fetch_sub(removed, atomic::Ordering::Relaxed)
- removed;
info!("pool: performed global pool gc. size now {global_pool_size}");
}
}
|
||||
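
A rough reading of the gc_worker/gc pair above: one full sweep of the global pool is spread across the configured gc_epoch by ticking once per DashMap shard and sweeping a single randomly chosen shard per tick. A minimal standalone sketch of that scheduling idea (gc_loop and sweep_shard are hypothetical names, not the proxy's API; assumes the tokio and rand crates):

use rand::Rng;
use std::time::Duration;

// With epoch = 10 minutes and 128 shards, ticks land roughly every 4.7s and
// each shard is swept on average once per epoch.
async fn gc_loop(epoch: Duration, shard_count: usize, mut rng: impl Rng) {
    let mut interval = tokio::time::interval(epoch / shard_count as u32);
    loop {
        interval.tick().await;
        let shard = rng.gen_range(0..shard_count);
        sweep_shard(shard); // stand-in for reclaiming unused pools in one shard
    }
}

fn sweep_shard(_shard: usize) { /* per-shard reclamation would go here */ }
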
pub async fn get(
|
||||
self: &Arc<Self>,
|
||||
conn_info: &ConnInfo,
|
||||
conn_info: ConnInfo,
|
||||
force_new: bool,
|
||||
session_id: uuid::Uuid,
|
||||
peer_addr: IpAddr,
|
||||
@@ -147,15 +317,11 @@ impl GlobalConnPool {
|
||||
let mut client: Option<ClientInner> = None;
|
||||
let mut latency_timer = LatencyTimer::new("http");
|
||||
|
||||
let pool = if force_new {
|
||||
None
|
||||
} else {
|
||||
Some((conn_info.clone(), self.clone()))
|
||||
};
|
||||
|
||||
let mut hash_valid = false;
|
||||
let mut endpoint_pool = Weak::new();
|
||||
if !force_new {
|
||||
let pool = self.get_or_create_endpoint_pool(&conn_info.hostname);
|
||||
endpoint_pool = Arc::downgrade(&pool);
|
||||
let mut hash = None;
|
||||
|
||||
// find a pool entry by (dbname, username) if exists
|
||||
@@ -180,12 +346,8 @@ impl GlobalConnPool {
|
||||
// we will continue with the regular connection flow
|
||||
if validate.is_ok() {
|
||||
hash_valid = true;
|
||||
let mut pool = pool.write();
|
||||
if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) {
|
||||
if let Some(entry) = pool_entries.conns.pop() {
|
||||
client = Some(entry.conn);
|
||||
pool.total_conns -= 1;
|
||||
}
|
||||
if let Some(entry) = pool.write().get_conn_entry(conn_info.db_and_user()) {
|
||||
client = Some(entry.conn)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -198,11 +360,12 @@ impl GlobalConnPool {
|
||||
info!(%conn_id, "pool: cached connection '{conn_info}' is closed, opening a new one");
|
||||
connect_to_compute(
|
||||
self.proxy_config,
|
||||
conn_info,
|
||||
&conn_info,
|
||||
conn_id,
|
||||
session_id,
|
||||
latency_timer,
|
||||
peer_addr,
|
||||
endpoint_pool.clone(),
|
||||
)
|
||||
.await
|
||||
} else {
|
||||
@@ -214,18 +377,19 @@ impl GlobalConnPool {
|
||||
);
|
||||
latency_timer.pool_hit();
|
||||
latency_timer.success();
|
||||
return Ok(Client::new(client, pool).await);
|
||||
return Ok(Client::new(client, conn_info, endpoint_pool).await);
|
||||
}
|
||||
} else {
|
||||
let conn_id = uuid::Uuid::new_v4();
|
||||
info!(%conn_id, "pool: opening a new connection '{conn_info}'");
|
||||
connect_to_compute(
|
||||
self.proxy_config,
|
||||
conn_info,
|
||||
&conn_info,
|
||||
conn_id,
|
||||
session_id,
|
||||
latency_timer,
|
||||
peer_addr,
|
||||
endpoint_pool.clone(),
|
||||
)
|
||||
.await
|
||||
};
|
||||
@@ -269,59 +433,7 @@ impl GlobalConnPool {
|
||||
_ => {}
|
||||
}
|
||||
let new_client = new_client?;
|
||||
Ok(Client::new(new_client, pool).await)
|
||||
}
|
||||
|
||||
fn put(&self, conn_info: &ConnInfo, client: ClientInner) -> anyhow::Result<()> {
|
||||
let conn_id = client.conn_id;
|
||||
|
||||
// We want to hold this open while we return. This ensures that the pool can't close
|
||||
// while we are in the middle of returning the connection.
|
||||
let closed = self.closed.read();
|
||||
if *closed {
|
||||
info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is closed");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
if client.inner.is_closed() {
|
||||
info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let pool = self.get_or_create_endpoint_pool(&conn_info.hostname);
|
||||
|
||||
// return connection to the pool
|
||||
let mut returned = false;
|
||||
let mut per_db_size = 0;
|
||||
let total_conns = {
|
||||
let mut pool = pool.write();
|
||||
|
||||
if pool.total_conns < self.max_conns_per_endpoint {
|
||||
// we create this db-user entry in get, so it should not be None
|
||||
if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) {
|
||||
pool_entries.conns.push(ConnPoolEntry {
|
||||
conn: client,
|
||||
_last_access: std::time::Instant::now(),
|
||||
});
|
||||
|
||||
returned = true;
|
||||
per_db_size = pool_entries.conns.len();
|
||||
|
||||
pool.total_conns += 1;
|
||||
}
|
||||
}
|
||||
|
||||
pool.total_conns
|
||||
};
|
||||
|
||||
// do logging outside of the mutex
|
||||
if returned {
|
||||
info!(%conn_id, "pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}");
|
||||
} else {
|
||||
info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
Ok(Client::new(new_client, conn_info, endpoint_pool).await)
|
||||
}
|
||||
|
||||
fn get_or_create_endpoint_pool(&self, endpoint: &SmolStr) -> Arc<RwLock<EndpointConnPool>> {
|
||||
@@ -334,6 +446,12 @@ impl GlobalConnPool {
|
||||
let new_pool = Arc::new(RwLock::new(EndpointConnPool {
|
||||
pools: HashMap::new(),
|
||||
total_conns: 0,
|
||||
max_conns: self
|
||||
.proxy_config
|
||||
.http_config
|
||||
.pool_options
|
||||
.max_conns_per_endpoint,
|
||||
_guard: ENDPOINT_POOLS.guard(),
|
||||
}));
|
||||
|
||||
// find or create a pool for this endpoint
|
||||
@@ -363,9 +481,11 @@ impl GlobalConnPool {
|
||||
}
|
||||
|
||||
struct TokioMechanism<'a> {
|
||||
pool: Weak<RwLock<EndpointConnPool>>,
|
||||
conn_info: &'a ConnInfo,
|
||||
session_id: uuid::Uuid,
|
||||
conn_id: uuid::Uuid,
|
||||
idle: Duration,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
@@ -385,6 +505,8 @@ impl ConnectMechanism for TokioMechanism<'_> {
|
||||
timeout,
|
||||
self.conn_id,
|
||||
self.session_id,
|
||||
self.pool.clone(),
|
||||
self.idle,
|
||||
)
|
||||
.await
|
||||
}
|
||||
@@ -403,6 +525,7 @@ async fn connect_to_compute(
|
||||
session_id: uuid::Uuid,
|
||||
latency_timer: LatencyTimer,
|
||||
peer_addr: IpAddr,
|
||||
pool: Weak<RwLock<EndpointConnPool>>,
|
||||
) -> anyhow::Result<ClientInner> {
|
||||
let tls = config.tls_config.as_ref();
|
||||
let common_names = tls.and_then(|tls| tls.common_names.clone());
|
||||
@@ -447,6 +570,8 @@ async fn connect_to_compute(
|
||||
conn_id,
|
||||
conn_info,
|
||||
session_id,
|
||||
pool,
|
||||
idle: config.http_config.pool_options.idle_timeout,
|
||||
},
|
||||
node_info,
|
||||
&extra,
|
||||
@@ -462,6 +587,8 @@ async fn connect_to_compute_once(
|
||||
timeout: time::Duration,
|
||||
conn_id: uuid::Uuid,
|
||||
mut session: uuid::Uuid,
|
||||
pool: Weak<RwLock<EndpointConnPool>>,
|
||||
idle: Duration,
|
||||
) -> Result<ClientInner, tokio_postgres::Error> {
|
||||
let mut config = (*node_info.config).clone();
|
||||
|
||||
@@ -490,13 +617,29 @@ async fn connect_to_compute_once(
|
||||
branch_id: node_info.aux.branch_id.clone(),
|
||||
};
|
||||
|
||||
let db_user = conn_info.db_and_user();
tokio::spawn(
async move {
let _conn_gauge = conn_gauge;
let mut idle_timeout = pin!(tokio::time::sleep(idle));
poll_fn(move |cx| {
if matches!(rx.has_changed(), Ok(true)) {
session = *rx.borrow_and_update();
info!(%session, "changed session");
idle_timeout.as_mut().reset(Instant::now() + idle);
}

// 5 minute idle connection timeout
if idle_timeout.as_mut().poll(cx).is_ready() {
idle_timeout.as_mut().reset(Instant::now() + idle);
info!("connection idle");
if let Some(pool) = pool.clone().upgrade() {
// remove client from pool - should close the connection if it's idle.
// does nothing if the client is currently checked-out and in-use
if pool.write().remove_client(db_user.clone(), conn_id) {
info!("idle connection removed");
}
}
}
|
||||
loop {
|
||||
@@ -514,15 +657,25 @@ async fn connect_to_compute_once(
|
||||
}
|
||||
Some(Err(e)) => {
|
||||
error!(%session, "connection error: {}", e);
|
||||
return Poll::Ready(())
|
||||
break
|
||||
}
|
||||
None => {
|
||||
info!("connection closed");
|
||||
return Poll::Ready(())
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}).await
|
||||
|
||||
// remove from connection pool
|
||||
if let Some(pool) = pool.clone().upgrade() {
|
||||
if pool.write().remove_client(db_user.clone(), conn_id) {
|
||||
info!("closed connection removed");
|
||||
}
|
||||
}
|
||||
|
||||
Poll::Ready(())
|
||||
}).await;
|
||||
|
||||
}
|
||||
.instrument(span)
|
||||
);
|
||||
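
The spawned connection task above keeps a pinned tokio::time::sleep as an idle deadline inside poll_fn: activity on the session watch channel pushes the deadline forward, and when the deadline fires the client is evicted from the pool. A simplified, self-contained sketch of that resettable-deadline pattern (wait_until_idle is a hypothetical name; in the real task the same poll_fn also drives the connection stream, which is what supplies wakeups):

use futures::future::poll_fn;
use std::future::Future;
use std::pin::pin;
use std::task::Poll;
use tokio::sync::watch;
use tokio::time::{sleep, Duration, Instant};

// Resolves once no activity has been observed for `idle`.
async fn wait_until_idle(mut activity: watch::Receiver<()>, idle: Duration) {
    let mut idle_timeout = pin!(sleep(idle));
    poll_fn(move |cx| {
        // Any observed activity pushes the deadline out again.
        if matches!(activity.has_changed(), Ok(true)) {
            activity.borrow_and_update();
            idle_timeout.as_mut().reset(Instant::now() + idle);
        }
        if idle_timeout.as_mut().poll(cx).is_ready() {
            // Idle period elapsed; the caller can drop the pooled connection.
            return Poll::Ready(());
        }
        Poll::Pending
    })
    .await
}
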
@@ -552,23 +705,27 @@ pub struct Client {
|
||||
conn_id: uuid::Uuid,
|
||||
span: Span,
|
||||
inner: Option<ClientInner>,
|
||||
pool: Option<(ConnInfo, Arc<GlobalConnPool>)>,
|
||||
conn_info: ConnInfo,
|
||||
pool: Weak<RwLock<EndpointConnPool>>,
|
||||
}
|
||||
|
||||
pub struct Discard<'a> {
|
||||
conn_id: uuid::Uuid,
|
||||
pool: &'a mut Option<(ConnInfo, Arc<GlobalConnPool>)>,
|
||||
conn_info: &'a ConnInfo,
|
||||
pool: &'a mut Weak<RwLock<EndpointConnPool>>,
|
||||
}
|
||||
|
||||
impl Client {
|
||||
pub(self) async fn new(
|
||||
inner: ClientInner,
|
||||
pool: Option<(ConnInfo, Arc<GlobalConnPool>)>,
|
||||
conn_info: ConnInfo,
|
||||
pool: Weak<RwLock<EndpointConnPool>>,
|
||||
) -> Self {
|
||||
Self {
|
||||
conn_id: inner.conn_id,
|
||||
inner: Some(inner),
|
||||
span: Span::current(),
|
||||
conn_info,
|
||||
pool,
|
||||
}
|
||||
}
|
||||
@@ -577,6 +734,7 @@ impl Client {
|
||||
inner,
|
||||
pool,
|
||||
conn_id,
|
||||
conn_info,
|
||||
span: _,
|
||||
} = self;
|
||||
(
|
||||
@@ -586,6 +744,7 @@ impl Client {
|
||||
.inner,
|
||||
Discard {
|
||||
pool,
|
||||
conn_info,
|
||||
conn_id: *conn_id,
|
||||
},
|
||||
)
|
||||
@@ -601,14 +760,14 @@ impl Client {
|
||||
|
||||
impl Discard<'_> {
|
||||
pub fn check_idle(&mut self, status: ReadyForQueryStatus) {
|
||||
if status != ReadyForQueryStatus::Idle {
|
||||
if let Some((conn_info, _)) = self.pool.take() {
|
||||
info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is not idle")
|
||||
}
|
||||
let conn_info = &self.conn_info;
|
||||
if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 {
|
||||
info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is not idle")
|
||||
}
|
||||
}
|
||||
pub fn discard(&mut self) {
|
||||
if let Some((conn_info, _)) = self.pool.take() {
|
||||
let conn_info = &self.conn_info;
|
||||
if std::mem::take(self.pool).strong_count() > 0 {
|
||||
info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is potentially in a broken state")
|
||||
}
|
||||
}
|
||||
@@ -628,16 +787,17 @@ impl Deref for Client {
|
||||
|
||||
impl Drop for Client {
|
||||
fn drop(&mut self) {
|
||||
let conn_info = self.conn_info.clone();
|
||||
let client = self
|
||||
.inner
|
||||
.take()
|
||||
.expect("client inner should not be removed");
|
||||
if let Some((conn_info, conn_pool)) = self.pool.take() {
|
||||
if let Some(conn_pool) = std::mem::take(&mut self.pool).upgrade() {
|
||||
let current_span = self.span.clone();
|
||||
// return connection to the pool
|
||||
tokio::task::spawn_blocking(move || {
|
||||
let _span = current_span.enter();
|
||||
let _ = conn_pool.put(&conn_info, client);
|
||||
let _ = EndpointConnPool::put(&conn_pool, &conn_info, client);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -206,7 +206,7 @@ pub async fn handle(
|
||||
config: &'static HttpConfig,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let result = tokio::time::timeout(
|
||||
config.timeout,
|
||||
config.request_timeout,
|
||||
handle_inner(
|
||||
config,
|
||||
request,
|
||||
@@ -278,7 +278,7 @@ pub async fn handle(
|
||||
Err(_) => {
|
||||
let message = format!(
|
||||
"HTTP-Connection timed out, execution time exeeded {} seconds",
|
||||
config.timeout.as_secs()
|
||||
config.request_timeout.as_secs()
|
||||
);
|
||||
error!(message);
|
||||
json_response(
|
||||
@@ -320,7 +320,8 @@ async fn handle_inner(
|
||||
|
||||
// Allow connection pooling only if explicitly requested
|
||||
// or if we have decided that http pool is no longer opt-in
|
||||
let allow_pool = !config.pool_opt_in || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);
|
||||
let allow_pool =
|
||||
!config.pool_options.opt_in || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);
|
||||
|
||||
// isolation level, read only and deferrable
|
||||
|
||||
@@ -359,7 +360,7 @@ async fn handle_inner(
|
||||
let payload: Payload = serde_json::from_slice(&body)?;
|
||||
|
||||
let mut client = conn_pool
|
||||
.get(&conn_info, !allow_pool, session_id, peer_addr)
|
||||
.get(conn_info, !allow_pool, session_id, peer_addr)
|
||||
.await?;
|
||||
|
||||
let mut response = Response::builder()
|
||||
|
||||
@@ -1,9 +1,12 @@
|
||||
use std::collections::HashSet;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
|
||||
use anyhow::Context;
|
||||
use aws_sdk_s3::{types::ObjectIdentifier, Client};
|
||||
use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
|
||||
use pageserver_api::shard::ShardIndex;
|
||||
use tracing::{error, info, warn};
|
||||
use utils::generation::Generation;
|
||||
use utils::id::TimelineId;
|
||||
|
||||
use crate::cloud_admin_api::BranchData;
|
||||
use crate::metadata_stream::stream_listing;
|
||||
@@ -40,7 +43,7 @@ impl TimelineAnalysis {
|
||||
|
||||
pub(crate) fn branch_cleanup_and_check_errors(
|
||||
id: &TenantShardTimelineId,
|
||||
s3_root: &RootTarget,
|
||||
tenant_objects: &mut TenantObjectListing,
|
||||
s3_active_branch: Option<&BranchData>,
|
||||
console_branch: Option<BranchData>,
|
||||
s3_data: Option<S3TimelineBlobData>,
|
||||
@@ -72,8 +75,8 @@ pub(crate) fn branch_cleanup_and_check_errors(
|
||||
match s3_data.blob_data {
|
||||
BlobDataParseResult::Parsed {
|
||||
index_part,
|
||||
index_part_generation,
|
||||
mut s3_layers,
|
||||
index_part_generation: _index_part_generation,
|
||||
s3_layers: _s3_layers,
|
||||
} => {
|
||||
if !IndexPart::KNOWN_VERSIONS.contains(&index_part.get_version()) {
|
||||
result.errors.push(format!(
|
||||
@@ -111,65 +114,19 @@ pub(crate) fn branch_cleanup_and_check_errors(
|
||||
))
|
||||
}
|
||||
|
||||
let layer_map_key = (layer, metadata.generation);
|
||||
if !s3_layers.remove(&layer_map_key) {
|
||||
if !tenant_objects.check_ref(id.timeline_id, &layer, &metadata) {
|
||||
// FIXME: this will emit false positives if an index was
|
||||
// uploaded concurrently with our scan. To make this check
|
||||
// correct, we need to try sending a HEAD request for the
|
||||
// layer we think is missing.
|
||||
result.errors.push(format!(
|
||||
"index_part.json contains a layer {}{} that is not present in remote storage",
|
||||
layer_map_key.0.file_name(),
|
||||
layer_map_key.1.get_suffix()
|
||||
"index_part.json contains a layer {}{} (shard {}) that is not present in remote storage",
|
||||
layer.file_name(),
|
||||
metadata.generation.get_suffix(),
|
||||
metadata.shard
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
let orphan_layers: Vec<(LayerFileName, Generation)> = s3_layers
|
||||
.into_iter()
|
||||
.filter(|(_layer_name, gen)|
|
||||
// A layer is only considered orphaned if it has a generation below
|
||||
// the index. If the generation is >= the index, then the layer may
|
||||
// be an upload from a running pageserver, or even an upload from
|
||||
// a new generation that didn't upload an index yet.
|
||||
//
|
||||
// Even so, a layer that is not referenced by the index could just
|
||||
// be something enqueued for deletion, so while this check is valid
|
||||
// for indicating that a layer is garbage, it is not an indicator
|
||||
// of a problem.
|
||||
gen < &index_part_generation)
|
||||
.collect();
|
||||
|
||||
if !orphan_layers.is_empty() {
|
||||
// An orphan layer is not an error: it's arguably not even a warning, but it is helpful to report
|
||||
// these as a hint that there is something worth cleaning up here.
|
||||
result.warnings.push(format!(
|
||||
"index_part.json does not contain layers from S3: {:?}",
|
||||
orphan_layers
|
||||
.iter()
|
||||
.map(|(layer_name, gen)| format!(
|
||||
"{}{}",
|
||||
layer_name.file_name(),
|
||||
gen.get_suffix()
|
||||
))
|
||||
.collect::<Vec<_>>(),
|
||||
));
|
||||
result.garbage_keys.extend(orphan_layers.iter().map(
|
||||
|(layer_name, layer_gen)| {
|
||||
let mut key = s3_root.timeline_root(id).prefix_in_bucket;
|
||||
let delimiter = s3_root.delimiter();
|
||||
if !key.ends_with(delimiter) {
|
||||
key.push_str(delimiter);
|
||||
}
|
||||
key.push_str(&format!(
|
||||
"{}{}",
|
||||
&layer_name.file_name(),
|
||||
layer_gen.get_suffix()
|
||||
));
|
||||
key
|
||||
},
|
||||
));
|
||||
}
|
||||
}
|
||||
BlobDataParseResult::Relic => {}
|
||||
BlobDataParseResult::Incorrect(parse_errors) => result.errors.extend(
|
||||
@@ -204,6 +161,83 @@ pub(crate) fn branch_cleanup_and_check_errors(
|
||||
result
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub(crate) struct LayerRef {
|
||||
ref_count: usize,
|
||||
}
|
||||
|
||||
/// Top-level index of objects in a tenant. This may be used by any shard-timeline within
|
||||
/// the tenant to query whether an object exists.
|
||||
#[derive(Default)]
|
||||
pub(crate) struct TenantObjectListing {
|
||||
shard_timelines:
|
||||
HashMap<(ShardIndex, TimelineId), HashMap<(LayerFileName, Generation), LayerRef>>,
|
||||
}
|
||||
|
||||
impl TenantObjectListing {
|
||||
/// Having done an S3 listing of the keys within a timeline prefix, merge them into the overall
|
||||
/// list of layer keys for the Tenant.
|
||||
pub(crate) fn push(
|
||||
&mut self,
|
||||
ttid: TenantShardTimelineId,
|
||||
layers: HashSet<(LayerFileName, Generation)>,
|
||||
) {
|
||||
let shard_index = ShardIndex::new(
|
||||
ttid.tenant_shard_id.shard_number,
|
||||
ttid.tenant_shard_id.shard_count,
|
||||
);
|
||||
let replaced = self.shard_timelines.insert(
|
||||
(shard_index, ttid.timeline_id),
|
||||
layers
|
||||
.into_iter()
|
||||
.map(|l| (l, LayerRef::default()))
|
||||
.collect(),
|
||||
);
|
||||
|
||||
assert!(
|
||||
replaced.is_none(),
|
||||
"Built from an S3 object listing, which should never repeat a key"
|
||||
);
|
||||
}
|
||||
|
||||
/// Having loaded a timeline index, check if a layer referenced by the index exists. If it does,
/// the layer's refcount will be incremented. Later, after calling this for all references in all indices
/// in a tenant, orphan layers may be detected by their zero refcounts.
///
/// Returns true if the layer exists
pub(crate) fn check_ref(
&mut self,
timeline_id: TimelineId,
layer_file: &LayerFileName,
metadata: &IndexLayerMetadata,
) -> bool {
let Some(shard_tl) = self.shard_timelines.get_mut(&(metadata.shard, timeline_id)) else {
return false;
};

let Some(layer_ref) = shard_tl.get_mut(&(layer_file.clone(), metadata.generation)) else {
return false;
};

layer_ref.ref_count += 1;

true
}

pub(crate) fn get_orphans(&self) -> Vec<(ShardIndex, TimelineId, LayerFileName, Generation)> {
let mut result = Vec::new();
for ((shard_index, timeline_id), layers) in &self.shard_timelines {
for ((layer_file, generation), layer_ref) in layers {
if layer_ref.ref_count == 0 {
result.push((*shard_index, *timeline_id, layer_file.clone(), *generation))
}
}
}

result
}
}
|
||||
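
check_ref and get_orphans together implement a straightforward reference-count sweep: every object found in the listing starts at a zero refcount, each reference found in an index bumps the matching count, and whatever is still at zero afterwards is a candidate orphan. A generic sketch of that idea with simplified string keys (find_orphans is illustrative only, not the scrubber's real types or API):

use std::collections::HashMap;

fn find_orphans(listed: Vec<String>, referenced: Vec<String>) -> Vec<String> {
    // Everything we listed starts out unreferenced.
    let mut refs: HashMap<String, usize> = listed.into_iter().map(|key| (key, 0)).collect();
    for key in referenced {
        if let Some(count) = refs.get_mut(&key) {
            *count += 1; // analogous to check_ref returning true
        }
    }
    // Anything never referenced by an index is a candidate orphan.
    refs.into_iter()
        .filter(|(_, count)| *count == 0)
        .map(|(key, _)| key)
        .collect()
}
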
#[derive(Debug)]
|
||||
pub(crate) struct S3TimelineBlobData {
|
||||
pub(crate) blob_data: BlobDataParseResult,
|
||||
|
||||
@@ -15,6 +15,7 @@ use anyhow::Context;
|
||||
use aws_config::environment::EnvironmentVariableCredentialsProvider;
|
||||
use aws_config::imds::credentials::ImdsCredentialsProvider;
|
||||
use aws_config::meta::credentials::CredentialsProviderChain;
|
||||
use aws_config::profile::ProfileFileCredentialsProvider;
|
||||
use aws_config::sso::SsoCredentialsProvider;
|
||||
use aws_config::BehaviorVersion;
|
||||
use aws_sdk_s3::config::Region;
|
||||
@@ -255,6 +256,11 @@ pub fn init_s3_client(account_id: Option<String>, bucket_region: Region) -> Clie
|
||||
let chain = CredentialsProviderChain::first_try(
|
||||
"env",
|
||||
EnvironmentVariableCredentialsProvider::new(),
|
||||
)
|
||||
// uses "AWS_PROFILE" / `aws sso login --profile <profile>`
|
||||
.or_else(
|
||||
"profile-sso",
|
||||
ProfileFileCredentialsProvider::builder().build(),
|
||||
);
|
||||
|
||||
// Use SSO if we were given an account ID
|
||||
@@ -265,7 +271,7 @@ pub fn init_s3_client(account_id: Option<String>, bucket_region: Region) -> Clie
|
||||
.account_id(sso_account)
|
||||
.role_name("PowerUserAccess")
|
||||
.start_url("https://neondb.awsapps.com/start")
|
||||
.region(Region::from_static("eu-central-1"))
|
||||
.region(bucket_region.clone())
|
||||
.build(),
|
||||
),
|
||||
None => chain,
|
||||
|
||||
@@ -2,22 +2,25 @@ use std::collections::{HashMap, HashSet};
|
||||
|
||||
use crate::checks::{
|
||||
branch_cleanup_and_check_errors, list_timeline_blobs, BlobDataParseResult, S3TimelineBlobData,
|
||||
TimelineAnalysis,
|
||||
TenantObjectListing, TimelineAnalysis,
|
||||
};
|
||||
use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
|
||||
use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId};
|
||||
use aws_sdk_s3::Client;
|
||||
use futures_util::{pin_mut, StreamExt, TryStreamExt};
|
||||
use histogram::Histogram;
|
||||
use pageserver::tenant::remote_timeline_client::remote_layer_path;
|
||||
use pageserver::tenant::IndexPart;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use serde::Serialize;
|
||||
use utils::id::TenantId;
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub struct MetadataSummary {
|
||||
count: usize,
|
||||
with_errors: HashSet<TenantShardTimelineId>,
|
||||
with_warnings: HashSet<TenantShardTimelineId>,
|
||||
with_garbage: HashSet<TenantShardTimelineId>,
|
||||
with_orphans: HashSet<TenantShardTimelineId>,
|
||||
indices_by_version: HashMap<usize, usize>,
|
||||
|
||||
layer_count: MinMaxHisto,
|
||||
@@ -87,7 +90,7 @@ impl MetadataSummary {
|
||||
count: 0,
|
||||
with_errors: HashSet::new(),
|
||||
with_warnings: HashSet::new(),
|
||||
with_garbage: HashSet::new(),
|
||||
with_orphans: HashSet::new(),
|
||||
indices_by_version: HashMap::new(),
|
||||
layer_count: MinMaxHisto::new(),
|
||||
timeline_size_bytes: MinMaxHisto::new(),
|
||||
@@ -141,6 +144,10 @@ impl MetadataSummary {
|
||||
}
|
||||
}
|
||||
|
||||
fn notify_timeline_orphan(&mut self, ttid: &TenantShardTimelineId) {
|
||||
self.with_orphans.insert(*ttid);
|
||||
}
|
||||
|
||||
/// Long-form output for printing at end of a scan
|
||||
pub fn summary_string(&self) -> String {
|
||||
let version_summary: String = itertools::join(
|
||||
@@ -154,7 +161,7 @@ impl MetadataSummary {
|
||||
"Timelines: {0}
|
||||
With errors: {1}
|
||||
With warnings: {2}
|
||||
With garbage: {3}
|
||||
With orphan layers: {3}
|
||||
Index versions: {version_summary}
|
||||
Timeline size bytes: {4}
|
||||
Layer size bytes: {5}
|
||||
@@ -163,7 +170,7 @@ Timeline layer count: {6}
|
||||
self.count,
|
||||
self.with_errors.len(),
|
||||
self.with_warnings.len(),
|
||||
self.with_garbage.len(),
|
||||
self.with_orphans.len(),
|
||||
self.timeline_size_bytes.oneline(),
|
||||
self.layer_size_bytes.oneline(),
|
||||
self.layer_count.oneline(),
|
||||
@@ -191,7 +198,7 @@ pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result<Metada
|
||||
|
||||
// Generate a stream of TenantTimelineId
|
||||
let timelines = tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, t));
|
||||
let timelines = timelines.try_buffer_unordered(CONCURRENCY);
|
||||
let timelines = timelines.try_buffered(CONCURRENCY);
|
||||
let timelines = timelines.try_flatten();
|
||||
|
||||
// Generate a stream of S3TimelineBlobData
|
||||
@@ -204,17 +211,118 @@ pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result<Metada
|
||||
Ok((ttid, data))
|
||||
}
|
||||
let timelines = timelines.map_ok(|ttid| report_on_timeline(&s3_client, &target, ttid));
|
||||
let timelines = timelines.try_buffer_unordered(CONCURRENCY);
|
||||
let timelines = timelines.try_buffered(CONCURRENCY);
|
||||
|
||||
// We must gather all the TenantShardTimelineId->S3TimelineBlobData for each tenant, because different
|
||||
// shards in the same tenant might refer to one anothers' keys if a shard split has happened.
|
||||
|
||||
let mut tenant_id = None;
|
||||
let mut tenant_objects = TenantObjectListing::default();
|
||||
let mut tenant_timeline_results = Vec::new();
|
||||
|
||||
fn analyze_tenant(
|
||||
tenant_id: TenantId,
|
||||
summary: &mut MetadataSummary,
|
||||
mut tenant_objects: TenantObjectListing,
|
||||
timelines: Vec<(TenantShardTimelineId, S3TimelineBlobData)>,
|
||||
) {
|
||||
let mut timeline_generations = HashMap::new();
|
||||
for (ttid, data) in timelines {
|
||||
// Stash the generation of each timeline, for later use identifying orphan layers
|
||||
if let BlobDataParseResult::Parsed {
|
||||
index_part: _index_part,
|
||||
index_part_generation,
|
||||
s3_layers: _s3_layers,
|
||||
} = &data.blob_data
|
||||
{
|
||||
timeline_generations.insert(ttid, *index_part_generation);
|
||||
}
|
||||
|
||||
// Apply checks to this timeline shard's metadata, and in the process update `tenant_objects`
|
||||
// reference counts for layers across the tenant.
|
||||
let analysis =
|
||||
branch_cleanup_and_check_errors(&ttid, &mut tenant_objects, None, None, Some(data));
|
||||
summary.update_analysis(&ttid, &analysis);
|
||||
}
|
||||
|
||||
// Identifying orphan layers must be done on a tenant-wide basis, because individual
|
||||
// shards' layers may be referenced by other shards.
|
||||
//
|
||||
// Orphan layers are not a corruption, and not an indication of a problem. They are just
|
||||
// consuming some space in remote storage, and may be cleaned up at leisure.
|
||||
for (shard_index, timeline_id, layer_file, generation) in tenant_objects.get_orphans() {
|
||||
let ttid = TenantShardTimelineId {
|
||||
tenant_shard_id: TenantShardId {
|
||||
tenant_id,
|
||||
shard_count: shard_index.shard_count,
|
||||
shard_number: shard_index.shard_number,
|
||||
},
|
||||
timeline_id,
|
||||
};
|
||||
|
||||
if let Some(timeline_generation) = timeline_generations.get(&ttid) {
|
||||
if &generation >= timeline_generation {
|
||||
// Candidate orphan layer is in the current or future generation relative
|
||||
// to the index we read for this timeline shard, so its absence from the index
|
||||
// doesn't make it an orphan: more likely, it is a case where the layer was
|
||||
// uploaded, but the index referencing the layer wasn't written yet.
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
let orphan_path = remote_layer_path(
|
||||
&tenant_id,
|
||||
&timeline_id,
|
||||
shard_index,
|
||||
&layer_file,
|
||||
generation,
|
||||
);
|
||||
|
||||
tracing::info!("Orphan layer detected: {orphan_path}");
|
||||
|
||||
summary.notify_timeline_orphan(&ttid);
|
||||
}
|
||||
}
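A minimal Python sketch of the generation rule described in the comment above (the helper name and types are illustrative, not part of the scrubber): a candidate layer whose generation is at or above the generation of the index we read is not reported, since the index that references it may simply not have been written yet.

```python
from typing import Optional

def is_reportable_orphan(layer_generation: int, index_generation: Optional[int]) -> bool:
    """Decide whether an unreferenced layer should be reported as an orphan.

    `index_generation` is None when no index could be parsed for the timeline
    shard; in that case the layer is still reported.
    """
    if index_generation is not None and layer_generation >= index_generation:
        # Current or future generation relative to the index we read: its
        # absence from that index is expected, so it is not treated as orphan.
        return False
    return True

# Layer from generation 7 with an index read at generation 5: skipped.
assert is_reportable_orphan(7, 5) is False
assert is_reportable_orphan(3, 5) is True
```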
|
||||
|
||||
// Iterate through all the timeline results. These are in key-order, so
|
||||
// all results for the same tenant will be adjacent. We accumulate these,
|
||||
// and then call `analyze_tenant` to flush when we see the next tenant ID.
|
||||
let mut summary = MetadataSummary::new();
|
||||
pin_mut!(timelines);
|
||||
while let Some(i) = timelines.next().await {
|
||||
let (ttid, data) = i?;
|
||||
summary.update_data(&data);
|
||||
|
||||
let analysis = branch_cleanup_and_check_errors(&ttid, &target, None, None, Some(data));
|
||||
match tenant_id {
|
||||
None => tenant_id = Some(ttid.tenant_shard_id.tenant_id),
|
||||
Some(prev_tenant_id) => {
|
||||
if prev_tenant_id != ttid.tenant_shard_id.tenant_id {
|
||||
let tenant_objects = std::mem::take(&mut tenant_objects);
|
||||
let timelines = std::mem::take(&mut tenant_timeline_results);
|
||||
analyze_tenant(prev_tenant_id, &mut summary, tenant_objects, timelines);
|
||||
tenant_id = Some(ttid.tenant_shard_id.tenant_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
summary.update_analysis(&ttid, &analysis);
|
||||
if let BlobDataParseResult::Parsed {
|
||||
index_part: _index_part,
|
||||
index_part_generation: _index_part_generation,
|
||||
s3_layers,
|
||||
} = &data.blob_data
|
||||
{
|
||||
tenant_objects.push(ttid, s3_layers.clone());
|
||||
}
|
||||
tenant_timeline_results.push((ttid, data));
|
||||
}
|
||||
|
||||
if !tenant_timeline_results.is_empty() {
|
||||
analyze_tenant(
|
||||
tenant_id.expect("Must be set if results are present"),
|
||||
&mut summary,
|
||||
tenant_objects,
|
||||
tenant_timeline_results,
|
||||
);
|
||||
}
|
||||
|
||||
Ok(summary)
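The accumulate-then-flush loop above amounts to grouping a key-ordered stream by tenant id and emitting one batch per tenant. A rough standalone Python analogue (names are illustrative, not the scrubber's API):

```python
from itertools import groupby
from typing import Iterable, Tuple

def flush_per_tenant(results: Iterable[Tuple[str, dict]]):
    """Group a key-ordered stream of (tenant_id, data) pairs and yield one
    batch per tenant, mirroring the accumulate/flush pattern above."""
    for tenant_id, group in groupby(results, key=lambda item: item[0]):
        yield tenant_id, [data for _, data in group]

# Adjacent results for the same tenant end up in a single batch:
batches = list(flush_per_tenant([("t1", {}), ("t1", {}), ("t2", {})]))
assert [tenant for tenant, _ in batches] == ["t1", "t2"]
```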
|
||||
|
||||
2
scripts/sk_collect_dumps/.gitignore
vendored
2
scripts/sk_collect_dumps/.gitignore
vendored
@@ -1,2 +1,4 @@
|
||||
result
|
||||
*.json
|
||||
hosts
|
||||
poetry.lock
|
||||
|
||||
11
scripts/sk_collect_dumps/ansible.cfg
Normal file
11
scripts/sk_collect_dumps/ansible.cfg
Normal file
@@ -0,0 +1,11 @@
|
||||
[defaults]
|
||||
host_key_checking = False
|
||||
inventory=./hosts
|
||||
remote_tmp=/tmp
|
||||
remote_user=developer
|
||||
callbacks_enabled = profile_tasks
|
||||
|
||||
[ssh_connection]
|
||||
scp_if_ssh = True
|
||||
ssh_args = -F ./ssh.cfg
|
||||
pipelining = True
|
||||
16
scripts/sk_collect_dumps/pyproject.toml
Normal file
16
scripts/sk_collect_dumps/pyproject.toml
Normal file
@@ -0,0 +1,16 @@
|
||||
[tool.poetry]
|
||||
name = "sk-collect-dumps"
|
||||
version = "0.1.0"
|
||||
description = ""
|
||||
authors = ["Arseny Sher <sher-ars@yandex.ru>"]
|
||||
readme = "README.md"
|
||||
packages = [{include = "sk_collect_dumps"}]
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = "^3.11"
|
||||
ansible = "^9.1.0"
|
||||
|
||||
|
||||
[build-system]
|
||||
requires = ["poetry-core"]
|
||||
build-backend = "poetry.core.masonry.api"
|
||||
@@ -1,25 +1,43 @@
|
||||
# Collect /v1/debug_dump from all safekeeper nodes
|
||||
|
||||
1. Run ansible playbooks to collect .json dumps from all safekeepers and store them in `./result` directory.
|
||||
2. Run `DB_CONNSTR=... ./upload.sh prod_feb30` to upload dumps to `prod_feb30` table in specified postgres database.
|
||||
|
||||
## How to use ansible (staging)
|
||||
|
||||
1. Issue an admin token (add/remove .stage from the url for staging/prod and set the proper API key):
|
||||
```
|
||||
AWS_DEFAULT_PROFILE=dev ansible-playbook -i ../../.github/ansible/staging.us-east-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
|
||||
# staging:
|
||||
AUTH_TOKEN=$(curl https://console.stage.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_STAGING_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt')
|
||||
# prod:
|
||||
AUTH_TOKEN=$(curl https://console.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_PROD_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt')
|
||||
# check
|
||||
echo $AUTH_TOKEN
|
||||
```
|
||||
2. Run ansible playbooks to collect .json dumps from all safekeepers and store them in `./result` directory.
|
||||
|
||||
AWS_DEFAULT_PROFILE=dev ansible-playbook -i ../../.github/ansible/staging.eu-west-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
|
||||
There are two ways to do that: with ssm or with tsh. Using ssm:
|
||||
```
|
||||
# in aws repo, cd .github/ansible and run e.g. (adjusting profile and region in vars and limit):
|
||||
AWS_DEFAULT_PROFILE=dev ansible-playbook -i inventory_aws_ec2.yaml -i staging.us-east-2.vars.yaml -e @ssm_config -l 'safekeeper:&us_east_2' -e "auth_token=${AUTH_TOKEN}" ~/neon/neon/scripts/sk_collect_dumps/remote.yaml
|
||||
```
|
||||
It will put the results into the `.results` directory *near the playbook*.
|
||||
|
||||
tsh:
|
||||
|
||||
Update the inventory, if needed, selecting .build/.tech and optionally region:
|
||||
```
|
||||
rm -f hosts && echo '[safekeeper]' >> hosts
|
||||
# staging:
|
||||
tsh ls | awk '{print $1}' | grep safekeeper | grep "neon.build" | grep us-east-2 >> hosts
|
||||
# prod:
|
||||
tsh ls | awk '{print $1}' | grep safekeeper | grep "neon.tech" | grep us-east-2 >> hosts
|
||||
```
|
||||
|
||||
## How to use ansible (prod)
|
||||
|
||||
Test ansible connection:
|
||||
```
|
||||
AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.us-west-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
|
||||
|
||||
AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.us-east-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
|
||||
|
||||
AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.eu-central-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
|
||||
|
||||
AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.ap-southeast-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
|
||||
ansible all -m ping -v
|
||||
```
|
||||
|
||||
Download the dumps:
|
||||
```
|
||||
mkdir -p result && rm -f result/*
|
||||
ansible-playbook -e "auth_token=${AUTH_TOKEN}" remote.yaml
|
||||
```
|
||||
|
||||
3. Run `DB_CONNSTR=... ./upload.sh prod_feb30` to upload dumps to `prod_feb30` table in specified postgres database.
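For readers who prefer not to hand-edit the curl command, here is an equivalent sketched in Python; the endpoint, headers, and payload are copied from the commands above, while the function name and the way the API key is passed in are assumptions.

```python
import requests

def issue_admin_token(base_url: str, api_key: str, ttl_seconds: int = 43200) -> str:
    """Issue a short-lived admin token with the safekeeperdata scope,
    mirroring the curl invocation above."""
    resp = requests.post(
        f"{base_url}/regions/console/api/v1/admin/issue_token",
        headers={
            "Accept": "application/json",
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}",
        },
        json={"ttl_seconds": ttl_seconds, "scope": "safekeeperdata"},
    )
    resp.raise_for_status()
    return resp.json()["jwt"]

# Staging example; pass the value of NEON_STAGING_KEY as the api_key argument.
# token = issue_admin_token("https://console.stage.neon.tech", "<NEON_STAGING_KEY>")
```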
|
||||
|
||||
@@ -1,18 +1,37 @@
|
||||
- name: Fetch state dumps from safekeepers
|
||||
hosts: safekeepers
|
||||
hosts: safekeeper
|
||||
gather_facts: False
|
||||
remote_user: "{{ remote_user }}"
|
||||
|
||||
tasks:
|
||||
- name: Download file
|
||||
- name: Dump file
|
||||
get_url:
|
||||
url: "http://{{ inventory_hostname }}:7676/v1/debug_dump?dump_all=true&dump_disk_content=false"
|
||||
dest: "/tmp/{{ inventory_hostname }}.json"
|
||||
dest: "/tmp/{{ inventory_hostname }}-dump.json"
|
||||
headers:
|
||||
Authorization: "Bearer {{ auth_token }}"
|
||||
|
||||
- name: Fetch file from remote hosts
|
||||
- name: install rsync
|
||||
ansible.builtin.apt:
|
||||
name: rsync
|
||||
update_cache: yes
|
||||
become: yes
|
||||
ignore_errors: true # it may already be installed and we don't always have sudo
|
||||
|
||||
- name: Fetch file from remote hosts (works only with ssm)
|
||||
fetch:
|
||||
src: "/tmp/{{ inventory_hostname }}.json"
|
||||
dest: "./result/{{ inventory_hostname }}.json"
|
||||
src: "/tmp/{{ inventory_hostname }}-dump.json"
|
||||
dest: "./result/{{ inventory_hostname }}-dump.json"
|
||||
flat: yes
|
||||
fail_on_missing: no
|
||||
when: ansible_connection == "aws_ssm"
|
||||
|
||||
# xxx not sure how to make ansible 'synchronize' work with tsh
|
||||
- name: Fetch file from remote hosts
|
||||
shell: rsync -e 'tsh ssh' -azvP "developer@{{ inventory_hostname }}:/tmp/{{ inventory_hostname }}-dump.json" "./result/{{ inventory_hostname }}-dump.json"
|
||||
delegate_to: localhost
|
||||
when: ansible_connection != "aws_ssm"
|
||||
|
||||
- name: remove remote dumps
|
||||
ansible.builtin.file:
|
||||
path: "/tmp/{{ inventory_hostname }}-dump.json"
|
||||
state: absent
|
||||
|
||||
13
scripts/sk_collect_dumps/ssh.cfg
Normal file
13
scripts/sk_collect_dumps/ssh.cfg
Normal file
@@ -0,0 +1,13 @@
|
||||
# Begin generated Teleport configuration for teleport.aws.neon.tech by tsh
|
||||
|
||||
# Common flags for all teleport.aws.neon.tech hosts
|
||||
Host *
|
||||
HostKeyAlgorithms rsa-sha2-512-cert-v01@openssh.com,rsa-sha2-256-cert-v01@openssh.com,ssh-rsa-cert-v01@openssh.com
|
||||
|
||||
# Flags for all teleport.aws.neon.tech hosts except the proxy
|
||||
Host * !teleport.aws.neon.tech
|
||||
Port 3022
|
||||
ProxyCommand "/usr/local/bin/tsh" proxy ssh --cluster=teleport.aws.neon.tech --proxy=teleport.aws.neon.tech:443 %r@%h:%p
|
||||
User developer
|
||||
|
||||
# End generated Teleport configuration
|
||||
@@ -31,22 +31,22 @@ SELECT
|
||||
(data->>'tenant_id') AS tenant_id,
|
||||
(data->>'timeline_id') AS timeline_id,
|
||||
(data->'memory'->>'active')::bool AS active,
|
||||
(data->'memory'->>'flush_lsn')::bigint AS flush_lsn,
|
||||
(data->'memory'->'mem_state'->>'backup_lsn')::bigint AS backup_lsn,
|
||||
(data->'memory'->'mem_state'->>'commit_lsn')::bigint AS commit_lsn,
|
||||
(data->'memory'->'mem_state'->>'peer_horizon_lsn')::bigint AS peer_horizon_lsn,
|
||||
(data->'memory'->'mem_state'->>'remote_consistent_lsn')::bigint AS remote_consistent_lsn,
|
||||
(data->'memory'->>'write_lsn')::bigint AS write_lsn,
|
||||
(data->'memory'->>'flush_lsn')::pg_lsn AS flush_lsn,
|
||||
(data->'memory'->'mem_state'->>'backup_lsn')::pg_lsn AS backup_lsn,
|
||||
(data->'memory'->'mem_state'->>'commit_lsn')::pg_lsn AS commit_lsn,
|
||||
(data->'memory'->'mem_state'->>'peer_horizon_lsn')::pg_lsn AS peer_horizon_lsn,
|
||||
(data->'memory'->'mem_state'->>'remote_consistent_lsn')::pg_lsn AS remote_consistent_lsn,
|
||||
(data->'memory'->>'write_lsn')::pg_lsn AS write_lsn,
|
||||
(data->'memory'->>'num_computes')::bigint AS num_computes,
|
||||
(data->'memory'->>'epoch_start_lsn')::bigint AS epoch_start_lsn,
|
||||
(data->'memory'->>'epoch_start_lsn')::pg_lsn AS epoch_start_lsn,
|
||||
(data->'memory'->>'last_removed_segno')::bigint AS last_removed_segno,
|
||||
(data->'memory'->>'is_cancelled')::bool AS is_cancelled,
|
||||
(data->'control_file'->>'backup_lsn')::bigint AS disk_backup_lsn,
|
||||
(data->'control_file'->>'commit_lsn')::bigint AS disk_commit_lsn,
|
||||
(data->'control_file'->>'backup_lsn')::pg_lsn AS disk_backup_lsn,
|
||||
(data->'control_file'->>'commit_lsn')::pg_lsn AS disk_commit_lsn,
|
||||
(data->'control_file'->'acceptor_state'->>'term')::bigint AS disk_term,
|
||||
(data->'control_file'->>'local_start_lsn')::bigint AS local_start_lsn,
|
||||
(data->'control_file'->>'peer_horizon_lsn')::bigint AS disk_peer_horizon_lsn,
|
||||
(data->'control_file'->>'timeline_start_lsn')::bigint AS timeline_start_lsn,
|
||||
(data->'control_file'->>'remote_consistent_lsn')::bigint AS disk_remote_consistent_lsn
|
||||
(data->'control_file'->>'local_start_lsn')::pg_lsn AS local_start_lsn,
|
||||
(data->'control_file'->>'peer_horizon_lsn')::pg_lsn AS disk_peer_horizon_lsn,
|
||||
(data->'control_file'->>'timeline_start_lsn')::pg_lsn AS timeline_start_lsn,
|
||||
(data->'control_file'->>'remote_consistent_lsn')::pg_lsn AS disk_remote_consistent_lsn
|
||||
FROM tmp_json
|
||||
EOF
|
||||
|
||||
@@ -3,9 +3,12 @@ use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use clap::Parser;
|
||||
use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey;
|
||||
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
|
||||
use storage_broker::proto::{SafekeeperTimelineInfo, SubscribeSafekeeperInfoRequest};
|
||||
|
||||
use storage_broker::proto::SafekeeperTimelineInfo;
|
||||
use storage_broker::proto::{
|
||||
FilterTenantTimelineId, MessageType, SubscribeByFilterRequest,
|
||||
TenantTimelineId as ProtoTenantTimelineId, TypeSubscription, TypedMessage,
|
||||
};
|
||||
|
||||
use storage_broker::{BrokerClientChannel, DEFAULT_ENDPOINT};
|
||||
use tokio::time;
|
||||
@@ -91,15 +94,23 @@ async fn subscribe(client: Option<BrokerClientChannel>, counter: Arc<AtomicU64>,
|
||||
None => storage_broker::connect(DEFAULT_ENDPOINT, Duration::from_secs(5)).unwrap(),
|
||||
};
|
||||
|
||||
let key = SubscriptionKey::TenantTimelineId(ProtoTenantTimelineId {
|
||||
let ttid = ProtoTenantTimelineId {
|
||||
tenant_id: vec![0xFF; 16],
|
||||
timeline_id: tli_from_u64(i),
|
||||
});
|
||||
let request = SubscribeSafekeeperInfoRequest {
|
||||
subscription_key: Some(key),
|
||||
};
|
||||
let mut stream = client
|
||||
.subscribe_safekeeper_info(request)
|
||||
|
||||
let request = SubscribeByFilterRequest {
|
||||
types: vec![TypeSubscription {
|
||||
r#type: MessageType::SafekeeperTimelineInfo.into(),
|
||||
}],
|
||||
tenant_timeline_id: Some(FilterTenantTimelineId {
|
||||
enabled: true,
|
||||
tenant_timeline_id: Some(ttid),
|
||||
}),
|
||||
};
|
||||
|
||||
let mut stream: tonic::Streaming<TypedMessage> = client
|
||||
.subscribe_by_filter(request)
|
||||
.await
|
||||
.unwrap()
|
||||
.into_inner();
|
||||
|
||||
@@ -10,6 +10,12 @@ service BrokerService {
|
||||
|
||||
// Publish safekeeper updates.
|
||||
rpc PublishSafekeeperInfo(stream SafekeeperTimelineInfo) returns (google.protobuf.Empty) {};
|
||||
|
||||
// Subscribe to all messages, limited by a filter.
|
||||
rpc SubscribeByFilter(SubscribeByFilterRequest) returns (stream TypedMessage) {};
|
||||
|
||||
// Publish one message.
|
||||
rpc PublishOne(TypedMessage) returns (google.protobuf.Empty) {};
|
||||
}
|
||||
|
||||
message SubscribeSafekeeperInfoRequest {
|
||||
@@ -48,3 +54,55 @@ message TenantTimelineId {
|
||||
bytes tenant_id = 1;
|
||||
bytes timeline_id = 2;
|
||||
}
|
||||
|
||||
message FilterTenantTimelineId {
|
||||
// If true, only messages related to `tenant_timeline_id` will be emitted.
|
||||
// Otherwise, messages for all timelines will be emitted.
|
||||
bool enabled = 1;
|
||||
TenantTimelineId tenant_timeline_id = 2;
|
||||
}
|
||||
|
||||
message TypeSubscription {
|
||||
MessageType type = 1;
|
||||
}
|
||||
|
||||
message SubscribeByFilterRequest {
|
||||
// Subscription will emit messages only of the specified types. You need to specify
|
||||
// at least one type to receive any messages.
|
||||
repeated TypeSubscription types = 1;
|
||||
|
||||
// If set and enabled, subscription will emit messages only for the specified tenant/timeline.
|
||||
optional FilterTenantTimelineId tenant_timeline_id = 2;
|
||||
}
|
||||
|
||||
enum MessageType {
|
||||
UNKNOWN = 0;
|
||||
SAFEKEEPER_TIMELINE_INFO = 2;
|
||||
SAFEKEEPER_DISCOVERY_REQUEST = 3;
|
||||
SAFEKEEPER_DISCOVERY_RESPONSE = 4;
|
||||
}
|
||||
|
||||
// A message with a type.
|
||||
message TypedMessage {
|
||||
MessageType type = 1;
|
||||
|
||||
optional SafekeeperTimelineInfo safekeeper_timeline_info = 2;
|
||||
optional SafekeeperDiscoveryRequest safekeeper_discovery_request = 3;
|
||||
optional SafekeeperDiscoveryResponse safekeeper_discovery_response = 4;
|
||||
}
|
||||
|
||||
message SafekeeperDiscoveryRequest {
|
||||
TenantTimelineId tenant_timeline_id = 1;
|
||||
}
|
||||
|
||||
// Shorter version of SafekeeperTimelineInfo, contains only necessary fields.
|
||||
message SafekeeperDiscoveryResponse {
|
||||
uint64 safekeeper_id = 1;
|
||||
TenantTimelineId tenant_timeline_id = 2;
|
||||
// WAL available to download.
|
||||
uint64 commit_lsn = 3;
|
||||
// A connection string to use for WAL downloading.
|
||||
string safekeeper_connstr = 4;
|
||||
// Availability zone of a safekeeper.
|
||||
optional string availability_zone = 5;
|
||||
}
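The comments on the new messages above describe the subscription semantics: a subscriber lists the message types it wants and may optionally restrict delivery to one tenant/timeline. A plain-Python sketch of that matching rule (illustrative classes, not generated protobuf code):

```python
from dataclasses import dataclass
from typing import Optional, Set, Tuple

TenantTimelineId = Tuple[bytes, bytes]  # (tenant_id, timeline_id)

@dataclass
class Filter:
    types: Set[str]                                 # e.g. {"SAFEKEEPER_TIMELINE_INFO"}
    tenant_timeline_id: Optional[TenantTimelineId]  # None means: all timelines

def matches(f: Filter, msg_type: str, msg_ttid: Optional[TenantTimelineId]) -> bool:
    """Deliver a message only if its type was subscribed to and, when a
    timeline filter is set, only if it carries the same tenant/timeline id."""
    if msg_type not in f.types:
        return False
    if f.tenant_timeline_id is not None and msg_ttid != f.tenant_timeline_id:
        return False
    return True

# With at least one type and no timeline filter, messages for all timelines pass.
assert matches(Filter({"SAFEKEEPER_TIMELINE_INFO"}, None),
               "SAFEKEEPER_TIMELINE_INFO", (b"tenant", b"timeline"))
```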
|
||||
|
||||
@@ -35,10 +35,16 @@ use tracing::*;
|
||||
use utils::signals::ShutdownSignals;
|
||||
|
||||
use metrics::{Encoder, TextEncoder};
|
||||
use storage_broker::metrics::{NUM_PUBS, NUM_SUBS_ALL, NUM_SUBS_TIMELINE};
|
||||
use storage_broker::metrics::{
|
||||
BROADCASTED_MESSAGES_TOTAL, BROADCAST_DROPPED_MESSAGES_TOTAL, NUM_PUBS, NUM_SUBS_ALL,
|
||||
NUM_SUBS_TIMELINE, PROCESSED_MESSAGES_TOTAL, PUBLISHED_ONEOFF_MESSAGES_TOTAL,
|
||||
};
|
||||
use storage_broker::proto::broker_service_server::{BrokerService, BrokerServiceServer};
|
||||
use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey;
|
||||
use storage_broker::proto::{SafekeeperTimelineInfo, SubscribeSafekeeperInfoRequest};
|
||||
use storage_broker::proto::{
|
||||
FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse,
|
||||
SafekeeperTimelineInfo, SubscribeByFilterRequest, SubscribeSafekeeperInfoRequest, TypedMessage,
|
||||
};
|
||||
use storage_broker::{
|
||||
parse_proto_ttid, EitherBody, DEFAULT_KEEPALIVE_INTERVAL, DEFAULT_LISTEN_ADDR,
|
||||
};
|
||||
@@ -73,8 +79,103 @@ struct Args {
|
||||
log_format: String,
|
||||
}
|
||||
|
||||
type PubId = u64; // id of publisher for registering in maps
|
||||
type SubId = u64; // id of subscriber for registering in maps
|
||||
/// Id of publisher for registering in maps
|
||||
type PubId = u64;
|
||||
|
||||
/// Id of subscriber for registering in maps
|
||||
type SubId = u64;
|
||||
|
||||
/// Single enum type for all messages.
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
#[allow(clippy::enum_variant_names)]
|
||||
enum Message {
|
||||
SafekeeperTimelineInfo(SafekeeperTimelineInfo),
|
||||
SafekeeperDiscoveryRequest(SafekeeperDiscoveryRequest),
|
||||
SafekeeperDiscoveryResponse(SafekeeperDiscoveryResponse),
|
||||
}
|
||||
|
||||
impl Message {
|
||||
/// Convert proto message to internal message.
|
||||
pub fn from(proto_msg: TypedMessage) -> Result<Self, Status> {
|
||||
match proto_msg.r#type() {
|
||||
MessageType::SafekeeperTimelineInfo => Ok(Message::SafekeeperTimelineInfo(
|
||||
proto_msg.safekeeper_timeline_info.ok_or_else(|| {
|
||||
Status::new(Code::InvalidArgument, "missing safekeeper_timeline_info")
|
||||
})?,
|
||||
)),
|
||||
MessageType::SafekeeperDiscoveryRequest => Ok(Message::SafekeeperDiscoveryRequest(
|
||||
proto_msg.safekeeper_discovery_request.ok_or_else(|| {
|
||||
Status::new(
|
||||
Code::InvalidArgument,
|
||||
"missing safekeeper_discovery_request",
|
||||
)
|
||||
})?,
|
||||
)),
|
||||
MessageType::SafekeeperDiscoveryResponse => Ok(Message::SafekeeperDiscoveryResponse(
|
||||
proto_msg.safekeeper_discovery_response.ok_or_else(|| {
|
||||
Status::new(
|
||||
Code::InvalidArgument,
|
||||
"missing safekeeper_discovery_response",
|
||||
)
|
||||
})?,
|
||||
)),
|
||||
MessageType::Unknown => Err(Status::new(
|
||||
Code::InvalidArgument,
|
||||
format!("invalid message type: {:?}", proto_msg.r#type),
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the tenant_timeline_id from the message.
|
||||
pub fn tenant_timeline_id(&self) -> Result<Option<TenantTimelineId>, Status> {
|
||||
match self {
|
||||
Message::SafekeeperTimelineInfo(msg) => Ok(msg
|
||||
.tenant_timeline_id
|
||||
.as_ref()
|
||||
.map(parse_proto_ttid)
|
||||
.transpose()?),
|
||||
Message::SafekeeperDiscoveryRequest(msg) => Ok(msg
|
||||
.tenant_timeline_id
|
||||
.as_ref()
|
||||
.map(parse_proto_ttid)
|
||||
.transpose()?),
|
||||
Message::SafekeeperDiscoveryResponse(msg) => Ok(msg
|
||||
.tenant_timeline_id
|
||||
.as_ref()
|
||||
.map(parse_proto_ttid)
|
||||
.transpose()?),
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert internal message to the protobuf struct.
|
||||
pub fn as_typed_message(&self) -> TypedMessage {
|
||||
let mut res = TypedMessage {
|
||||
r#type: self.message_type() as i32,
|
||||
..Default::default()
|
||||
};
|
||||
match self {
|
||||
Message::SafekeeperTimelineInfo(msg) => {
|
||||
res.safekeeper_timeline_info = Some(msg.clone())
|
||||
}
|
||||
Message::SafekeeperDiscoveryRequest(msg) => {
|
||||
res.safekeeper_discovery_request = Some(msg.clone())
|
||||
}
|
||||
Message::SafekeeperDiscoveryResponse(msg) => {
|
||||
res.safekeeper_discovery_response = Some(msg.clone())
|
||||
}
|
||||
}
|
||||
res
|
||||
}
|
||||
|
||||
/// Get the message type.
|
||||
pub fn message_type(&self) -> MessageType {
|
||||
match self {
|
||||
Message::SafekeeperTimelineInfo(_) => MessageType::SafekeeperTimelineInfo,
|
||||
Message::SafekeeperDiscoveryRequest(_) => MessageType::SafekeeperDiscoveryRequest,
|
||||
Message::SafekeeperDiscoveryResponse(_) => MessageType::SafekeeperDiscoveryResponse,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
enum SubscriptionKey {
|
||||
@@ -83,7 +184,7 @@ enum SubscriptionKey {
|
||||
}
|
||||
|
||||
impl SubscriptionKey {
|
||||
// Parse protobuf subkey (protobuf doesn't have fixed size bytes, we get vectors).
|
||||
/// Parse protobuf subkey (protobuf doesn't have fixed size bytes, we get vectors).
|
||||
pub fn from_proto_subscription_key(key: ProtoSubscriptionKey) -> Result<Self, Status> {
|
||||
match key {
|
||||
ProtoSubscriptionKey::All(_) => Ok(SubscriptionKey::All),
|
||||
@@ -92,14 +193,29 @@ impl SubscriptionKey {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Parse from FilterTenantTimelineId
|
||||
pub fn from_proto_filter_tenant_timeline_id(
|
||||
f: &FilterTenantTimelineId,
|
||||
) -> Result<Self, Status> {
|
||||
if !f.enabled {
|
||||
return Ok(SubscriptionKey::All);
|
||||
}
|
||||
|
||||
let ttid =
|
||||
parse_proto_ttid(f.tenant_timeline_id.as_ref().ok_or_else(|| {
|
||||
Status::new(Code::InvalidArgument, "missing tenant_timeline_id")
|
||||
})?)?;
|
||||
Ok(SubscriptionKey::Timeline(ttid))
|
||||
}
|
||||
}
|
||||
|
||||
// Channel to timeline subscribers.
|
||||
/// Channel to timeline subscribers.
|
||||
struct ChanToTimelineSub {
|
||||
chan: broadcast::Sender<SafekeeperTimelineInfo>,
|
||||
// Tracked separately to know when delete the shmem entry. receiver_count()
|
||||
// is unhandy for that as unregistering and dropping the receiver side
|
||||
// happens at different moments.
|
||||
chan: broadcast::Sender<Message>,
|
||||
/// Tracked separately to know when to delete the shmem entry. receiver_count()
|
||||
/// is inconvenient for that, as unregistering and dropping the receiver side
|
||||
/// happens at different moments.
|
||||
num_subscribers: u64,
|
||||
}
|
||||
|
||||
@@ -110,7 +226,7 @@ struct SharedState {
|
||||
num_subs_to_timelines: i64,
|
||||
chans_to_timeline_subs: HashMap<TenantTimelineId, ChanToTimelineSub>,
|
||||
num_subs_to_all: i64,
|
||||
chan_to_all_subs: broadcast::Sender<SafekeeperTimelineInfo>,
|
||||
chan_to_all_subs: broadcast::Sender<Message>,
|
||||
}
|
||||
|
||||
impl SharedState {
|
||||
@@ -146,7 +262,7 @@ impl SharedState {
|
||||
&mut self,
|
||||
sub_key: SubscriptionKey,
|
||||
timeline_chan_size: usize,
|
||||
) -> (SubId, broadcast::Receiver<SafekeeperTimelineInfo>) {
|
||||
) -> (SubId, broadcast::Receiver<Message>) {
|
||||
let sub_id = self.next_sub_id;
|
||||
self.next_sub_id += 1;
|
||||
let sub_rx = match sub_key {
|
||||
@@ -262,6 +378,29 @@ impl Registry {
|
||||
subscriber.id, subscriber.key, subscriber.remote_addr
|
||||
);
|
||||
}
|
||||
|
||||
/// Send msg to relevant subscribers.
|
||||
pub fn send_msg(&self, msg: &Message) -> Result<(), Status> {
|
||||
PROCESSED_MESSAGES_TOTAL.inc();
|
||||
|
||||
// send message to subscribers for everything
|
||||
let shared_state = self.shared_state.read();
|
||||
// Err means there are no subscribers; that is fine.
|
||||
shared_state.chan_to_all_subs.send(msg.clone()).ok();
|
||||
|
||||
// send message to per-timeline subscribers, if there is a ttid
|
||||
let ttid = msg.tenant_timeline_id()?;
|
||||
if let Some(ttid) = ttid {
|
||||
if let Some(subs) = shared_state.chans_to_timeline_subs.get(&ttid) {
|
||||
// Err can't happen here, as tx is destroyed only after removing
|
||||
// from the map the last subscriber along with tx.
|
||||
subs.chan
|
||||
.send(msg.clone())
|
||||
.expect("rx is still in the map with zero subscribers");
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
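The new Registry::send_msg above fans a message out twice: to subscribers of everything, and to subscribers of the message's specific timeline if it carries one. A toy Python version of that shape, using plain queues instead of tokio broadcast channels (all names are illustrative):

```python
import queue
from collections import defaultdict
from typing import Optional

class ToyRegistry:
    """Minimal fan-out registry: one list of queues for 'all' subscribers and
    one list per tenant/timeline id, mirroring the routing sketched above."""

    def __init__(self) -> None:
        self.all_subs = []                      # queues subscribed to everything
        self.timeline_subs = defaultdict(list)  # ttid -> queues

    def send_msg(self, msg: dict, ttid: Optional[str]) -> None:
        for q in self.all_subs:
            q.put(msg)
        if ttid is not None:
            for q in self.timeline_subs.get(ttid, []):
                q.put(msg)

# A per-timeline subscriber only sees messages for its own timeline.
reg = ToyRegistry()
q = queue.Queue()
reg.timeline_subs["tenant-a/tl-1"].append(q)
reg.send_msg({"kind": "info"}, "tenant-a/tl-1")
reg.send_msg({"kind": "info"}, "tenant-b/tl-9")
assert q.qsize() == 1
```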
|
||||
|
||||
// Private subscriber state.
|
||||
@@ -269,7 +408,7 @@ struct Subscriber {
|
||||
id: SubId,
|
||||
key: SubscriptionKey,
|
||||
// Subscriber receives messages from publishers here.
|
||||
sub_rx: broadcast::Receiver<SafekeeperTimelineInfo>,
|
||||
sub_rx: broadcast::Receiver<Message>,
|
||||
// to unregister itself from shared state in Drop
|
||||
registry: Registry,
|
||||
// for logging
|
||||
@@ -291,26 +430,9 @@ struct Publisher {
|
||||
}
|
||||
|
||||
impl Publisher {
|
||||
// Send msg to relevant subscribers.
|
||||
pub fn send_msg(&mut self, msg: &SafekeeperTimelineInfo) -> Result<(), Status> {
|
||||
// send message to subscribers for everything
|
||||
let shared_state = self.registry.shared_state.read();
|
||||
// Err means there is no subscribers, it is fine.
|
||||
shared_state.chan_to_all_subs.send(msg.clone()).ok();
|
||||
|
||||
// send message to per timeline subscribers
|
||||
let ttid =
|
||||
parse_proto_ttid(msg.tenant_timeline_id.as_ref().ok_or_else(|| {
|
||||
Status::new(Code::InvalidArgument, "missing tenant_timeline_id")
|
||||
})?)?;
|
||||
if let Some(subs) = shared_state.chans_to_timeline_subs.get(&ttid) {
|
||||
// Err can't happen here, as tx is destroyed only after removing
|
||||
// from the map the last subscriber along with tx.
|
||||
subs.chan
|
||||
.send(msg.clone())
|
||||
.expect("rx is still in the map with zero subscribers");
|
||||
}
|
||||
Ok(())
|
||||
/// Send msg to relevant subscribers.
|
||||
pub fn send_msg(&mut self, msg: &Message) -> Result<(), Status> {
|
||||
self.registry.send_msg(msg)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -339,7 +461,7 @@ impl BrokerService for Broker {
|
||||
|
||||
loop {
|
||||
match stream.next().await {
|
||||
Some(Ok(msg)) => publisher.send_msg(&msg)?,
|
||||
Some(Ok(msg)) => publisher.send_msg(&Message::SafekeeperTimelineInfo(msg))?,
|
||||
Some(Err(e)) => return Err(e), // grpc error from the stream
|
||||
None => break, // closed stream
|
||||
}
|
||||
@@ -371,8 +493,15 @@ impl BrokerService for Broker {
|
||||
let mut missed_msgs: u64 = 0;
|
||||
loop {
|
||||
match subscriber.sub_rx.recv().await {
|
||||
Ok(info) => yield info,
|
||||
Ok(info) => {
|
||||
match info {
|
||||
Message::SafekeeperTimelineInfo(info) => yield info,
|
||||
_ => {},
|
||||
}
|
||||
BROADCASTED_MESSAGES_TOTAL.inc();
|
||||
},
|
||||
Err(RecvError::Lagged(skipped_msg)) => {
|
||||
BROADCAST_DROPPED_MESSAGES_TOTAL.inc_by(skipped_msg);
|
||||
missed_msgs += skipped_msg;
|
||||
if (futures::poll!(Box::pin(warn_interval.tick()))).is_ready() {
|
||||
warn!("subscription id={}, key={:?} addr={:?} dropped {} messages, channel is full",
|
||||
@@ -392,6 +521,78 @@ impl BrokerService for Broker {
|
||||
Box::pin(output) as Self::SubscribeSafekeeperInfoStream
|
||||
))
|
||||
}
|
||||
|
||||
type SubscribeByFilterStream =
|
||||
Pin<Box<dyn Stream<Item = Result<TypedMessage, Status>> + Send + 'static>>;
|
||||
|
||||
/// Subscribe to all messages, limited by a filter.
|
||||
async fn subscribe_by_filter(
|
||||
&self,
|
||||
request: Request<SubscribeByFilterRequest>,
|
||||
) -> std::result::Result<Response<Self::SubscribeByFilterStream>, Status> {
|
||||
let remote_addr = request
|
||||
.remote_addr()
|
||||
.expect("TCPConnectInfo inserted by handler");
|
||||
let proto_filter = request.into_inner();
|
||||
let ttid_filter = proto_filter
|
||||
.tenant_timeline_id
|
||||
.as_ref()
|
||||
.ok_or_else(|| Status::new(Code::InvalidArgument, "missing tenant_timeline_id"))?;
|
||||
|
||||
let sub_key = SubscriptionKey::from_proto_filter_tenant_timeline_id(ttid_filter)?;
|
||||
let types_set = proto_filter
|
||||
.types
|
||||
.iter()
|
||||
.map(|t| t.r#type)
|
||||
.collect::<std::collections::HashSet<_>>();
|
||||
|
||||
let mut subscriber = self.registry.register_subscriber(sub_key, remote_addr);
|
||||
|
||||
// transform rx into stream with item = Result, as method result demands
|
||||
let output = async_stream::try_stream! {
|
||||
let mut warn_interval = time::interval(Duration::from_millis(1000));
|
||||
let mut missed_msgs: u64 = 0;
|
||||
loop {
|
||||
match subscriber.sub_rx.recv().await {
|
||||
Ok(msg) => {
|
||||
let msg_type = msg.message_type() as i32;
|
||||
if types_set.contains(&msg_type) {
|
||||
yield msg.as_typed_message();
|
||||
BROADCASTED_MESSAGES_TOTAL.inc();
|
||||
}
|
||||
},
|
||||
Err(RecvError::Lagged(skipped_msg)) => {
|
||||
BROADCAST_DROPPED_MESSAGES_TOTAL.inc_by(skipped_msg);
|
||||
missed_msgs += skipped_msg;
|
||||
if (futures::poll!(Box::pin(warn_interval.tick()))).is_ready() {
|
||||
warn!("subscription id={}, key={:?} addr={:?} dropped {} messages, channel is full",
|
||||
subscriber.id, subscriber.key, subscriber.remote_addr, missed_msgs);
|
||||
missed_msgs = 0;
|
||||
}
|
||||
}
|
||||
Err(RecvError::Closed) => {
|
||||
// can't happen, we never drop the channel while there is a subscriber
|
||||
Err(Status::new(Code::Internal, "channel unexpectedly closed"))?;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
Ok(Response::new(
|
||||
Box::pin(output) as Self::SubscribeByFilterStream
|
||||
))
|
||||
}
|
||||
|
||||
/// Publish one message.
|
||||
async fn publish_one(
|
||||
&self,
|
||||
request: Request<TypedMessage>,
|
||||
) -> std::result::Result<Response<()>, Status> {
|
||||
let msg = Message::from(request.into_inner())?;
|
||||
PUBLISHED_ONEOFF_MESSAGES_TOTAL.inc();
|
||||
self.registry.send_msg(&msg)?;
|
||||
Ok(Response::new(()))
|
||||
}
|
||||
}
|
||||
|
||||
// We serve only metrics and healthcheck through http1.
|
||||
@@ -515,8 +716,8 @@ mod tests {
|
||||
use tokio::sync::broadcast::error::TryRecvError;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
fn msg(timeline_id: Vec<u8>) -> SafekeeperTimelineInfo {
|
||||
SafekeeperTimelineInfo {
|
||||
fn msg(timeline_id: Vec<u8>) -> Message {
|
||||
Message::SafekeeperTimelineInfo(SafekeeperTimelineInfo {
|
||||
safekeeper_id: 1,
|
||||
tenant_timeline_id: Some(ProtoTenantTimelineId {
|
||||
tenant_id: vec![0x00; 16],
|
||||
@@ -533,7 +734,7 @@ mod tests {
|
||||
http_connstr: "neon-1-sk-1.local:7677".to_owned(),
|
||||
local_start_lsn: 0,
|
||||
availability_zone: None,
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
fn tli_from_u64(i: u64) -> Vec<u8> {
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
//! Broker metrics.
|
||||
|
||||
use metrics::{register_int_gauge, IntGauge};
|
||||
use metrics::{register_int_counter, register_int_gauge, IntCounter, IntGauge};
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
pub static NUM_PUBS: Lazy<IntGauge> = Lazy::new(|| {
|
||||
@@ -23,3 +23,35 @@ pub static NUM_SUBS_ALL: Lazy<IntGauge> = Lazy::new(|| {
|
||||
)
|
||||
.expect("Failed to register metric")
|
||||
});
|
||||
|
||||
pub static PROCESSED_MESSAGES_TOTAL: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"storage_broker_processed_messages_total",
|
||||
"Number of messages received by storage broker, before routing and broadcasting"
|
||||
)
|
||||
.expect("Failed to register metric")
|
||||
});
|
||||
|
||||
pub static BROADCASTED_MESSAGES_TOTAL: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"storage_broker_broadcasted_messages_total",
|
||||
"Number of messages broadcasted (sent over network) to subscribers"
|
||||
)
|
||||
.expect("Failed to register metric")
|
||||
});
|
||||
|
||||
pub static BROADCAST_DROPPED_MESSAGES_TOTAL: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"storage_broker_broadcast_dropped_messages_total",
|
||||
"Number of messages dropped due to channel capacity overflow"
|
||||
)
|
||||
.expect("Failed to register metric")
|
||||
});
|
||||
|
||||
pub static PUBLISHED_ONEOFF_MESSAGES_TOTAL: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"storage_broker_published_oneoff_messages_total",
|
||||
"Number of one-off messages sent via PublishOne method"
|
||||
)
|
||||
.expect("Failed to register metric")
|
||||
});
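The counters registered above are exposed through the broker's Prometheus /metrics endpoint; a small scraping sketch (the URL is a placeholder, not taken from this diff):

```python
import re
import urllib.request

def read_counter(metrics_url: str, name: str) -> float:
    """Read one counter value from a Prometheus text-format /metrics page."""
    body = urllib.request.urlopen(metrics_url).read().decode()
    match = re.search(rf"^{name}(?:{{[^}}]*}})? (\S+)$", body, re.MULTILINE)
    if match is None:
        raise KeyError(name)
    return float(match.group(1))

# Hypothetical local broker address:
# read_counter("http://127.0.0.1:50051/metrics", "storage_broker_broadcasted_messages_total")
```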
|
||||
|
||||
@@ -365,6 +365,12 @@ class PgProtocol:
|
||||
result.append(cur.fetchall())
|
||||
return result
|
||||
|
||||
def safe_psql_scalar(self, query) -> Any:
|
||||
"""
|
||||
Execute query returning single row with single column.
|
||||
"""
|
||||
return self.safe_psql(query)[0][0]
|
||||
|
||||
|
||||
@dataclass
|
||||
class AuthKeys:
|
||||
@@ -457,7 +463,6 @@ class NeonEnvBuilder:
|
||||
self.preserve_database_files = preserve_database_files
|
||||
self.initial_tenant = initial_tenant or TenantId.generate()
|
||||
self.initial_timeline = initial_timeline or TimelineId.generate()
|
||||
self.enable_generations = True
|
||||
self.scrub_on_exit = False
|
||||
self.test_output_dir = test_output_dir
|
||||
|
||||
@@ -677,8 +682,7 @@ class NeonEnvBuilder:
|
||||
|
||||
pageserver.stop(immediate=True)
|
||||
|
||||
if self.env.attachment_service is not None:
|
||||
self.env.attachment_service.stop(immediate=True)
|
||||
self.env.attachment_service.stop(immediate=True)
|
||||
|
||||
cleanup_error = None
|
||||
|
||||
@@ -772,13 +776,9 @@ class NeonEnv:
|
||||
self.initial_tenant = config.initial_tenant
|
||||
self.initial_timeline = config.initial_timeline
|
||||
|
||||
if config.enable_generations:
|
||||
attachment_service_port = self.port_distributor.get_port()
|
||||
self.control_plane_api: Optional[str] = f"http://127.0.0.1:{attachment_service_port}"
|
||||
self.attachment_service: Optional[NeonAttachmentService] = NeonAttachmentService(self)
|
||||
else:
|
||||
self.control_plane_api = None
|
||||
self.attachment_service = None
|
||||
attachment_service_port = self.port_distributor.get_port()
|
||||
self.control_plane_api: str = f"http://127.0.0.1:{attachment_service_port}"
|
||||
self.attachment_service: NeonAttachmentService = NeonAttachmentService(self)
|
||||
|
||||
# Create a config file corresponding to the options
|
||||
cfg: Dict[str, Any] = {
|
||||
@@ -851,8 +851,7 @@ class NeonEnv:
|
||||
# Start up broker, pageserver and all safekeepers
|
||||
self.broker.try_start()
|
||||
|
||||
if self.attachment_service is not None:
|
||||
self.attachment_service.start()
|
||||
self.attachment_service.start()
|
||||
|
||||
for pageserver in self.pageservers:
|
||||
pageserver.start()
|
||||
@@ -1834,20 +1833,19 @@ class NeonPageserver(PgProtocol):
|
||||
"""
|
||||
client = self.http_client()
|
||||
return client.tenant_attach(
|
||||
tenant_id, config, config_null, generation=self.maybe_get_generation(tenant_id)
|
||||
tenant_id,
|
||||
config,
|
||||
config_null,
|
||||
generation=self.env.attachment_service.attach_hook_issue(tenant_id, self.id),
|
||||
)
|
||||
|
||||
def tenant_detach(self, tenant_id: TenantId):
|
||||
if self.env.attachment_service is not None:
|
||||
self.env.attachment_service.attach_hook_drop(tenant_id)
|
||||
self.env.attachment_service.attach_hook_drop(tenant_id)
|
||||
|
||||
client = self.http_client()
|
||||
return client.tenant_detach(tenant_id)
|
||||
|
||||
def tenant_location_configure(self, tenant_id: TenantId, config: dict[str, Any], **kwargs):
|
||||
# This API is only for use when generations are enabled
|
||||
assert self.env.attachment_service is not None
|
||||
|
||||
if config["mode"].startswith("Attached") and "generation" not in config:
|
||||
config["generation"] = self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
|
||||
|
||||
@@ -1873,26 +1871,15 @@ class NeonPageserver(PgProtocol):
|
||||
generation: Optional[int] = None,
|
||||
) -> TenantId:
|
||||
if generation is None:
|
||||
generation = self.maybe_get_generation(tenant_id)
|
||||
generation = self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
|
||||
client = self.http_client(auth_token=auth_token)
|
||||
return client.tenant_create(tenant_id, conf, generation=generation)
|
||||
|
||||
def tenant_load(self, tenant_id: TenantId):
|
||||
client = self.http_client()
|
||||
return client.tenant_load(tenant_id, generation=self.maybe_get_generation(tenant_id))
|
||||
|
||||
def maybe_get_generation(self, tenant_id: TenantId):
|
||||
"""
|
||||
For tests that would like to use an HTTP client directly instead of using
|
||||
the `tenant_attach` and `tenant_create` helpers here: issue a generation
|
||||
number for a tenant.
|
||||
|
||||
Returns None if the attachment service is not enabled (legacy mode)
|
||||
"""
|
||||
if self.env.attachment_service is not None:
|
||||
return self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
|
||||
else:
|
||||
return None
|
||||
return client.tenant_load(
|
||||
tenant_id, generation=self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
|
||||
)
|
||||
|
||||
|
||||
def append_pageserver_param_overrides(
|
||||
@@ -2752,6 +2739,13 @@ class Endpoint(PgProtocol):
|
||||
):
|
||||
self.stop()
|
||||
|
||||
# Checkpoint the running endpoint and return the pg_wal size in MB.
|
||||
def get_pg_wal_size(self):
|
||||
log.info(f'checkpointing at LSN {self.safe_psql("select pg_current_wal_lsn()")[0][0]}')
|
||||
self.safe_psql("checkpoint")
|
||||
assert self.pgdata_dir is not None # please mypy
|
||||
return get_dir_size(os.path.join(self.pgdata_dir, "pg_wal")) / 1024 / 1024
|
||||
|
||||
|
||||
class EndpointFactory:
|
||||
"""An object representing multiple compute endpoints."""
|
||||
@@ -2950,6 +2944,13 @@ class Safekeeper:
|
||||
return segments
|
||||
|
||||
|
||||
# Walreceiver as returned by sk's timeline status endpoint.
|
||||
@dataclass
|
||||
class Walreceiver:
|
||||
conn_id: int
|
||||
state: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class SafekeeperTimelineStatus:
|
||||
acceptor_epoch: int
|
||||
@@ -2960,6 +2961,7 @@ class SafekeeperTimelineStatus:
|
||||
backup_lsn: Lsn
|
||||
peer_horizon_lsn: Lsn
|
||||
remote_consistent_lsn: Lsn
|
||||
walreceivers: List[Walreceiver]
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -3021,6 +3023,7 @@ class SafekeeperHttpClient(requests.Session):
|
||||
res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}")
|
||||
res.raise_for_status()
|
||||
resj = res.json()
|
||||
walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]]
|
||||
return SafekeeperTimelineStatus(
|
||||
acceptor_epoch=resj["acceptor_state"]["epoch"],
|
||||
pg_version=resj["pg_info"]["pg_version"],
|
||||
@@ -3030,6 +3033,7 @@ class SafekeeperHttpClient(requests.Session):
|
||||
backup_lsn=Lsn(resj["backup_lsn"]),
|
||||
peer_horizon_lsn=Lsn(resj["peer_horizon_lsn"]),
|
||||
remote_consistent_lsn=Lsn(resj["remote_consistent_lsn"]),
|
||||
walreceivers=walreceivers,
|
||||
)
|
||||
|
||||
def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body):
|
||||
|
||||
@@ -125,3 +125,51 @@ class TenantId(Id):
|
||||
class TimelineId(Id):
|
||||
def __repr__(self) -> str:
|
||||
return f'TimelineId("{self.id.hex()}")'
|
||||
|
||||
|
||||
# Workaround for compat with python 3.9, which does not have `typing.Self`
|
||||
TTenantShardId = TypeVar("TTenantShardId", bound="TenantShardId")
|
||||
|
||||
|
||||
class TenantShardId:
|
||||
def __init__(self, tenant_id: TenantId, shard_number: int, shard_count: int):
|
||||
self.tenant_id = tenant_id
|
||||
self.shard_number = shard_number
|
||||
self.shard_count = shard_count
|
||||
assert self.shard_number < self.shard_count or self.shard_count == 0
|
||||
|
||||
@classmethod
|
||||
def parse(cls: Type[TTenantShardId], input) -> TTenantShardId:
|
||||
if len(input) == 32:
|
||||
return cls(
|
||||
tenant_id=TenantId(input),
|
||||
shard_number=0,
|
||||
shard_count=0,
|
||||
)
|
||||
elif len(input) == 37:
|
||||
return cls(
|
||||
tenant_id=TenantId(input[0:32]),
|
||||
shard_number=int(input[33:35], 16),
|
||||
shard_count=int(input[35:37], 16),
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Invalid TenantShardId '{input}'")
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.tenant_id}-{self.shard_number:02x}{self.shard_count:02x}"
|
||||
|
||||
def _tuple(self) -> tuple[TenantId, int, int]:
|
||||
return (self.tenant_id, self.shard_number, self.shard_count)
|
||||
|
||||
def __lt__(self, other) -> bool:
|
||||
if not isinstance(other, type(self)):
|
||||
return NotImplemented
|
||||
return self._tuple() < other._tuple()
|
||||
|
||||
def __eq__(self, other) -> bool:
|
||||
if not isinstance(other, type(self)):
|
||||
return NotImplemented
|
||||
return self._tuple() == other._tuple()
|
||||
|
||||
def __hash__(self) -> int:
|
||||
return hash(self._tuple())
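To make the two accepted input lengths concrete, here is a tiny standalone example of the encoding that `parse` and `__str__` above agree on (the tenant id is made up):

```python
# "<32 hex chars of tenant id>" for unsharded, or
# "<tenant id>-<2 hex shard_number><2 hex shard_count>" for sharded ids.
tenant = "3aa8fcc61f1d4dcbb5f7ecd2a9e72c5d"  # made-up example
sharded = f"{tenant}-0104"                   # shard 1 of a 4-shard tenant

assert len(tenant) == 32 and len(sharded) == 37
assert int(sharded[33:35], 16) == 1  # shard_number
assert int(sharded[35:37], 16) == 4  # shard_count
```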
|
||||
|
||||
@@ -61,7 +61,6 @@ def measure_recovery_time(env: NeonCompare):
|
||||
# of view, but the same as far as the safekeeper/WAL is concerned. To work around that,
|
||||
# we will explicitly create the tenant in the same generation that it was previously
|
||||
# attached in.
|
||||
assert env.env.attachment_service is not None
|
||||
attach_status = env.env.attachment_service.inspect(tenant_id=env.tenant)
|
||||
assert attach_status is not None
|
||||
(attach_gen, _) = attach_status
|
||||
|
||||
@@ -151,7 +151,9 @@ def test_clickbench(query: LabelledQuery, remote_compare: RemoteCompare, scale:
|
||||
An OLAP-style ClickHouse benchmark
|
||||
|
||||
Based on https://github.com/ClickHouse/ClickBench/tree/c00135ca5b6a0d86fedcdbf998fdaa8ed85c1c3b/aurora-postgresql
|
||||
The DB prepared manually in advance
|
||||
The DB is prepared manually in advance.
|
||||
Important: after the initial data load, run `VACUUM (DISABLE_PAGE_SKIPPING, FREEZE, ANALYZE) hits;`
|
||||
to ensure that Postgres optimizer chooses the same plans as RDS and Aurora.
|
||||
"""
|
||||
explain: bool = os.getenv("TEST_OLAP_COLLECT_EXPLAIN", "false").lower() == "true"
|
||||
|
||||
|
||||
@@ -136,10 +136,7 @@ def test_no_config(positive_env: NeonEnv, content_type: Optional[str]):
|
||||
ps_http.tenant_detach(tenant_id)
|
||||
assert tenant_id not in [TenantId(t["id"]) for t in ps_http.tenant_list()]
|
||||
|
||||
body = {}
|
||||
gen = env.pageserver.maybe_get_generation(tenant_id)
|
||||
if gen is not None:
|
||||
body["generation"] = gen
|
||||
body = {"generation": env.attachment_service.attach_hook_issue(tenant_id, env.pageserver.id)}
|
||||
|
||||
ps_http.post(
|
||||
f"{ps_http.base_url}/v1/tenant/{tenant_id}/attach",
|
||||
|
||||
@@ -87,7 +87,6 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
|
||||
#
|
||||
# Since we're dual-attached, we need to tip off the attachment service to treat the one we're
|
||||
# about to start as the attached pageserver
|
||||
assert env.attachment_service is not None
|
||||
env.attachment_service.attach_hook_issue(env.initial_tenant, env.pageservers[0].id)
|
||||
env.pageservers[0].start()
|
||||
env.pageservers[1].stop()
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import enum
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, Tuple
|
||||
from typing import Any, Dict, Tuple
|
||||
|
||||
import pytest
|
||||
import toml
|
||||
@@ -64,6 +65,23 @@ def test_min_resident_size_override_handling(
|
||||
assert_config(tenant_id, None, config_level_override)
|
||||
|
||||
|
||||
@enum.unique
|
||||
class EvictionOrder(str, enum.Enum):
|
||||
ABSOLUTE_ORDER = "absolute"
|
||||
RELATIVE_ORDER_EQUAL = "relative_equal"
|
||||
RELATIVE_ORDER_SPARE = "relative_spare"
|
||||
|
||||
def config(self) -> Dict[str, Any]:
|
||||
if self == EvictionOrder.ABSOLUTE_ORDER:
|
||||
return {"type": "AbsoluteAccessed"}
|
||||
elif self == EvictionOrder.RELATIVE_ORDER_EQUAL:
|
||||
return {"type": "RelativeAccessed", "args": {"highest_layer_count_loses_first": False}}
|
||||
elif self == EvictionOrder.RELATIVE_ORDER_SPARE:
|
||||
return {"type": "RelativeAccessed", "args": {"highest_layer_count_loses_first": True}}
|
||||
else:
|
||||
raise RuntimeError(f"not implemented: {self}")
|
||||
|
||||
|
||||
@dataclass
|
||||
class EvictionEnv:
|
||||
timelines: list[Tuple[TenantId, TimelineId]]
|
||||
@@ -108,13 +126,14 @@ class EvictionEnv:
|
||||
_avg = cur.fetchone()
|
||||
|
||||
def pageserver_start_with_disk_usage_eviction(
|
||||
self, period, max_usage_pct, min_avail_bytes, mock_behavior
|
||||
self, period, max_usage_pct, min_avail_bytes, mock_behavior, eviction_order: EvictionOrder
|
||||
):
|
||||
disk_usage_config = {
|
||||
"period": period,
|
||||
"max_usage_pct": max_usage_pct,
|
||||
"min_avail_bytes": min_avail_bytes,
|
||||
"mock_statvfs": mock_behavior,
|
||||
"eviction_order": eviction_order.config(),
|
||||
}
|
||||
|
||||
enc = toml.TomlEncoder()
|
||||
@@ -270,7 +289,13 @@ def test_broken_tenants_are_skipped(eviction_env: EvictionEnv):
|
||||
env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE)
|
||||
|
||||
|
||||
def test_pageserver_evicts_until_pressure_is_relieved(eviction_env: EvictionEnv):
|
||||
@pytest.mark.parametrize(
|
||||
"order",
|
||||
[EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL],
|
||||
)
|
||||
def test_pageserver_evicts_until_pressure_is_relieved(
|
||||
eviction_env: EvictionEnv, order: EvictionOrder
|
||||
):
|
||||
"""
|
||||
Basic test to ensure that we evict enough to relieve pressure.
|
||||
"""
|
||||
@@ -281,7 +306,9 @@ def test_pageserver_evicts_until_pressure_is_relieved(eviction_env: EvictionEnv)
|
||||
|
||||
target = total_on_disk // 2
|
||||
|
||||
response = pageserver_http.disk_usage_eviction_run({"evict_bytes": target})
|
||||
response = pageserver_http.disk_usage_eviction_run(
|
||||
{"evict_bytes": target, "eviction_order": order.config()}
|
||||
)
|
||||
log.info(f"{response}")
|
||||
|
||||
(later_total_on_disk, _, _) = env.timelines_du()
|
||||
@@ -296,7 +323,13 @@ def test_pageserver_evicts_until_pressure_is_relieved(eviction_env: EvictionEnv)
|
||||
assert response["Finished"]["assumed"]["failed"]["count"] == 0, "zero failures expected"
|
||||
|
||||
|
||||
def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv):
|
||||
@pytest.mark.parametrize(
|
||||
"order",
|
||||
[EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL],
|
||||
)
|
||||
def test_pageserver_respects_overridden_resident_size(
|
||||
eviction_env: EvictionEnv, order: EvictionOrder
|
||||
):
|
||||
"""
|
||||
Override tenant min resident and ensure that it will be respected by eviction.
|
||||
"""
|
||||
@@ -336,7 +369,9 @@ def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv)
|
||||
env.warm_up_tenant(large_tenant[0])
|
||||
|
||||
# do one run
|
||||
response = ps_http.disk_usage_eviction_run({"evict_bytes": target})
|
||||
response = ps_http.disk_usage_eviction_run(
|
||||
{"evict_bytes": target, "eviction_order": order.config()}
|
||||
)
|
||||
log.info(f"{response}")
|
||||
|
||||
time.sleep(1) # give log time to flush
|
||||
@@ -365,7 +400,11 @@ def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv)
|
||||
assert du_by_timeline[large_tenant] - later_du_by_timeline[large_tenant] >= target
|
||||
|
||||
|
||||
def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv):
|
||||
@pytest.mark.parametrize(
|
||||
"order",
|
||||
[EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL],
|
||||
)
|
||||
def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: EvictionOrder):
|
||||
"""
|
||||
If we can't relieve pressure using tenant_min_resident_size-respecting eviction,
|
||||
we should continue to evict layers following global LRU.
|
||||
@@ -376,7 +415,9 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv):
|
||||
(total_on_disk, _, _) = env.timelines_du()
|
||||
target = total_on_disk
|
||||
|
||||
response = ps_http.disk_usage_eviction_run({"evict_bytes": target})
|
||||
response = ps_http.disk_usage_eviction_run(
|
||||
{"evict_bytes": target, "eviction_order": order.config()}
|
||||
)
|
||||
log.info(f"{response}")
|
||||
|
||||
(later_total_on_disk, _, _) = env.timelines_du()
|
||||
@@ -389,7 +430,15 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv):
|
||||
env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE)
|
||||
|
||||
|
||||
def test_partial_evict_tenant(eviction_env: EvictionEnv):
|
||||
@pytest.mark.parametrize(
|
||||
"order",
|
||||
[
|
||||
EvictionOrder.ABSOLUTE_ORDER,
|
||||
EvictionOrder.RELATIVE_ORDER_EQUAL,
|
||||
EvictionOrder.RELATIVE_ORDER_SPARE,
|
||||
],
|
||||
)
|
||||
def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder):
|
||||
"""
|
||||
Warm up a tenant, then build up pressure to cause evictions in both.
|
||||
We expect
|
||||
@@ -402,7 +451,7 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv):
|
||||
(total_on_disk, _, _) = env.timelines_du()
|
||||
du_by_timeline = env.du_by_timeline()
|
||||
|
||||
# pick any tenant
|
||||
# pick the smaller or the larger tenant (iteration order is the insertion order of scale=4 and scale=6)
|
||||
[warm, cold] = list(du_by_timeline.keys())
|
||||
(tenant_id, timeline_id) = warm
|
||||
|
||||
@@ -413,7 +462,9 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv):
|
||||
# but not enough to fall into global LRU.
|
||||
# So, set target to all occupied space, except 2*env.layer_size per tenant
|
||||
target = du_by_timeline[cold] + (du_by_timeline[warm] // 2) - 2 * 2 * env.layer_size
|
||||
response = ps_http.disk_usage_eviction_run({"evict_bytes": target})
|
||||
response = ps_http.disk_usage_eviction_run(
|
||||
{"evict_bytes": target, "eviction_order": order.config()}
|
||||
)
|
||||
log.info(f"{response}")
|
||||
|
||||
(later_total_on_disk, _, _) = env.timelines_du()
|
||||
@@ -428,28 +479,32 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv):
|
||||
), "all tenants should have lost some layers"
|
||||
|
||||
warm_size = later_du_by_timeline[warm]
|
||||
|
||||
# bounds for warmed_size
|
||||
warm_lower = 0.5 * du_by_timeline[warm]
|
||||
|
||||
# We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room.
|
||||
# So, check for up to 3 here.
|
||||
warm_upper = warm_lower + 3 * env.layer_size
|
||||
|
||||
cold_size = later_du_by_timeline[cold]
|
||||
cold_upper = 2 * env.layer_size
|
||||
|
||||
log.info(
|
||||
f"expecting for warm tenant: {human_bytes(warm_lower)} < {human_bytes(warm_size)} < {human_bytes(warm_upper)}"
|
||||
)
|
||||
log.info(f"expecting for cold tenant: {human_bytes(cold_size)} < {human_bytes(cold_upper)}")
|
||||
if order == EvictionOrder.ABSOLUTE_ORDER:
|
||||
# bounds for warmed_size
|
||||
warm_lower = 0.5 * du_by_timeline[warm]
|
||||
|
||||
assert warm_size > warm_lower, "warmed up tenant should be at about half size (lower)"
|
||||
assert warm_size < warm_upper, "warmed up tenant should be at about half size (upper)"
|
||||
# We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room.
|
||||
# So, check for up to 3 here.
|
||||
warm_upper = warm_lower + 3 * env.layer_size
|
||||
|
||||
assert (
|
||||
cold_size < cold_upper
|
||||
), "the cold tenant should be evicted to its min_resident_size, i.e., max layer file size"
|
||||
cold_upper = 2 * env.layer_size
|
||||
log.info(f"tenants: warm={warm[0]}, cold={cold[0]}")
|
||||
log.info(
|
||||
f"expecting for warm tenant: {human_bytes(warm_lower)} < {human_bytes(warm_size)} < {human_bytes(warm_upper)}"
|
||||
)
|
||||
log.info(f"expecting for cold tenant: {human_bytes(cold_size)} < {human_bytes(cold_upper)}")
|
||||
|
||||
assert warm_size > warm_lower, "warmed up tenant should be at about half size (lower)"
|
||||
assert warm_size < warm_upper, "warmed up tenant should be at about half size (upper)"
|
||||
|
||||
assert (
|
||||
cold_size < cold_upper
|
||||
), "the cold tenant should be evicted to its min_resident_size, i.e., max layer file size"
|
||||
else:
|
||||
# just go with the space that was freed; find proper limits later
|
||||
pass
|
||||
|
||||
|
||||
def poor_mans_du(
|
||||
@@ -501,6 +556,7 @@ def test_statvfs_error_handling(eviction_env: EvictionEnv):
|
||||
"type": "Failure",
|
||||
"mocked_error": "EIO",
|
||||
},
|
||||
eviction_order=EvictionOrder.ABSOLUTE_ORDER,
|
||||
)
|
||||
|
||||
assert env.neon_env.pageserver.log_contains(".*statvfs failed.*EIO")
|
||||
@@ -533,6 +589,7 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv):
|
||||
# This avoids accounting for metadata files & tenant conf in the tests.
|
||||
"name_filter": ".*__.*",
|
||||
},
|
||||
eviction_order=EvictionOrder.ABSOLUTE_ORDER,
|
||||
)
|
||||
|
||||
def relieved_log_message():
|
||||
@@ -573,6 +630,7 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv):
|
||||
# This avoids accounting for metadata files & tenant conf in the tests.
|
||||
"name_filter": ".*__.*",
|
||||
},
|
||||
eviction_order=EvictionOrder.ABSOLUTE_ORDER,
|
||||
)
|
||||
|
||||
def relieved_log_message():
|
||||
|
||||
@@ -157,7 +157,6 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
|
||||
time.sleep(1.1) # so that we can use change in pre_stat.st_mtime to detect overwrites
|
||||
|
||||
def get_generation_number():
|
||||
assert env.attachment_service is not None
|
||||
attachment = env.attachment_service.inspect(tenant_id)
|
||||
assert attachment is not None
|
||||
return attachment[0]
|
||||
|
||||
@@ -72,7 +72,9 @@ def check_client(env: NeonEnv, client: PageserverHttpClient):
|
||||
|
||||
# create new tenant and check it is also there
|
||||
tenant_id = TenantId.generate()
|
||||
client.tenant_create(tenant_id, generation=env.pageserver.maybe_get_generation(tenant_id))
|
||||
client.tenant_create(
|
||||
tenant_id, generation=env.attachment_service.attach_hook_issue(tenant_id, env.pageserver.id)
|
||||
)
|
||||
assert tenant_id in {TenantId(t["id"]) for t in client.tenant_list()}
|
||||
|
||||
timelines = client.timeline_list(tenant_id)
|
||||
|
||||
@@ -187,7 +187,6 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
|
||||
- After upgrade, the bucket should contain a mixture.
|
||||
- In both cases, postgres I/O should work.
|
||||
"""
|
||||
neon_env_builder.enable_generations = True
|
||||
neon_env_builder.enable_pageserver_remote_storage(
|
||||
RemoteStorageKind.MOCK_S3,
|
||||
)
|
||||
@@ -196,7 +195,6 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
|
||||
env.broker.try_start()
|
||||
for sk in env.safekeepers:
|
||||
sk.start()
|
||||
assert env.attachment_service is not None
|
||||
env.attachment_service.start()
|
||||
|
||||
env.pageserver.start(overrides=('--pageserver-config-override=control_plane_api=""',))
|
||||
@@ -262,12 +260,10 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
|
||||
def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
|
||||
neon_env_builder.enable_generations = True
|
||||
neon_env_builder.enable_pageserver_remote_storage(
|
||||
RemoteStorageKind.MOCK_S3,
|
||||
)
|
||||
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
|
||||
assert env.attachment_service is not None
|
||||
|
||||
some_other_pageserver = 1234
|
||||
ps_http = env.pageserver.http_client()
|
||||
@@ -341,7 +337,6 @@ def test_deletion_queue_recovery(
|
||||
:param validate_before: whether to wait for deletions to be validated before restart. This
|
||||
makes them elegible to be executed after restart, if the same node keeps the attachment.
|
||||
"""
|
||||
neon_env_builder.enable_generations = True
|
||||
neon_env_builder.enable_pageserver_remote_storage(
|
||||
RemoteStorageKind.MOCK_S3,
|
||||
)
@@ -405,7 +400,6 @@ def test_deletion_queue_recovery(

    if keep_attachment == KeepAttachment.LOSE:
        some_other_pageserver = 101010
        assert env.attachment_service is not None
        env.attachment_service.attach_hook_issue(env.initial_tenant, some_other_pageserver)

    env.pageserver.start()
@@ -453,7 +447,6 @@ def test_deletion_queue_recovery(


def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    neon_env_builder.enable_generations = True
    neon_env_builder.enable_pageserver_remote_storage(
        RemoteStorageKind.MOCK_S3,
    )
@@ -473,7 +466,6 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    )

    # Simulate a major incident: the control plane goes offline
    assert env.attachment_service is not None
    env.attachment_service.stop()

    # Remember how many validations had happened before the control plane went offline
@@ -545,7 +537,6 @@ def test_eviction_across_generations(neon_env_builder: NeonEnvBuilder):
    and must be constructed using the proper generation for the layer, which may not be the same generation
    that the tenant is running in.
    """
    neon_env_builder.enable_generations = True
    neon_env_builder.enable_pageserver_remote_storage(
        RemoteStorageKind.MOCK_S3,
    )
@@ -575,7 +566,6 @@ def test_multi_attach(
    neon_env_builder: NeonEnvBuilder,
    pg_bin: PgBin,
):
    neon_env_builder.enable_generations = True
    neon_env_builder.num_pageservers = 3
    neon_env_builder.enable_pageserver_remote_storage(
        remote_storage_kind=RemoteStorageKind.MOCK_S3,

@@ -9,9 +9,7 @@ from fixtures.utils import wait_until

# Test restarting page server, while safekeeper and compute node keep
# running.
-@pytest.mark.parametrize("generations", [True, False])
-def test_pageserver_restart(neon_env_builder: NeonEnvBuilder, generations: bool):
-    neon_env_builder.enable_generations = generations
+def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
    neon_env_builder.enable_scrub_on_exit()


@@ -57,13 +57,11 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int):
    states are valid, so that we may test it in this way: the API should always
    work as long as the tenant exists.
    """
    neon_env_builder.enable_generations = True
    neon_env_builder.num_pageservers = 3
    neon_env_builder.enable_pageserver_remote_storage(
        remote_storage_kind=RemoteStorageKind.MOCK_S3,
    )
    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
    assert env.attachment_service is not None

    pageservers = env.pageservers
    list([p.http_client() for p in pageservers])
@@ -210,13 +208,11 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder):
    """
    Test the sequence of location states that are used in a live migration.
    """
    neon_env_builder.enable_generations = True
    neon_env_builder.num_pageservers = 2
    neon_env_builder.enable_pageserver_remote_storage(
        remote_storage_kind=RemoteStorageKind.MOCK_S3,
    )
    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
    assert env.attachment_service is not None

    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline

@@ -60,8 +60,6 @@ def test_remote_storage_backup_and_restore(

    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)

    neon_env_builder.enable_generations = generations

    # Exercise retry code path by making all uploads and downloads fail for the
    # first time. The retries print INFO-messages to the log; we will check
    # that they are present after the test.

@@ -263,15 +263,6 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
        ps_http, env.initial_tenant, timeline_id, iterations=iterations
    )

-    if failpoint == "timeline-delete-after-index-delete":
-        m = ps_http.get_metrics()
-        assert (
-            m.query_one(
-                "remote_storage_s3_request_seconds_count",
-                filter={"request_type": "get_object", "result": "ok"},
-            ).value
-            == 1  # index part for initial timeline
-        )
    elif check is Check.RETRY_WITHOUT_RESTART:
        # this should succeed
        # this also checks that delete can be retried even when timeline is in Broken state

@@ -1,3 +1,4 @@
+import concurrent.futures
 import math
 import queue
 import random
@@ -24,6 +25,7 @@ from fixtures.pageserver.utils import (
    assert_tenant_state,
    timeline_delete_wait_completed,
    wait_for_upload_queue_empty,
    wait_tenant_status_404,
    wait_until_tenant_active,
)
from fixtures.pg_version import PgVersion
@@ -776,6 +778,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):

    def get_tenant_states():
        states = {}
+        log.info(f"Tenant ids: {tenant_ids}")
        for tenant_id in tenant_ids:
            tenant = pageserver_http.tenant_status(tenant_id=tenant_id)
            states[tenant_id] = tenant["state"]["slug"]
@@ -872,3 +875,51 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
        pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants
    )
    assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == n_tenants

    # Check that tenant deletion proactively wakes tenants: this is done separately from the main
    # body of the test because it will disrupt tenant counts
    env.pageserver.stop()
    env.pageserver.start(
        extra_env_vars={"FAILPOINTS": "timeline-calculate-logical-size-pause=pause"}
    )

    wait_until(10, 1, at_least_one_active)
    delete_tenant_id = list(
        [(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"]
    )[0][0]

    # Deleting a stuck tenant should prompt it to go active
    with concurrent.futures.ThreadPoolExecutor() as executor:
        log.info("Starting background delete")

        def delete_tenant():
            env.pageserver.http_client().tenant_delete(delete_tenant_id)

        background_delete = executor.submit(delete_tenant)

        # Deletion itself won't complete due to our failpoint: Tenant::shutdown can't finish while
        # logical size calculation is paused on the failpoint. So instead we use a log observation
        # to check that on-demand activation was triggered by the tenant deletion.
        log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000}}: Activating tenant \\(on-demand\\).*"

        def activated_on_demand():
            assert env.pageserver.log_contains(log_match) is not None

        log.info(f"Waiting for activation message '{log_match}'")
        try:
            wait_until(10, 1, activated_on_demand)
        finally:
            log.info("Clearing failpoint")
            pageserver_http.configure_failpoints(("timeline-calculate-logical-size-pause", "off"))

        # Deletion should complete successfully now that the failpoint is unblocked
        log.info("Joining background delete")
        background_delete.result(timeout=10)

    # Poll for deletion to complete
    wait_tenant_status_404(pageserver_http, tenant_id=delete_tenant_id, iterations=40)
    tenant_ids.remove(delete_tenant_id)

    # Check that all the stuck tenants proceed to active (apart from the one that deletes)
    wait_until(10, 1, all_active)
    assert len(get_tenant_states()) == n_tenants - 1
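The block above relies on two predicates, at_least_one_active and all_active, that are defined in a part of test_ondemand_activation not shown in this hunk. Presumably they are thin wrappers over get_tenant_states(), roughly like this sketch (an assumption, not code from the diff):

def at_least_one_active():
    # At least one tenant has finished on-demand activation.
    assert "Active" in set(get_tenant_states().values())


def all_active():
    # Every remaining tenant has reached the Active state.
    assert all(state == "Active" for state in get_tenant_states().values())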

@@ -419,7 +419,8 @@ def wait(f, desc, timeout=30, wait_f=None):
        try:
            if f():
                break
-        except Exception:
+        except Exception as e:
+            log.info(f"got exception while waiting for {desc}: {e}")
            pass
        elapsed = time.time() - started_at
        if elapsed > timeout:
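For readers without the surrounding file, the full polling helper presumably looks roughly like the self-contained sketch below. Only the exception logging is what this hunk actually changes; the loop shape, the wait_f hook, and the sleep interval are assumptions based on the signature and the fragment above.

import logging
import time

log = logging.getLogger(__name__)


def wait(f, desc, timeout=30, wait_f=None):
    """Poll f() until it returns truthy or `timeout` seconds elapse (sketch)."""
    started_at = time.time()
    while True:
        try:
            if f():
                break
        except Exception as e:
            # The change in the hunk above: log why the predicate is not ready yet.
            log.info(f"got exception while waiting for {desc}: {e}")
        elapsed = time.time() - started_at
        if elapsed > timeout:
            raise RuntimeError(f"timed out after {elapsed:.1f}s waiting for {desc}")
        if wait_f is not None:
            wait_f()
        else:
            time.sleep(0.5)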
@@ -1001,8 +1002,40 @@ def test_restart_endpoint(neon_env_builder: NeonEnvBuilder):
    endpoint.start()


# Context manager which logs passed time on exit.
class DurationLogger:
    def __init__(self, desc):
        self.desc = desc

    def __enter__(self):
        self.ts_before = time.time()

    def __exit__(self, *exc):
        log.info(f"{self.desc} finished in {time.time() - self.ts_before}s")


# Context manager which logs WAL position change on exit.
class WalChangeLogger:
    def __init__(self, ep, desc_before):
        self.ep = ep
        self.desc_before = desc_before

    def __enter__(self):
        self.ts_before = time.time()
        self.lsn_before = Lsn(self.ep.safe_psql_scalar("select pg_current_wal_lsn()"))
        log.info(f"{self.desc_before}, lsn_before={self.lsn_before}")

    def __exit__(self, *exc):
        lsn_after = Lsn(self.ep.safe_psql_scalar("select pg_current_wal_lsn()"))
        log.info(
            f"inserted {((lsn_after - self.lsn_before) / 1024 / 1024):.3f} MB of WAL in {(time.time() - self.ts_before):.3f}s"
        )


# Test that we can create a timeline with one safekeeper down and initialize it
-# later when some data already had been written.
+# later when some data has already been written. It is strictly weaker than
+# test_lagging_sk, but it is also the simplest test that triggers WAL download from
+# safekeeper to compute (recovery), and as such is useful for development/testing.
def test_late_init(neon_env_builder: NeonEnvBuilder):
    neon_env_builder.num_safekeepers = 3
    env = neon_env_builder.init_start()
@@ -1010,12 +1043,13 @@ def test_late_init(neon_env_builder: NeonEnvBuilder):
    sk1 = env.safekeepers[0]
    sk1.stop()

    # create and insert something while the safekeeper is down...
    env.neon_cli.create_branch("test_late_init")
    tenant_id = env.initial_tenant
    timeline_id = env.neon_cli.create_branch("test_late_init")
    endpoint = env.endpoints.create_start("test_late_init")
    # create and insert something while the safekeeper is down...
    endpoint.safe_psql("create table t(key int, value text)")
    endpoint.safe_psql("insert into t select generate_series(1, 1000), 'payload'")
    log.info("insert with safekeeper down done")
    with WalChangeLogger(endpoint, "doing insert with sk1 down"):
        endpoint.safe_psql("insert into t select generate_series(1, 1000), 'payload'")
    endpoint.stop()  # stop compute

    # stop another safekeeper, and start the one which missed timeline creation
@@ -1024,28 +1058,213 @@ def test_late_init(neon_env_builder: NeonEnvBuilder):
    sk1.start()

    # insert some more
-    endpoint = env.endpoints.create_start("test_late_init")
+    with DurationLogger("recovery"):
+        endpoint = env.endpoints.create_start("test_late_init")
    endpoint.safe_psql("insert into t select generate_series(1,100), 'payload'")

    wait_flush_lsn_align_by_ep(
        env, "test_late_init", tenant_id, timeline_id, endpoint, [sk1, env.safekeepers[2]]
    )
    # Check that WALs are the same.
    cmp_sk_wal([sk1, env.safekeepers[2]], tenant_id, timeline_id)


# is timeline flush_lsn equal on the provided safekeepers?
-def is_flush_lsn_aligned(sk1_http_cli, sk2_http_cli, tenant_id, timeline_id):
-    status1 = sk1_http_cli.timeline_status(tenant_id, timeline_id)
-    status2 = sk2_http_cli.timeline_status(tenant_id, timeline_id)
-    log.info(
-        f"waiting for flush_lsn alignment, sk1.flush_lsn={status1.flush_lsn}, sk2.flush_lsn={status2.flush_lsn}"
+def is_flush_lsn_aligned(sk_http_clis, tenant_id, timeline_id):
+    flush_lsns = [
+        sk_http_cli.timeline_status(tenant_id, timeline_id).flush_lsn
+        for sk_http_cli in sk_http_clis
+    ]
+    log.info(f"waiting for flush_lsn alignment, flush_lsns={flush_lsns}")
+    return all([flush_lsns[0] == flsn for flsn in flush_lsns])
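The refactored predicate takes a list of safekeeper HTTP clients instead of exactly two, which is how the later hunks drive it through the wait helper, for example:

from functools import partial

# Poll until every listed safekeeper reports the same flush_lsn for the timeline.
sk_http_clis = [sk.http_client() for sk in env.safekeepers]
wait(
    partial(is_flush_lsn_aligned, sk_http_clis, tenant_id, timeline_id),
    "flush_lsn to get aligned",
)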


def are_walreceivers_absent(sk_http_cli, tenant_id: TenantId, timeline_id: TimelineId):
    status = sk_http_cli.timeline_status(tenant_id, timeline_id)
    log.info(f"waiting for walreceivers to be gone, currently {status.walreceivers}")
    return len(status.walreceivers) == 0


# Assert by xxd that WAL on the given safekeepers is identical. No compute must be
# running for this to be reliable.
def cmp_sk_wal(sks: List[Safekeeper], tenant_id: TenantId, timeline_id: TimelineId):
    assert len(sks) >= 2, "cmp_sk_wal makes sense with >= 2 safekeepers passed"
    sk_http_clis = [sk.http_client() for sk in sks]

    # First check that term / flush_lsn are the same: it is easier to
    # report/understand if WALs are different due to that.
    statuses = [sk_http_cli.timeline_status(tenant_id, timeline_id) for sk_http_cli in sk_http_clis]
    term_flush_lsns = [(s.acceptor_epoch, s.flush_lsn) for s in statuses]
    for tfl, sk in zip(term_flush_lsns[1:], sks[1:]):
        assert (
            term_flush_lsns[0] == tfl
        ), f"(term, flush_lsn) are not equal on sks {sks[0].id} and {sk.id}: {term_flush_lsns[0]} != {tfl}"

    # check that WALs are identical.
    segs = [sk.list_segments(tenant_id, timeline_id) for sk in sks]
    for cmp_segs, sk in zip(segs[1:], sks[1:]):
        assert (
            segs[0] == cmp_segs
        ), f"lists of segments on sks {sks[0].id} and {sk.id} are not identical: {segs[0]} and {cmp_segs}"
    log.info(f"comparing segs {segs[0]}")

    sk0 = sks[0]
    for sk in sks[1:]:
        (_, mismatch, not_regular) = filecmp.cmpfiles(
            sk0.timeline_dir(tenant_id, timeline_id),
            sk.timeline_dir(tenant_id, timeline_id),
            segs[0],
            shallow=False,
        )
        log.info(
            f"filecmp result mismatch and not regular files:\n\t mismatch={mismatch}\n\t not_regular={not_regular}"
        )

        for f in mismatch:
            f1 = os.path.join(sk0.timeline_dir(tenant_id, timeline_id), f)
            f2 = os.path.join(sk.timeline_dir(tenant_id, timeline_id), f)
            stdout_filename = "{}.filediff".format(f2)

            with open(stdout_filename, "w") as stdout_f:
                subprocess.run("xxd {} > {}.hex ".format(f1, f1), shell=True)
                subprocess.run("xxd {} > {}.hex ".format(f2, f2), shell=True)

                cmd = "diff {}.hex {}.hex".format(f1, f2)
                subprocess.run([cmd], stdout=stdout_f, shell=True)

        assert (mismatch, not_regular) == (
            [],
            [],
        ), f"WAL segs {f1} and {f2} on sks {sks[0].id} and {sk.id} are not identical"


# Wait until flush_lsn on the given sks becomes equal, assuming endpoint ep is
# running. ep is stopped by this function. This is used in tests which check
# binary equality of WAL segments on safekeepers, which is inherently racy: shutting
# down the endpoint may write some WAL which reaches only one safekeeper. So here we
# recheck flush_lsn again after ep shutdown and retry if it has changed.
def wait_flush_lsn_align_by_ep(env, branch, tenant_id, timeline_id, ep, sks):
    sk_http_clis = [sk.http_client() for sk in sks]
    # First wait for the alignment.
    wait(
        partial(is_flush_lsn_aligned, sk_http_clis, tenant_id, timeline_id),
        "flush_lsn to get aligned",
    )
-    return status1.flush_lsn == status2.flush_lsn
    ep.stop()  # then stop endpoint
    # Even if there is no compute, there might be some in-flight data; ensure
    # all walreceivers die before rechecking.
    for sk_http_cli in sk_http_clis:
        wait(
            partial(are_walreceivers_absent, sk_http_cli, tenant_id, timeline_id),
            "walreceivers to be gone",
        )
    # Now recheck flush_lsn and exit if it is good
    if is_flush_lsn_aligned(sk_http_clis, tenant_id, timeline_id):
        return
    # Otherwise repeat.
    log.info("flush_lsn changed during endpoint shutdown; retrying alignment")
    ep = env.endpoints.create_start(branch)
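As rendered here the helper ends by starting a fresh endpoint with no visible loop around it; presumably the full version retries the whole sequence. A compressed sketch of that assumed shape (the while loop is a guess, the individual steps come from the fragment above):

def wait_flush_lsn_align_by_ep_sketch(env, branch, tenant_id, timeline_id, ep, sks):
    sk_http_clis = [sk.http_client() for sk in sks]
    while True:
        wait(partial(is_flush_lsn_aligned, sk_http_clis, tenant_id, timeline_id), "flush_lsn to get aligned")
        ep.stop()  # shutdown may still emit WAL that reaches only some safekeepers
        for cli in sk_http_clis:
            wait(partial(are_walreceivers_absent, cli, tenant_id, timeline_id), "walreceivers to be gone")
        if is_flush_lsn_aligned(sk_http_clis, tenant_id, timeline_id):
            return
        ep = env.endpoints.create_start(branch)  # go around again with a fresh endpoint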


# Test behaviour with one safekeeper down and missing a lot of WAL. Namely, that
# 1) walproposer can't recover the node if it misses WAL written by previous computes, but
#    still starts up and functions normally if the two other sks are ok.
# 2) walproposer doesn't keep WAL beyond some threshold (pg_wal bloat is limited), but functions
#    normally if the two other sks are ok.
# 3) A lagging safekeeper can still recover via peer recovery.
def test_one_sk_down(neon_env_builder: NeonEnvBuilder):
    pass
# Test behaviour with one safekeeper down and missing a lot of WAL, exercising
# neon_walreader and checking that pg_wal never bloats. Namely, ensures that
# compute doesn't keep much WAL for a lagging sk, but can still recover it with
# neon_walreader, in two scenarios: a) WAL never existed on compute (it started
# on a basebackup LSN later than the lagging sk position) though the segment file exists;
# b) WAL had been recycled on it and the segment file doesn't exist.
#
# Also checks along the way that whenever two sks are alive, compute
# should be able to commit.
def test_lagging_sk(neon_env_builder: NeonEnvBuilder):
    # inserts ~20MB of WAL, a bit more than a segment.
    def fill_segment(ep):
        ep.safe_psql("insert into t select generate_series(1, 180000), 'payload'")

    neon_env_builder.num_safekeepers = 3
    env = neon_env_builder.init_start()

    (sk1, sk2, sk3) = env.safekeepers

    # create and insert something while the safekeeper is down...
    sk1.stop()
    tenant_id = env.initial_tenant
    timeline_id = env.neon_cli.create_branch("test_lagging_sk")
    ep = env.endpoints.create_start("test_lagging_sk")
    ep.safe_psql("create table t(key int, value text)")
    # make a small insert to stay on the same segment
    ep.safe_psql("insert into t select generate_series(1, 1000), 'payload'")
    log.info("insert with safekeeper down done")
    ep.stop()  # stop compute

    # Stop another safekeeper, and start the one which missed timeline creation.
    sk2.stop()
    sk1.start()

    # Start a new ep and insert some more. neon_walreader should download WAL for
    # sk1 because it should be filled since the horizon (initial LSN), which is
    # earlier than the basebackup LSN.
    ep = env.endpoints.create_start("test_lagging_sk")
    ep.safe_psql("insert into t select generate_series(1,100), 'payload'")
    # stop ep and ensure WAL is identical after recovery.
    wait_flush_lsn_align_by_ep(env, "test_lagging_sk", tenant_id, timeline_id, ep, [sk1, sk3])
    # Check that WALs are the same.
    cmp_sk_wal([sk1, sk3], tenant_id, timeline_id)

    # Now repeat the insertion with sk1 down, but insert more data to check
    # that WAL on the compute is removed.
    sk1.stop()
    sk2.start()

    # min_wal_size must be at least 2x segment size.
    min_wal_config = [
        "min_wal_size=32MB",
        "max_wal_size=32MB",
        "wal_keep_size=0",
        "log_checkpoints=on",
    ]
    ep = env.endpoints.create_start(
        "test_lagging_sk",
        config_lines=min_wal_config,
    )
    with WalChangeLogger(ep, "doing large insert with sk1 down"):
        for _ in range(0, 5):
            fill_segment(ep)
            # there shouldn't be more than 2 WAL segments (but the dir may have archive_status files)
            assert ep.get_pg_wal_size() < 16 * 2.5

    sk2.stop()  # stop another sk to ensure sk1 and sk3 can work
    sk1.start()
    with DurationLogger("recovery"):
        ep.safe_psql("insert into t select generate_series(1,100), 'payload'")  # forces recovery
    # stop ep and ensure WAL is identical after recovery.
    wait_flush_lsn_align_by_ep(env, "test_lagging_sk", tenant_id, timeline_id, ep, [sk1, sk3])
    # Check that WALs are the same.
    cmp_sk_wal([sk1, sk3], tenant_id, timeline_id)

    # Now do the same with a different safekeeper (sk2) down, and restart ep
    # before recovery (again a scenario where recovery starts below basebackup_lsn,
    # but multi-segment now).
    ep = env.endpoints.create_start(
        "test_lagging_sk",
        config_lines=["min_wal_size=32MB", "max_wal_size=32MB", "log_checkpoints=on"],
    )
    with WalChangeLogger(ep, "doing large insert with sk2 down"):
        for _ in range(0, 5):
            fill_segment(ep)
            # there shouldn't be more than 2 WAL segments (but the dir may have archive_status files)
            assert ep.get_pg_wal_size() < 16 * 2.5

    ep.stop()
    ep = env.endpoints.create_start(
        "test_lagging_sk",
        config_lines=min_wal_config,
    )
    sk2.start()
    with DurationLogger("recovery"):
        wait_flush_lsn_align_by_ep(env, "test_lagging_sk", tenant_id, timeline_id, ep, [sk2, sk3])
    # Check that WALs are the same.
    cmp_sk_wal([sk1, sk2, sk3], tenant_id, timeline_id)

# Smaller version of test_one_sk_down testing peer recovery in isolation: that
@@ -1065,7 +1284,7 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder):
    sk2_http_cli = sk2.http_client()
    # ensure tli gets created on sk1, peer recovery won't do that
    wait(
-        partial(is_flush_lsn_aligned, sk1_http_cli, sk2_http_cli, tenant_id, timeline_id),
+        partial(is_flush_lsn_aligned, [sk1_http_cli, sk2_http_cli], tenant_id, timeline_id),
        "flush_lsn to get aligned",
    )

@@ -1087,7 +1306,7 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder):
    assert sk2_tli_status.flush_lsn - sk1_tli_status.flush_lsn >= 16 * 1024 * 1024

    # wait a bit, lsns shouldn't change
-    # time.sleep(5)
+    time.sleep(2)
    sk1_tli_status = sk1_http_cli.timeline_status(tenant_id, timeline_id)
    sk2_tli_status = sk2_http_cli.timeline_status(tenant_id, timeline_id)
    log.info(
@@ -1098,37 +1317,11 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder):
    # now restart safekeeper with peer recovery enabled and wait for recovery
    sk1.stop().start(extra_opts=["--peer-recovery=true"])
    wait(
-        partial(is_flush_lsn_aligned, sk1_http_cli, sk2_http_cli, tenant_id, timeline_id),
+        partial(is_flush_lsn_aligned, [sk1_http_cli, sk2_http_cli], tenant_id, timeline_id),
        "flush_lsn to get aligned",
    )

    # check that WALs are identical after recovery
-    segs = sk1.list_segments(tenant_id, timeline_id)
-    log.info(f"segs are {segs}")
-
-    (_, mismatch, not_regular) = filecmp.cmpfiles(
-        sk1.timeline_dir(tenant_id, timeline_id),
-        sk2.timeline_dir(tenant_id, timeline_id),
-        segs,
-        shallow=False,
-    )
-    log.info(
-        f"filecmp result mismatch and not regular files:\n\t mismatch={mismatch}\n\t not_regular={not_regular}"
-    )
-
-    for f in mismatch:
-        f1 = os.path.join(sk1.timeline_dir(tenant_id, timeline_id), f)
-        f2 = os.path.join(sk2.timeline_dir(tenant_id, timeline_id), f)
-        stdout_filename = "{}.filediff".format(f2)
-
-        with open(stdout_filename, "w") as stdout_f:
-            subprocess.run("xxd {} > {}.hex ".format(f1, f1), shell=True)
-            subprocess.run("xxd {} > {}.hex ".format(f2, f2), shell=True)
-
-            cmd = "diff {}.hex {}.hex".format(f1, f2)
-            subprocess.run([cmd], stdout=stdout_f, shell=True)
-
-    assert (mismatch, not_regular) == ([], [])
+    cmp_sk_wal([sk1, sk2], tenant_id, timeline_id)

    # stop one of the safekeepers which weren't recovering and insert a bit more to check we can commit
    env.safekeepers[2].stop()
@@ -1364,60 +1557,6 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder):
    show_statuses(env.safekeepers, tenant_id, timeline_id)


# We have `wal_keep_size=0`, so postgres should trim WAL once it's broadcasted
# to all safekeepers. This test checks that compute WAL can fit into a small number
# of WAL segments.
def test_wal_deleted_after_broadcast(neon_env_builder: NeonEnvBuilder):
    # used to calculate the delta in collect_stats
    last_lsn = Lsn(0)

    # returns pg_wal size in MB
    def collect_stats(endpoint: Endpoint, cur, enable_logs=True):
        nonlocal last_lsn
        assert endpoint.pgdata_dir is not None

        log.info("executing INSERT to generate WAL")
        current_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()"))
        pg_wal_size_mb = get_dir_size(os.path.join(endpoint.pgdata_dir, "pg_wal")) / 1024 / 1024
        if enable_logs:
            lsn_delta_mb = (current_lsn - last_lsn) / 1024 / 1024
            log.info(f"LSN delta: {lsn_delta_mb} MB, current WAL size: {pg_wal_size_mb} MB")
        last_lsn = current_lsn
        return pg_wal_size_mb

    # generates about ~20MB of WAL, to create at least one new segment
    def generate_wal(cur):
        cur.execute("INSERT INTO t SELECT generate_series(1,300000), 'payload'")

    neon_env_builder.num_safekeepers = 3
    env = neon_env_builder.init_start()

    env.neon_cli.create_branch("test_wal_deleted_after_broadcast")
    # Adjust checkpoint config to prevent keeping old WAL segments
    endpoint = env.endpoints.create_start(
        "test_wal_deleted_after_broadcast",
        config_lines=["min_wal_size=32MB", "max_wal_size=32MB", "log_checkpoints=on"],
    )

    pg_conn = endpoint.connect()
    cur = pg_conn.cursor()
    cur.execute("CREATE TABLE t(key int, value text)")

    collect_stats(endpoint, cur)

    # generate WAL to simulate a normal workload
    for _ in range(5):
        generate_wal(cur)
        collect_stats(endpoint, cur)

    log.info("executing checkpoint")
    cur.execute("CHECKPOINT")
    wal_size_after_checkpoint = collect_stats(endpoint, cur)

    # there shouldn't be more than 2 WAL segments (but the dir may have archive_status files)
    assert wal_size_after_checkpoint < 16 * 2.5
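The 16 * 2.5 threshold reads naturally once the default 16 MB WAL segment size is plugged in: the assert allows two full segments plus 8 MB of slack for a partially filled segment and archive_status entries, i.e. 40 MB in total. Spelled out:

WAL_SEGMENT_MB = 16  # default Postgres wal_segment_size
budget_mb = WAL_SEGMENT_MB * 2.5  # 40 MB: "no more than ~2 segments", with some slack
assert wal_size_after_checkpoint < budget_mb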


@pytest.mark.parametrize("auth_enabled", [False, True])
def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
    neon_env_builder.auth_enabled = auth_enabled

Submodule vendor/postgres-v14 (vendored) updated: 0bb356aa0c...03358bb0b5
Submodule vendor/postgres-v15 (vendored) updated: 24333abb81...a2dc225ddf
Submodule vendor/postgres-v16 (vendored) updated: 863b71572b...225071f482

vendor/revisions.json (vendored):
@@ -1,5 +1,5 @@
{
-    "postgres-v16": "863b71572bc441581efb3bbee2ad18af037be1bb",
-    "postgres-v15": "24333abb81a9ecae4541019478f0bf7d0b289df7",
-    "postgres-v14": "0bb356aa0cd1582112926fbcf0b5370222c2db6d"
+    "postgres-v16": "225071f482774943854c2eec4540757e01171557",
+    "postgres-v15": "a2dc225ddfc8cae1849aa2316f435c58f0333d8c",
+    "postgres-v14": "03358bb0b5e0d33c238710139e768db9e75cfcc8"
}
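The three submodule bumps and the revisions.json update are meant to stay in lockstep. A hypothetical consistency check, not part of this diff, could verify that the JSON matches the commits the submodules are pinned to:

import json
import subprocess

with open("vendor/revisions.json") as f:
    revisions = json.load(f)

for name, sha in revisions.items():
    # `git rev-parse HEAD:<path>` prints the commit a submodule gitlink points at.
    pinned = subprocess.check_output(
        ["git", "rev-parse", f"HEAD:vendor/{name}"], text=True
    ).strip()
    assert pinned == sha, f"{name}: revisions.json has {sha}, submodule pinned to {pinned}"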