mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-19 03:12:55 +00:00
Compare commits
234 Commits
http2
...
release-44
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c13a2f0df1 | ||
|
|
39be366fc5 | ||
|
|
6eda0a3158 | ||
|
|
306c7a1813 | ||
|
|
80be423a58 | ||
|
|
5dcfef82f2 | ||
|
|
e67b8f69c0 | ||
|
|
e546872ab4 | ||
|
|
322ea1cf7c | ||
|
|
3633742de9 | ||
|
|
079d3a37ba | ||
|
|
a46e77b476 | ||
|
|
a92702b01e | ||
|
|
8ff3253f20 | ||
|
|
04b82c92a7 | ||
|
|
e5bf423e68 | ||
|
|
60af392e45 | ||
|
|
661fc41e71 | ||
|
|
702c488f32 | ||
|
|
45c5122754 | ||
|
|
558394f710 | ||
|
|
73b0898608 | ||
|
|
e65be4c2dc | ||
|
|
40087b8164 | ||
|
|
c762b59483 | ||
|
|
5d71601ca9 | ||
|
|
a113c3e433 | ||
|
|
e81fc598f4 | ||
|
|
48b845fa76 | ||
|
|
27096858dc | ||
|
|
4430d0ae7d | ||
|
|
6e183aa0de | ||
|
|
fd6d0b7635 | ||
|
|
3710c32aae | ||
|
|
be83bee49d | ||
|
|
cf28e5922a | ||
|
|
7d384d6953 | ||
|
|
4b3b37b912 | ||
|
|
1d8d200f4d | ||
|
|
0d80d6ce18 | ||
|
|
f653ee039f | ||
|
|
e614a95853 | ||
|
|
850db4cc13 | ||
|
|
8a316b1277 | ||
|
|
4d13bae449 | ||
|
|
49377abd98 | ||
|
|
a6b2f4e54e | ||
|
|
face60d50b | ||
|
|
9768aa27f2 | ||
|
|
96b2e575e1 | ||
|
|
7222777784 | ||
|
|
5469fdede0 | ||
|
|
72aa6b9fdd | ||
|
|
ae0634b7be | ||
|
|
70711f32fa | ||
|
|
52a88af0aa | ||
|
|
b7a43bf817 | ||
|
|
dce91b33a4 | ||
|
|
23ee4f3050 | ||
|
|
46857e8282 | ||
|
|
368ab0ce54 | ||
|
|
a5987eebfd | ||
|
|
6686ede30f | ||
|
|
373c7057cc | ||
|
|
7d6ec16166 | ||
|
|
0e6fdc8a58 | ||
|
|
521438a5c6 | ||
|
|
07d7874bc8 | ||
|
|
1804111a02 | ||
|
|
cd0178efed | ||
|
|
333574be57 | ||
|
|
79a799a143 | ||
|
|
9da06af6c9 | ||
|
|
ce1753d036 | ||
|
|
67db8432b4 | ||
|
|
4e2e44e524 | ||
|
|
ed786104f3 | ||
|
|
84b74f2bd1 | ||
|
|
fec2ad6283 | ||
|
|
98eebd4682 | ||
|
|
2f74287c9b | ||
|
|
aee1bf95e3 | ||
|
|
b9de9d75ff | ||
|
|
7943b709e6 | ||
|
|
d7d066d493 | ||
|
|
e78ac22107 | ||
|
|
76a8f2bb44 | ||
|
|
8d59a8581f | ||
|
|
b1ddd01289 | ||
|
|
6eae4fc9aa | ||
|
|
765455bca2 | ||
|
|
4204960942 | ||
|
|
67345d66ea | ||
|
|
2266ee5971 | ||
|
|
b58445d855 | ||
|
|
36050e7f3d | ||
|
|
33360ed96d | ||
|
|
39a28d1108 | ||
|
|
efa6aa134f | ||
|
|
2c724e56e2 | ||
|
|
feff887c6f | ||
|
|
353d915fcf | ||
|
|
2e38098cbc | ||
|
|
a6fe5ea1ac | ||
|
|
05b0aed0c1 | ||
|
|
cd1705357d | ||
|
|
6bc7561290 | ||
|
|
fbd3ac14b5 | ||
|
|
e437787c8f | ||
|
|
3460dbf90b | ||
|
|
6b89d99677 | ||
|
|
6cc8ea86e4 | ||
|
|
e62a492d6f | ||
|
|
a475cdf642 | ||
|
|
7002c79a47 | ||
|
|
ee6cf357b4 | ||
|
|
e5c2086b5f | ||
|
|
5f1208296a | ||
|
|
88e8e473cd | ||
|
|
b0a77844f6 | ||
|
|
1baf464307 | ||
|
|
e9b8e81cea | ||
|
|
85d6194aa4 | ||
|
|
333a7a68ef | ||
|
|
6aa4e41bee | ||
|
|
840183e51f | ||
|
|
cbccc94b03 | ||
|
|
fce227df22 | ||
|
|
bd787e800f | ||
|
|
4a7704b4a3 | ||
|
|
ff1119da66 | ||
|
|
4c3ba1627b | ||
|
|
1407174fb2 | ||
|
|
ec9dcb1889 | ||
|
|
d11d781afc | ||
|
|
4e44565b71 | ||
|
|
4ed51ad33b | ||
|
|
1c1ebe5537 | ||
|
|
c19cb7f386 | ||
|
|
4b97d31b16 | ||
|
|
923ade3dd7 | ||
|
|
b04e711975 | ||
|
|
afd0a6b39a | ||
|
|
99752286d8 | ||
|
|
15df93363c | ||
|
|
bc0ab741af | ||
|
|
51d9dfeaa3 | ||
|
|
f63cb18155 | ||
|
|
0de603d88e | ||
|
|
240913912a | ||
|
|
91a4ea0de2 | ||
|
|
8608704f49 | ||
|
|
efef68ce99 | ||
|
|
8daefd24da | ||
|
|
46cc8b7982 | ||
|
|
38cd90dd0c | ||
|
|
a51b269f15 | ||
|
|
43bf6d0a0f | ||
|
|
15273a9b66 | ||
|
|
78aca668d0 | ||
|
|
acbf4148ea | ||
|
|
6508540561 | ||
|
|
a41b5244a8 | ||
|
|
2b3189be95 | ||
|
|
248563c595 | ||
|
|
14cd6ca933 | ||
|
|
eb36403e71 | ||
|
|
3c6f779698 | ||
|
|
f67f0c1c11 | ||
|
|
edb02d3299 | ||
|
|
664a69e65b | ||
|
|
478322ebf9 | ||
|
|
802f174072 | ||
|
|
47f9890bae | ||
|
|
262265daad | ||
|
|
300da5b872 | ||
|
|
7b22b5c433 | ||
|
|
ffca97bc1e | ||
|
|
cb356f3259 | ||
|
|
c85374295f | ||
|
|
4992160677 | ||
|
|
bd535b3371 | ||
|
|
d90c5a03af | ||
|
|
2d02cc9079 | ||
|
|
49ad94b99f | ||
|
|
948a217398 | ||
|
|
125381eae7 | ||
|
|
cd01bbc715 | ||
|
|
d8b5e3b88d | ||
|
|
06d25f2186 | ||
|
|
f759b561f3 | ||
|
|
ece0555600 | ||
|
|
73ea0a0b01 | ||
|
|
d8f6d6fd6f | ||
|
|
d24de169a7 | ||
|
|
0816168296 | ||
|
|
277b44d57a | ||
|
|
68c2c3880e | ||
|
|
49da498f65 | ||
|
|
2c76ba3dd7 | ||
|
|
dbe3dc69ad | ||
|
|
8e5bb3ed49 | ||
|
|
ab0be7b8da | ||
|
|
b4c55f5d24 | ||
|
|
ede70d833c | ||
|
|
70c3d18bb0 | ||
|
|
7a491f52c4 | ||
|
|
323c4ecb4f | ||
|
|
3d2466607e | ||
|
|
ed478b39f4 | ||
|
|
91585a558d | ||
|
|
93467eae1f | ||
|
|
f3aac81d19 | ||
|
|
979ad60c19 | ||
|
|
9316cb1b1f | ||
|
|
e7939a527a | ||
|
|
36d26665e1 | ||
|
|
873347f977 | ||
|
|
e814ac16f9 | ||
|
|
ad3055d386 | ||
|
|
94e03eb452 | ||
|
|
380f26ef79 | ||
|
|
3c5b7f59d7 | ||
|
|
fee89f80b5 | ||
|
|
41cce8eaf1 | ||
|
|
f88fe0218d | ||
|
|
cc856eca85 | ||
|
|
cf350c6002 | ||
|
|
0ce6b6a0a3 | ||
|
|
73f247d537 | ||
|
|
960be82183 | ||
|
|
806e5a6c19 | ||
|
|
8d5df07cce | ||
|
|
df7a9d1407 |
@@ -1,2 +0,0 @@
|
||||
[profile.default]
|
||||
slow-timeout = "1m"
|
||||
29
.github/workflows/benchmarking.yml
vendored
29
.github/workflows/benchmarking.yml
vendored
@@ -11,7 +11,7 @@ on:
|
||||
# │ │ ┌───────────── day of the month (1 - 31)
|
||||
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
|
||||
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
|
||||
- cron: '0 3 * * *' # run once a day, timezone is utc
|
||||
- cron: '0 3 * * *' # run once a day, timezone is utc
|
||||
|
||||
workflow_dispatch: # adds ability to run this manually
|
||||
inputs:
|
||||
@@ -23,21 +23,6 @@ on:
|
||||
type: boolean
|
||||
description: 'Publish perf report. If not set, the report will be published only for the main branch'
|
||||
required: false
|
||||
collect_olap_explain:
|
||||
type: boolean
|
||||
description: 'Collect EXPLAIN ANALYZE for OLAP queries. If not set, EXPLAIN ANALYZE will not be collected'
|
||||
required: false
|
||||
default: false
|
||||
collect_pg_stat_statements:
|
||||
type: boolean
|
||||
description: 'Collect pg_stat_statements for OLAP queries. If not set, pg_stat_statements will not be collected'
|
||||
required: false
|
||||
default: false
|
||||
run_AWS_RDS_AND_AURORA:
|
||||
type: boolean
|
||||
description: 'AWS-RDS and AWS-AURORA normally only run on Saturday. Set this to true to run them on every workflow_dispatch'
|
||||
required: false
|
||||
default: false
|
||||
|
||||
defaults:
|
||||
run:
|
||||
@@ -128,8 +113,6 @@ jobs:
|
||||
# - neon-captest-reuse: Reusing existing project
|
||||
# - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
|
||||
# - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
|
||||
env:
|
||||
RUN_AWS_RDS_AND_AURORA: ${{ github.event.inputs.run_AWS_RDS_AND_AURORA || 'false' }}
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }}
|
||||
@@ -169,7 +152,7 @@ jobs:
|
||||
]
|
||||
}'
|
||||
|
||||
if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
|
||||
if [ "$(date +%A)" = "Saturday" ]; then
|
||||
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" },
|
||||
{ "platform": "rds-aurora" }]')
|
||||
fi
|
||||
@@ -188,9 +171,9 @@ jobs:
|
||||
]
|
||||
}'
|
||||
|
||||
if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
|
||||
if [ "$(date +%A)" = "Saturday" ]; then
|
||||
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
|
||||
{ "platform": "rds-aurora", "scale": "10" }]')
|
||||
{ "platform": "rds-aurora", "scale": "10" }]')
|
||||
fi
|
||||
|
||||
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
|
||||
@@ -354,8 +337,6 @@ jobs:
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
DEFAULT_PG_VERSION: 14
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain }}
|
||||
TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements }}
|
||||
BUILD_TYPE: remote
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
|
||||
PLATFORM: ${{ matrix.platform }}
|
||||
@@ -418,8 +399,6 @@ jobs:
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain || 'false' }}
|
||||
TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements || 'false' }}
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
TEST_OLAP_SCALE: 10
|
||||
|
||||
|
||||
105
.github/workflows/build_and_push_docker_image.yml
vendored
105
.github/workflows/build_and_push_docker_image.yml
vendored
@@ -1,105 +0,0 @@
|
||||
name: Build and Push Docker Image
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
dockerfile-path:
|
||||
required: true
|
||||
type: string
|
||||
image-name:
|
||||
required: true
|
||||
type: string
|
||||
outputs:
|
||||
build-tools-tag:
|
||||
description: "tag generated for build tools"
|
||||
value: ${{ jobs.tag.outputs.build-tools-tag }}
|
||||
|
||||
jobs:
|
||||
check-if-build-tools-dockerfile-changed:
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
docker_file_changed: ${{ steps.dockerfile.outputs.docker_file_changed }}
|
||||
steps:
|
||||
- name: Check if Dockerfile.buildtools has changed
|
||||
id: dockerfile
|
||||
run: |
|
||||
if [[ "$GITHUB_EVENT_NAME" != "pull_request" ]]; then
|
||||
echo "docker_file_changed=false" >> $GITHUB_OUTPUT
|
||||
exit
|
||||
fi
|
||||
updated_files=$(gh pr --repo neondatabase/neon diff ${{ github.event.pull_request.number }} --name-only)
|
||||
if [[ $updated_files == *"Dockerfile.buildtools"* ]]; then
|
||||
echo "docker_file_changed=true" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
tag:
|
||||
runs-on: ubuntu-latest
|
||||
needs: [ check-if-build-tools-dockerfile-changed ]
|
||||
outputs:
|
||||
build-tools-tag: ${{steps.buildtools-tag.outputs.image_tag}}
|
||||
|
||||
steps:
|
||||
- name: Get buildtools tag
|
||||
env:
|
||||
DOCKERFILE_CHANGED: ${{ needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed }}
|
||||
run: |
|
||||
if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]] && [[ "${DOCKERFILE_CHANGED}" == "true" ]]; then
|
||||
IMAGE_TAG=$GITHUB_RUN_ID
|
||||
else
|
||||
IMAGE_TAG=pinned
|
||||
fi
|
||||
|
||||
echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT
|
||||
shell: bash
|
||||
id: buildtools-tag
|
||||
|
||||
kaniko:
|
||||
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
|
||||
needs: [ tag, check-if-build-tools-dockerfile-changed ]
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
container: gcr.io/kaniko-project/executor:v1.7.0-debug
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Configure ECR login
|
||||
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
|
||||
|
||||
- name: Kaniko build
|
||||
run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64
|
||||
|
||||
kaniko-arm:
|
||||
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
|
||||
needs: [ tag, check-if-build-tools-dockerfile-changed ]
|
||||
runs-on: [ self-hosted, dev, arm64 ]
|
||||
container: gcr.io/kaniko-project/executor:v1.7.0-debug
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Configure ECR login
|
||||
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
|
||||
|
||||
- name: Kaniko build
|
||||
run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
|
||||
|
||||
manifest:
|
||||
if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
|
||||
name: 'manifest'
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
needs:
|
||||
- tag
|
||||
- kaniko
|
||||
- kaniko-arm
|
||||
- check-if-build-tools-dockerfile-changed
|
||||
|
||||
steps:
|
||||
- name: Create manifest
|
||||
run: docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
|
||||
|
||||
- name: Push manifest
|
||||
run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}
|
||||
67
.github/workflows/build_and_test.yml
vendored
67
.github/workflows/build_and_test.yml
vendored
@@ -44,6 +44,7 @@ jobs:
|
||||
|
||||
exit 1
|
||||
|
||||
|
||||
tag:
|
||||
needs: [ check-permissions ]
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
@@ -73,19 +74,11 @@ jobs:
|
||||
shell: bash
|
||||
id: build-tag
|
||||
|
||||
build-buildtools-image:
|
||||
needs: [ check-permissions ]
|
||||
uses: ./.github/workflows/build_and_push_docker_image.yml
|
||||
with:
|
||||
dockerfile-path: Dockerfile.buildtools
|
||||
image-name: build-tools
|
||||
secrets: inherit
|
||||
|
||||
check-codestyle-python:
|
||||
needs: [ check-permissions, build-buildtools-image ]
|
||||
needs: [ check-permissions ]
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
@@ -105,20 +98,20 @@ jobs:
|
||||
- name: Install Python deps
|
||||
run: ./scripts/pysync
|
||||
|
||||
- name: Run `ruff check` to ensure code format
|
||||
run: poetry run ruff check .
|
||||
- name: Run ruff to ensure code format
|
||||
run: poetry run ruff .
|
||||
|
||||
- name: Run `ruff format` to ensure code format
|
||||
run: poetry run ruff format --check .
|
||||
- name: Run black to ensure code format
|
||||
run: poetry run black --diff --check .
|
||||
|
||||
- name: Run mypy to check types
|
||||
run: poetry run mypy .
|
||||
|
||||
check-codestyle-rust:
|
||||
needs: [ check-permissions, build-buildtools-image ]
|
||||
needs: [ check-permissions ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
@@ -182,10 +175,10 @@ jobs:
|
||||
run: cargo deny check --hide-inclusion-graph
|
||||
|
||||
build-neon:
|
||||
needs: [ check-permissions, tag, build-buildtools-image ]
|
||||
needs: [ check-permissions, tag ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
strategy:
|
||||
fail-fast: false
|
||||
@@ -339,16 +332,16 @@ jobs:
|
||||
run: |
|
||||
${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
|
||||
|
||||
- name: Run rust tests
|
||||
- name: Run cargo test
|
||||
run: |
|
||||
${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
|
||||
${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES
|
||||
|
||||
# Run separate tests for real S3
|
||||
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
|
||||
export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
|
||||
export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
|
||||
export REMOTE_STORAGE_S3_REGION=eu-central-1
|
||||
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
|
||||
${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_s3)'
|
||||
${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
|
||||
|
||||
# Run separate tests for real Azure Blob Storage
|
||||
# XXX: replace region with `eu-central-1`-like region
|
||||
@@ -358,7 +351,7 @@ jobs:
|
||||
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
|
||||
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
|
||||
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
|
||||
${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_azure)'
|
||||
${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
|
||||
|
||||
- name: Install rust binaries
|
||||
run: |
|
||||
@@ -415,10 +408,10 @@ jobs:
|
||||
uses: ./.github/actions/save-coverage-data
|
||||
|
||||
regress-tests:
|
||||
needs: [ check-permissions, build-neon, build-buildtools-image, tag ]
|
||||
needs: [ check-permissions, build-neon, tag ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
# Default shared memory is 64mb
|
||||
options: --init --shm-size=512mb
|
||||
strategy:
|
||||
@@ -454,10 +447,10 @@ jobs:
|
||||
uses: ./.github/actions/save-coverage-data
|
||||
|
||||
benchmarks:
|
||||
needs: [ check-permissions, build-neon, build-buildtools-image ]
|
||||
needs: [ check-permissions, build-neon ]
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
# Default shared memory is 64mb
|
||||
options: --init --shm-size=512mb
|
||||
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
|
||||
@@ -486,12 +479,12 @@ jobs:
|
||||
# while coverage is currently collected for the debug ones
|
||||
|
||||
create-test-report:
|
||||
needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-buildtools-image ]
|
||||
needs: [ check-permissions, regress-tests, coverage-report, benchmarks ]
|
||||
if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
|
||||
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
@@ -533,10 +526,11 @@ jobs:
|
||||
})
|
||||
|
||||
coverage-report:
|
||||
needs: [ check-permissions, regress-tests, build-buildtools-image ]
|
||||
needs: [ check-permissions, regress-tests ]
|
||||
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
strategy:
|
||||
fail-fast: false
|
||||
@@ -700,7 +694,7 @@ jobs:
|
||||
}"
|
||||
|
||||
neon-image:
|
||||
needs: [ check-permissions, build-buildtools-image, tag ]
|
||||
needs: [ check-permissions, tag ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
container: gcr.io/kaniko-project/executor:v1.9.2-debug
|
||||
defaults:
|
||||
@@ -739,7 +733,6 @@ jobs:
|
||||
--context .
|
||||
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
--build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }}
|
||||
--build-arg TAG=${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
|
||||
--destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
|
||||
@@ -750,7 +743,7 @@ jobs:
|
||||
|
||||
compute-tools-image:
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
needs: [ check-permissions, build-buildtools-image, tag ]
|
||||
needs: [ check-permissions, tag ]
|
||||
container: gcr.io/kaniko-project/executor:v1.9.2-debug
|
||||
defaults:
|
||||
run:
|
||||
@@ -785,7 +778,6 @@ jobs:
|
||||
--context .
|
||||
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
|
||||
--build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
|
||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
--dockerfile Dockerfile.compute-tools
|
||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
|
||||
@@ -796,7 +788,7 @@ jobs:
|
||||
run: rm -rf ~/.ecr
|
||||
|
||||
compute-node-image:
|
||||
needs: [ check-permissions, build-buildtools-image, tag ]
|
||||
needs: [ check-permissions, tag ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
container:
|
||||
image: gcr.io/kaniko-project/executor:v1.9.2-debug
|
||||
@@ -844,7 +836,6 @@ jobs:
|
||||
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
--build-arg PG_VERSION=${{ matrix.version }}
|
||||
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
|
||||
--build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
|
||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
--dockerfile Dockerfile.compute-node
|
||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
@@ -866,7 +857,7 @@ jobs:
|
||||
run:
|
||||
shell: sh -eu {0}
|
||||
env:
|
||||
VM_BUILDER_VERSION: v0.21.0
|
||||
VM_BUILDER_VERSION: v0.19.0
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
|
||||
2
.github/workflows/neon_extra_builds.yml
vendored
2
.github/workflows/neon_extra_builds.yml
vendored
@@ -218,7 +218,7 @@ jobs:
|
||||
|
||||
# Run separate tests for real S3
|
||||
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
|
||||
export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
|
||||
export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
|
||||
export REMOTE_STORAGE_S3_REGION=eu-central-1
|
||||
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
|
||||
cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
|
||||
|
||||
130
.github/workflows/update_build_tools_image.yml
vendored
130
.github/workflows/update_build_tools_image.yml
vendored
@@ -1,130 +0,0 @@
|
||||
name: 'Update build tools image tag'
|
||||
|
||||
# This workflow it used to update tag of build tools in ECR.
|
||||
# The most common use case is adding/moving `pinned` tag to `${GITHUB_RUN_IT}` image.
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
from-tag:
|
||||
description: 'Source tag'
|
||||
required: true
|
||||
type: string
|
||||
to-tag:
|
||||
description: 'Destination tag'
|
||||
required: true
|
||||
type: string
|
||||
default: 'pinned'
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euo pipefail {0}
|
||||
|
||||
env:
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
tag-image:
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container: golang:1.19-bullseye
|
||||
|
||||
env:
|
||||
IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
|
||||
FROM_TAG: ${{ inputs.from-tag }}
|
||||
TO_TAG: ${{ inputs.to-tag }}
|
||||
outputs:
|
||||
next-digest-buildtools: ${{ steps.next-digest.outputs.next-digest-buildtools }}
|
||||
prev-digest-buildtools: ${{ steps.prev-digest.outputs.prev-digest-buildtools }}
|
||||
|
||||
steps:
|
||||
- name: Install Crane & ECR helper
|
||||
run: |
|
||||
go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
|
||||
go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
|
||||
|
||||
- name: Configure ECR login
|
||||
run: |
|
||||
mkdir /github/home/.docker/
|
||||
echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
|
||||
|
||||
- name: Get source image digest
|
||||
id: next-digest
|
||||
run: |
|
||||
NEXT_DIGEST=$(crane digest ${IMAGE}:${FROM_TAG} || true)
|
||||
if [ -z "${NEXT_DIGEST}" ]; then
|
||||
echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Current ${IMAGE}@${FROM_TAG} image is ${IMAGE}@${NEXT_DIGEST}"
|
||||
echo "next-digest-buildtools=$NEXT_DIGEST" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Get destination image digest (if already exists)
|
||||
id: prev-digest
|
||||
run: |
|
||||
PREV_DIGEST=$(crane digest ${IMAGE}:${TO_TAG} || true)
|
||||
if [ -z "${PREV_DIGEST}" ]; then
|
||||
echo >&2 "Image ${IMAGE}:${TO_TAG} does not exist (it's ok)"
|
||||
else
|
||||
echo >&2 "Current ${IMAGE}@${TO_TAG} image is ${IMAGE}@${PREV_DIGEST}"
|
||||
|
||||
echo "prev-digest-buildtools=$PREV_DIGEST" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Tag image
|
||||
run: |
|
||||
crane tag "${IMAGE}:${FROM_TAG}" "${TO_TAG}"
|
||||
|
||||
rollback-tag-image:
|
||||
needs: tag-image
|
||||
if: ${{ !success() }}
|
||||
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container: golang:1.19-bullseye
|
||||
|
||||
env:
|
||||
IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
|
||||
FROM_TAG: ${{ inputs.from-tag }}
|
||||
TO_TAG: ${{ inputs.to-tag }}
|
||||
|
||||
steps:
|
||||
- name: Install Crane & ECR helper
|
||||
run: |
|
||||
go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
|
||||
go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
|
||||
|
||||
- name: Configure ECR login
|
||||
run: |
|
||||
mkdir /github/home/.docker/
|
||||
echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
|
||||
|
||||
- name: Restore previous tag if needed
|
||||
run: |
|
||||
NEXT_DIGEST="${{ needs.tag-image.outputs.next-digest-buildtools }}"
|
||||
PREV_DIGEST="${{ needs.tag-image.outputs.prev-digest-buildtools }}"
|
||||
|
||||
if [ -z "${NEXT_DIGEST}" ]; then
|
||||
echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist, nothing to rollback"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
if [ -z "${PREV_DIGEST}" ]; then
|
||||
# I guess we should delete the tag here/untag the image, but crane does not support it
|
||||
# - https://github.com/google/go-containerregistry/issues/999
|
||||
|
||||
echo >&2 "Image ${IMAGE}:${TO_TAG} did not exist, but it was created by the job, no need to rollback"
|
||||
|
||||
exit 0
|
||||
fi
|
||||
|
||||
CURRENT_DIGEST=$(crane digest "${IMAGE}:${TO_TAG}")
|
||||
if [ "${CURRENT_DIGEST}" == "${NEXT_DIGEST}" ]; then
|
||||
crane tag "${IMAGE}@${PREV_DIGEST}" "${TO_TAG}"
|
||||
|
||||
echo >&2 "Successfully restored ${TO_TAG} tag from ${IMAGE}@${CURRENT_DIGEST} to ${IMAGE}@${PREV_DIGEST}"
|
||||
else
|
||||
echo >&2 "Image ${IMAGE}:${TO_TAG}@${CURRENT_DIGEST} is not required to be restored"
|
||||
fi
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -6,7 +6,6 @@ __pycache__/
|
||||
test_output/
|
||||
.vscode
|
||||
.idea
|
||||
neon.iml
|
||||
/.neon
|
||||
/integration_tests/.neon
|
||||
|
||||
|
||||
@@ -70,17 +70,3 @@ We're using the following approach to make it work:
|
||||
- The label gets removed automatically, so to run CI again with new changes, the label should be added again (after the review)
|
||||
|
||||
For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml)
|
||||
|
||||
## How do I add the "pinned" tag to an buildtools image?
|
||||
We use the `pinned` tag for `Dockerfile.buildtools` build images in our CI/CD setup, currently adding the `pinned` tag is a manual operation.
|
||||
|
||||
You can call it from GitHub UI: https://github.com/neondatabase/neon/actions/workflows/update_build_tools_image.yml,
|
||||
or using GitHub CLI:
|
||||
|
||||
```bash
|
||||
gh workflow -R neondatabase/neon run update_build_tools_image.yml \
|
||||
-f from-tag=6254913013 \
|
||||
-f to-tag=pinned \
|
||||
|
||||
# Default `-f to-tag` is `pinned`, so the parameter can be omitted.
|
||||
```
|
||||
378
Cargo.lock
generated
378
Cargo.lock
generated
@@ -30,8 +30,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cd7d5a2cecb58716e47d67d5703a249964b14c7be1ec3cad3affc295b2d1c35d"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"const-random",
|
||||
"getrandom 0.2.11",
|
||||
"once_cell",
|
||||
"version_check",
|
||||
"zerocopy",
|
||||
@@ -52,12 +50,6 @@ version = "0.2.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5"
|
||||
|
||||
[[package]]
|
||||
name = "android-tzdata"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
|
||||
|
||||
[[package]]
|
||||
name = "android_system_properties"
|
||||
version = "0.1.5"
|
||||
@@ -198,9 +190,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "async-compression"
|
||||
version = "0.4.5"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bc2d0cfb2a7388d34f590e76686704c494ed7aaceed62ee1ba35cbf363abc2a5"
|
||||
checksum = "5b0122885821398cc923ece939e24d1056a2384ee719432397fa9db87230ff11"
|
||||
dependencies = [
|
||||
"flate2",
|
||||
"futures-core",
|
||||
@@ -255,12 +247,6 @@ dependencies = [
|
||||
"syn 2.0.32",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "atomic"
|
||||
version = "0.5.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c59bdb34bc650a32731b31bd8f0829cc15d24a708ee31559e0bb34f2bc320cba"
|
||||
|
||||
[[package]]
|
||||
name = "atomic-polyfill"
|
||||
version = "1.0.2"
|
||||
@@ -1025,17 +1011,17 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "chrono"
|
||||
version = "0.4.31"
|
||||
version = "0.4.24"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38"
|
||||
checksum = "4e3c5919066adf22df73762e50cffcde3a758f2a848b113b586d1f86728b673b"
|
||||
dependencies = [
|
||||
"android-tzdata",
|
||||
"iana-time-zone",
|
||||
"js-sys",
|
||||
"num-integer",
|
||||
"num-traits",
|
||||
"serde",
|
||||
"wasm-bindgen",
|
||||
"windows-targets 0.48.0",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1134,20 +1120,6 @@ version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7"
|
||||
|
||||
[[package]]
|
||||
name = "combine"
|
||||
version = "4.6.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "35ed6e9d84f0b51a7f52daf1c7d71dd136fd7a3f41a8462b8cdb8c78d920fad4"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"futures-core",
|
||||
"memchr",
|
||||
"pin-project-lite",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "comfy-table"
|
||||
version = "6.1.4"
|
||||
@@ -1189,7 +1161,6 @@ dependencies = [
|
||||
"flate2",
|
||||
"futures",
|
||||
"hyper",
|
||||
"nix 0.26.2",
|
||||
"notify",
|
||||
"num_cpus",
|
||||
"opentelemetry",
|
||||
@@ -1197,10 +1168,8 @@ dependencies = [
|
||||
"regex",
|
||||
"remote_storage",
|
||||
"reqwest",
|
||||
"rust-ini",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"signal-hook",
|
||||
"tar",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
@@ -1232,26 +1201,6 @@ version = "0.9.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "28c122c3980598d243d63d9a704629a2d748d101f278052ff068be5a4423ab6f"
|
||||
|
||||
[[package]]
|
||||
name = "const-random"
|
||||
version = "0.1.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5aaf16c9c2c612020bcfd042e170f6e32de9b9d75adb5277cdbbd2e2c8c8299a"
|
||||
dependencies = [
|
||||
"const-random-macro",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "const-random-macro"
|
||||
version = "0.1.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e"
|
||||
dependencies = [
|
||||
"getrandom 0.2.11",
|
||||
"once_cell",
|
||||
"tiny-keccak",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "const_fn"
|
||||
version = "0.4.9"
|
||||
@@ -1484,12 +1433,6 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crunchy"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7"
|
||||
|
||||
[[package]]
|
||||
name = "crypto-bigint"
|
||||
version = "0.4.9"
|
||||
@@ -1632,15 +1575,6 @@ dependencies = [
|
||||
"syn 2.0.32",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dlv-list"
|
||||
version = "0.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "442039f5147480ba31067cb00ada1adae6892028e40e45fc5de7b7df6dcc1b5f"
|
||||
dependencies = [
|
||||
"const-random",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dyn-clone"
|
||||
version = "1.0.14"
|
||||
@@ -2172,20 +2106,6 @@ dependencies = [
|
||||
"hashbrown 0.13.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hdrhistogram"
|
||||
version = "7.5.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "765c9198f173dd59ce26ff9f95ef0aafd0a0fe01fb9d72841bc5066a4c06511d"
|
||||
dependencies = [
|
||||
"base64 0.21.1",
|
||||
"byteorder",
|
||||
"crossbeam-channel",
|
||||
"flate2",
|
||||
"nom",
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "heapless"
|
||||
version = "0.8.0"
|
||||
@@ -2389,6 +2309,19 @@ dependencies = [
|
||||
"tokio-native-tls",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hyper-tungstenite"
|
||||
version = "0.11.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7cc7dcb1ab67cd336f468a12491765672e61a3b6b148634dbfe2fe8acd3fe7d9"
|
||||
dependencies = [
|
||||
"hyper",
|
||||
"pin-project-lite",
|
||||
"tokio",
|
||||
"tokio-tungstenite",
|
||||
"tungstenite",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "iana-time-zone"
|
||||
version = "0.1.56"
|
||||
@@ -2490,12 +2423,6 @@ dependencies = [
|
||||
"web-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "integer-encoding"
|
||||
version = "3.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02"
|
||||
|
||||
[[package]]
|
||||
name = "io-lifetimes"
|
||||
version = "1.0.11"
|
||||
@@ -2560,14 +2487,13 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "jsonwebtoken"
|
||||
version = "9.2.0"
|
||||
version = "8.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5c7ea04a7c5c055c175f189b6dc6ba036fd62306b58c66c9f6389036c503a3f4"
|
||||
checksum = "6971da4d9c3aa03c3d8f3ff0f4155b534aad021292003895a469716b2a230378"
|
||||
dependencies = [
|
||||
"base64 0.21.1",
|
||||
"js-sys",
|
||||
"pem 3.0.3",
|
||||
"ring 0.17.6",
|
||||
"pem 1.1.1",
|
||||
"ring 0.16.20",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"simple_asn1",
|
||||
@@ -2859,19 +2785,6 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num"
|
||||
version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b05180d69e3da0e530ba2a1dae5110317e49e3b7f3d41be227dc5f92e49ee7af"
|
||||
dependencies = [
|
||||
"num-complex",
|
||||
"num-integer",
|
||||
"num-iter",
|
||||
"num-rational",
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-bigint"
|
||||
version = "0.4.3"
|
||||
@@ -2883,15 +2796,6 @@ dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-complex"
|
||||
version = "0.4.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1ba157ca0885411de85d6ca030ba7e2a83a28636056c7c699b07c8b6f7383214"
|
||||
dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-integer"
|
||||
version = "0.1.45"
|
||||
@@ -2902,28 +2806,6 @@ dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-iter"
|
||||
version = "0.1.43"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7d03e6c028c5dc5cac6e2dec0efda81fc887605bb3d884578bb6d6bf7514e252"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"num-integer",
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-rational"
|
||||
version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"num-integer",
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-traits"
|
||||
version = "0.2.15"
|
||||
@@ -3146,25 +3028,6 @@ dependencies = [
|
||||
"tokio-stream",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ordered-float"
|
||||
version = "2.10.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c"
|
||||
dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ordered-multimap"
|
||||
version = "0.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a4d6a8c22fc714f0c2373e6091bf6f5e9b37b1bc0b1184874b7e0a4e303d318f"
|
||||
dependencies = [
|
||||
"dlv-list",
|
||||
"hashbrown 0.14.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "os_info"
|
||||
version = "3.7.0"
|
||||
@@ -3193,30 +3056,6 @@ dependencies = [
|
||||
"sha2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pagebench"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"camino",
|
||||
"clap",
|
||||
"futures",
|
||||
"hdrhistogram",
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"pageserver",
|
||||
"pageserver_api",
|
||||
"pageserver_client",
|
||||
"rand 0.8.5",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pagectl"
|
||||
version = "0.1.0"
|
||||
@@ -3306,7 +3145,6 @@ dependencies = [
|
||||
"tokio",
|
||||
"tokio-io-timeout",
|
||||
"tokio-postgres",
|
||||
"tokio-stream",
|
||||
"tokio-tar",
|
||||
"tokio-util",
|
||||
"toml_edit",
|
||||
@@ -3344,19 +3182,11 @@ dependencies = [
|
||||
name = "pageserver_client"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
"bytes",
|
||||
"futures",
|
||||
"pageserver_api",
|
||||
"postgres",
|
||||
"reqwest",
|
||||
"serde",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
@@ -3415,35 +3245,6 @@ dependencies = [
|
||||
"windows-targets 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parquet"
|
||||
version = "49.0.0"
|
||||
source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"hashbrown 0.14.0",
|
||||
"num",
|
||||
"num-bigint",
|
||||
"paste",
|
||||
"seq-macro",
|
||||
"thrift",
|
||||
"twox-hash",
|
||||
"zstd",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parquet_derive"
|
||||
version = "49.0.0"
|
||||
source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9"
|
||||
dependencies = [
|
||||
"parquet",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.32",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "password-hash"
|
||||
version = "0.5.0"
|
||||
@@ -3481,19 +3282,18 @@ checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"
|
||||
|
||||
[[package]]
|
||||
name = "pem"
|
||||
version = "2.0.1"
|
||||
version = "1.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6b13fe415cdf3c8e44518e18a7c95a13431d9bdf6d15367d82b23c377fdd441a"
|
||||
checksum = "a8835c273a76a90455d7344889b0964598e3316e2a79ede8e36f16bdcf2228b8"
|
||||
dependencies = [
|
||||
"base64 0.21.1",
|
||||
"serde",
|
||||
"base64 0.13.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pem"
|
||||
version = "3.0.3"
|
||||
version = "2.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1b8fcc794035347fb64beda2d3b462595dd2753e3f268d89c5aae77e8cf2c310"
|
||||
checksum = "6b13fe415cdf3c8e44518e18a7c95a13431d9bdf6d15367d82b23c377fdd441a"
|
||||
dependencies = [
|
||||
"base64 0.21.1",
|
||||
"serde",
|
||||
@@ -3867,8 +3667,6 @@ dependencies = [
|
||||
"base64 0.13.1",
|
||||
"bstr",
|
||||
"bytes",
|
||||
"camino",
|
||||
"camino-tempfile",
|
||||
"chrono",
|
||||
"clap",
|
||||
"consumption_metrics",
|
||||
@@ -3882,6 +3680,7 @@ dependencies = [
|
||||
"hostname",
|
||||
"humantime",
|
||||
"hyper",
|
||||
"hyper-tungstenite",
|
||||
"ipnet",
|
||||
"itertools",
|
||||
"md5",
|
||||
@@ -3890,8 +3689,6 @@ dependencies = [
|
||||
"once_cell",
|
||||
"opentelemetry",
|
||||
"parking_lot 0.12.1",
|
||||
"parquet",
|
||||
"parquet_derive",
|
||||
"pbkdf2",
|
||||
"pin-project-lite",
|
||||
"postgres-native-tls",
|
||||
@@ -3901,9 +3698,7 @@ dependencies = [
|
||||
"prometheus",
|
||||
"rand 0.8.5",
|
||||
"rcgen",
|
||||
"redis",
|
||||
"regex",
|
||||
"remote_storage",
|
||||
"reqwest",
|
||||
"reqwest-middleware",
|
||||
"reqwest-retry",
|
||||
@@ -3927,13 +3722,11 @@ dependencies = [
|
||||
"tokio-postgres",
|
||||
"tokio-postgres-rustls",
|
||||
"tokio-rustls",
|
||||
"tokio-tungstenite",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
"tracing-opentelemetry",
|
||||
"tracing-subscriber",
|
||||
"tracing-utils",
|
||||
"tungstenite",
|
||||
"url",
|
||||
"utils",
|
||||
"uuid",
|
||||
@@ -4066,32 +3859,6 @@ dependencies = [
|
||||
"yasna",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "redis"
|
||||
version = "0.24.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c580d9cbbe1d1b479e8d67cf9daf6a62c957e6846048408b80b43ac3f6af84cd"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"bytes",
|
||||
"combine",
|
||||
"futures-util",
|
||||
"itoa",
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
"rustls",
|
||||
"rustls-native-certs",
|
||||
"rustls-pemfile",
|
||||
"rustls-webpki 0.101.7",
|
||||
"ryu",
|
||||
"sha1_smol",
|
||||
"socket2 0.4.9",
|
||||
"tokio",
|
||||
"tokio-rustls",
|
||||
"tokio-util",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "redox_syscall"
|
||||
version = "0.2.16"
|
||||
@@ -4402,16 +4169,6 @@ dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rust-ini"
|
||||
version = "0.20.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3e0698206bcb8882bf2a9ecb4c1e7785db57ff052297085a6efd4fe42302068a"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"ordered-multimap",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustc-demangle"
|
||||
version = "0.1.23"
|
||||
@@ -4543,14 +4300,12 @@ dependencies = [
|
||||
"async-stream",
|
||||
"aws-config",
|
||||
"aws-sdk-s3",
|
||||
"aws-smithy-async",
|
||||
"bincode",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"clap",
|
||||
"crc32c",
|
||||
"either",
|
||||
"futures",
|
||||
"futures-util",
|
||||
"hex",
|
||||
"histogram",
|
||||
@@ -4589,7 +4344,6 @@ dependencies = [
|
||||
"clap",
|
||||
"const_format",
|
||||
"crc32c",
|
||||
"fail",
|
||||
"fs2",
|
||||
"futures",
|
||||
"git-version",
|
||||
@@ -4613,7 +4367,6 @@ dependencies = [
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_with",
|
||||
"sha2",
|
||||
"signal-hook",
|
||||
"storage_broker",
|
||||
"thiserror",
|
||||
@@ -4666,12 +4419,12 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
|
||||
|
||||
[[package]]
|
||||
name = "sct"
|
||||
version = "0.7.1"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414"
|
||||
checksum = "d53dcdb7c9f8158937a7981b48accfd39a43af418591a5d008c7b22b5e1b7ca4"
|
||||
dependencies = [
|
||||
"ring 0.17.6",
|
||||
"untrusted 0.9.0",
|
||||
"ring 0.16.20",
|
||||
"untrusted 0.7.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4820,12 +4573,6 @@ dependencies = [
|
||||
"uuid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "seq-macro"
|
||||
version = "0.3.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
|
||||
|
||||
[[package]]
|
||||
name = "serde"
|
||||
version = "1.0.183"
|
||||
@@ -4948,12 +4695,6 @@ dependencies = [
|
||||
"digest",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sha1_smol"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ae1a47186c03a32177042e55dbc5fd5aee900b8e0069a8d70fba96a9375cd012"
|
||||
|
||||
[[package]]
|
||||
name = "sha2"
|
||||
version = "0.10.6"
|
||||
@@ -5352,17 +5093,6 @@ dependencies = [
|
||||
"once_cell",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thrift"
|
||||
version = "0.17.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
"integer-encoding",
|
||||
"ordered-float",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "time"
|
||||
version = "0.3.21"
|
||||
@@ -5393,15 +5123,6 @@ dependencies = [
|
||||
"time-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tiny-keccak"
|
||||
version = "2.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237"
|
||||
dependencies = [
|
||||
"crunchy",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tinytemplate"
|
||||
version = "1.2.1"
|
||||
@@ -5907,16 +5628,6 @@ dependencies = [
|
||||
"utf-8",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "twox-hash"
|
||||
version = "1.6.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"static_assertions",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "typenum"
|
||||
version = "1.16.0"
|
||||
@@ -6055,7 +5766,6 @@ dependencies = [
|
||||
"chrono",
|
||||
"const_format",
|
||||
"criterion",
|
||||
"fail",
|
||||
"futures",
|
||||
"heapless",
|
||||
"hex",
|
||||
@@ -6094,11 +5804,10 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "uuid"
|
||||
version = "1.6.1"
|
||||
version = "1.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5e395fcf16a7a3d8127ec99782007af141946b4795001f876d54fb0d55978560"
|
||||
checksum = "345444e32442451b267fc254ae85a209c64be56d2890e601a0c37ff0c3c5ecd2"
|
||||
dependencies = [
|
||||
"atomic",
|
||||
"getrandom 0.2.11",
|
||||
"serde",
|
||||
]
|
||||
@@ -6581,7 +6290,6 @@ dependencies = [
|
||||
"futures-io",
|
||||
"futures-sink",
|
||||
"futures-util",
|
||||
"getrandom 0.2.11",
|
||||
"hex",
|
||||
"hmac",
|
||||
"hyper",
|
||||
@@ -6593,8 +6301,6 @@ dependencies = [
|
||||
"num-bigint",
|
||||
"num-integer",
|
||||
"num-traits",
|
||||
"once_cell",
|
||||
"parquet",
|
||||
"prost",
|
||||
"rand 0.8.5",
|
||||
"regex",
|
||||
@@ -6697,28 +6403,30 @@ checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9"
|
||||
|
||||
[[package]]
|
||||
name = "zstd"
|
||||
version = "0.13.0"
|
||||
version = "0.12.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bffb3309596d527cfcba7dfc6ed6052f1d39dfbd7c867aa2e865e4a449c10110"
|
||||
checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c"
|
||||
dependencies = [
|
||||
"zstd-safe",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zstd-safe"
|
||||
version = "7.0.0"
|
||||
version = "6.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "43747c7422e2924c11144d5229878b98180ef8b06cca4ab5af37afc8a8d8ea3e"
|
||||
checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"zstd-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zstd-sys"
|
||||
version = "2.0.9+zstd.1.5.5"
|
||||
version = "2.0.8+zstd.1.5.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9e16efa8a874a0481a574084d34cc26fdb3b99627480f785888deb6386506656"
|
||||
checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
"pkg-config",
|
||||
]
|
||||
|
||||
16
Cargo.toml
16
Cargo.toml
@@ -6,7 +6,6 @@ members = [
|
||||
"pageserver",
|
||||
"pageserver/ctl",
|
||||
"pageserver/client",
|
||||
"pageserver/pagebench",
|
||||
"proxy",
|
||||
"safekeeper",
|
||||
"storage_broker",
|
||||
@@ -80,7 +79,6 @@ futures-util = "0.3"
|
||||
git-version = "0.3"
|
||||
hashbrown = "0.13"
|
||||
hashlink = "0.8.1"
|
||||
hdrhistogram = "7.5.2"
|
||||
hex = "0.4"
|
||||
hex-literal = "0.4"
|
||||
hmac = "0.12.1"
|
||||
@@ -89,10 +87,11 @@ http-types = { version = "2", default-features = false }
|
||||
humantime = "2.1"
|
||||
humantime-serde = "1.1.1"
|
||||
hyper = "0.14"
|
||||
hyper-tungstenite = "0.11"
|
||||
inotify = "0.10.2"
|
||||
ipnet = "2.9.0"
|
||||
itertools = "0.10"
|
||||
jsonwebtoken = "9"
|
||||
jsonwebtoken = "8"
|
||||
libc = "0.2"
|
||||
md5 = "0.7.0"
|
||||
memoffset = "0.8"
|
||||
@@ -106,14 +105,11 @@ opentelemetry = "0.19.0"
|
||||
opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
||||
opentelemetry-semantic-conventions = "0.11.0"
|
||||
parking_lot = "0.12"
|
||||
parquet = { version = "49.0.0", default-features = false, features = ["zstd"] }
|
||||
parquet_derive = "49.0.0"
|
||||
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
|
||||
pin-project-lite = "0.2"
|
||||
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
|
||||
prost = "0.11"
|
||||
rand = "0.8"
|
||||
redis = { version = "0.24.0", features = ["tokio-rustls-comp", "keep-alive"] }
|
||||
regex = "1.10.2"
|
||||
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
|
||||
reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] }
|
||||
@@ -155,7 +151,6 @@ tokio-rustls = "0.24"
|
||||
tokio-stream = "0.1"
|
||||
tokio-tar = "0.3"
|
||||
tokio-util = { version = "0.7.10", features = ["io", "rt"] }
|
||||
tokio-tungstenite = "0.20"
|
||||
toml = "0.7"
|
||||
toml_edit = "0.19"
|
||||
tonic = {version = "0.9", features = ["tls", "tls-roots"]}
|
||||
@@ -163,9 +158,8 @@ tracing = "0.1"
|
||||
tracing-error = "0.2.0"
|
||||
tracing-opentelemetry = "0.19.0"
|
||||
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
|
||||
tungstenite = "0.20"
|
||||
url = "2.2"
|
||||
uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
|
||||
uuid = { version = "1.2", features = ["v4", "serde"] }
|
||||
walkdir = "2.3.2"
|
||||
webpki-roots = "0.25"
|
||||
x509-parser = "0.15"
|
||||
@@ -219,10 +213,6 @@ tonic-build = "0.9"
|
||||
# TODO: we should probably fork `tokio-postgres-rustls` instead.
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
|
||||
# bug fixes for UUID
|
||||
parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
|
||||
parquet_derive = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
|
||||
|
||||
################# Binary contents sections
|
||||
|
||||
[profile.release]
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
### By default, the binaries inside the image have some mock parameters and can start, but are not intended to be used
|
||||
### inside this image in the real deployments.
|
||||
ARG REPOSITORY=neondatabase
|
||||
ARG IMAGE=build-tools
|
||||
ARG IMAGE=rust
|
||||
ARG TAG=pinned
|
||||
|
||||
# Build Postgres
|
||||
|
||||
@@ -1,166 +0,0 @@
|
||||
FROM debian:bullseye-slim
|
||||
|
||||
# Add nonroot user
|
||||
RUN useradd -ms /bin/bash nonroot -b /home
|
||||
SHELL ["/bin/bash", "-c"]
|
||||
|
||||
# System deps
|
||||
RUN set -e \
|
||||
&& apt update \
|
||||
&& apt install -y \
|
||||
autoconf \
|
||||
automake \
|
||||
bison \
|
||||
build-essential \
|
||||
ca-certificates \
|
||||
cmake \
|
||||
curl \
|
||||
flex \
|
||||
git \
|
||||
gnupg \
|
||||
gzip \
|
||||
jq \
|
||||
libcurl4-openssl-dev \
|
||||
libbz2-dev \
|
||||
libffi-dev \
|
||||
liblzma-dev \
|
||||
libncurses5-dev \
|
||||
libncursesw5-dev \
|
||||
libpq-dev \
|
||||
libreadline-dev \
|
||||
libseccomp-dev \
|
||||
libsqlite3-dev \
|
||||
libssl-dev \
|
||||
libstdc++-10-dev \
|
||||
libtool \
|
||||
libxml2-dev \
|
||||
libxmlsec1-dev \
|
||||
libxxhash-dev \
|
||||
lsof \
|
||||
make \
|
||||
netcat \
|
||||
net-tools \
|
||||
openssh-client \
|
||||
parallel \
|
||||
pkg-config \
|
||||
unzip \
|
||||
wget \
|
||||
xz-utils \
|
||||
zlib1g-dev \
|
||||
zstd \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||
|
||||
# protobuf-compiler (protoc)
|
||||
ENV PROTOC_VERSION 25.1
|
||||
RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \
|
||||
&& unzip -q protoc.zip -d protoc \
|
||||
&& mv protoc/bin/protoc /usr/local/bin/protoc \
|
||||
&& mv protoc/include/google /usr/local/include/google \
|
||||
&& rm -rf protoc.zip protoc
|
||||
|
||||
# LLVM
|
||||
ENV LLVM_VERSION=17
|
||||
RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
|
||||
&& echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
|
||||
&& apt update \
|
||||
&& apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \
|
||||
&& bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||
|
||||
# PostgreSQL 14
|
||||
RUN curl -fsSL 'https://www.postgresql.org/media/keys/ACCC4CF8.asc' | apt-key add - \
|
||||
&& echo 'deb http://apt.postgresql.org/pub/repos/apt bullseye-pgdg main' > /etc/apt/sources.list.d/pgdg.list \
|
||||
&& apt update \
|
||||
&& apt install -y postgresql-client-14 \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||
|
||||
# AWS CLI
|
||||
RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \
|
||||
&& unzip -q awscliv2.zip \
|
||||
&& ./aws/install \
|
||||
&& rm awscliv2.zip
|
||||
|
||||
# Mold: A Modern Linker
|
||||
ENV MOLD_VERSION v2.4.0
|
||||
RUN set -e \
|
||||
&& git clone https://github.com/rui314/mold.git \
|
||||
&& mkdir mold/build \
|
||||
&& cd mold/build \
|
||||
&& git checkout ${MOLD_VERSION} \
|
||||
&& cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=clang++ .. \
|
||||
&& cmake --build . -j $(nproc) \
|
||||
&& cmake --install . \
|
||||
&& cd .. \
|
||||
&& rm -rf mold
|
||||
|
||||
# LCOV
|
||||
# Build lcov from a fork:
|
||||
# It includes several bug fixes on top of the v2.0 release (https://github.com/linux-test-project/lcov/compare/v2.0...master)
|
||||
# And patches from us:
|
||||
# - Generates json file with code coverage summary (https://github.com/neondatabase/lcov/commit/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz)
|
||||
RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')"; done \
|
||||
&& wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \
|
||||
&& echo "61a22a62e20908b8b9e27d890bd0ea31f567a7b9668065589266371dcbca0992 lcov.tar.gz" | sha256sum --check \
|
||||
&& mkdir -p lcov && tar -xzf lcov.tar.gz -C lcov --strip-components=1 \
|
||||
&& cd lcov \
|
||||
&& make install \
|
||||
&& rm -rf ../lcov.tar.gz
|
||||
|
||||
# Switch to nonroot user
|
||||
USER nonroot:nonroot
|
||||
WORKDIR /home/nonroot
|
||||
|
||||
# Python
|
||||
ENV PYTHON_VERSION=3.9.2 \
|
||||
PYENV_ROOT=/home/nonroot/.pyenv \
|
||||
PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH
|
||||
RUN set -e \
|
||||
&& cd $HOME \
|
||||
&& curl -sSO https://raw.githubusercontent.com/pyenv/pyenv-installer/master/bin/pyenv-installer \
|
||||
&& chmod +x pyenv-installer \
|
||||
&& ./pyenv-installer \
|
||||
&& export PYENV_ROOT=/home/nonroot/.pyenv \
|
||||
&& export PATH="$PYENV_ROOT/bin:$PATH" \
|
||||
&& export PATH="$PYENV_ROOT/shims:$PATH" \
|
||||
&& pyenv install ${PYTHON_VERSION} \
|
||||
&& pyenv global ${PYTHON_VERSION} \
|
||||
&& python --version \
|
||||
&& pip install --upgrade pip \
|
||||
&& pip --version \
|
||||
&& pip install pipenv wheel poetry
|
||||
|
||||
# Switch to nonroot user (again)
|
||||
USER nonroot:nonroot
|
||||
WORKDIR /home/nonroot
|
||||
|
||||
# Rust
|
||||
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
|
||||
ENV RUSTC_VERSION=1.75.0
|
||||
ENV RUSTUP_HOME="/home/nonroot/.rustup"
|
||||
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
|
||||
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
|
||||
chmod +x rustup-init && \
|
||||
./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \
|
||||
rm rustup-init && \
|
||||
export PATH="$HOME/.cargo/bin:$PATH" && \
|
||||
. "$HOME/.cargo/env" && \
|
||||
cargo --version && rustup --version && \
|
||||
rustup component add llvm-tools-preview rustfmt clippy && \
|
||||
cargo install --git https://github.com/paritytech/cachepot && \
|
||||
cargo install rustfilt && \
|
||||
cargo install cargo-hakari && \
|
||||
cargo install cargo-deny && \
|
||||
cargo install cargo-hack && \
|
||||
cargo install cargo-nextest && \
|
||||
rm -rf /home/nonroot/.cargo/registry && \
|
||||
rm -rf /home/nonroot/.cargo/git
|
||||
ENV RUSTC_WRAPPER=cachepot
|
||||
|
||||
# Show versions
|
||||
RUN whoami \
|
||||
&& python --version \
|
||||
&& pip --version \
|
||||
&& cargo --version --verbose \
|
||||
&& rustup --version --verbose \
|
||||
&& rustc --version --verbose \
|
||||
&& clang --version
|
||||
@@ -1,6 +1,6 @@
|
||||
ARG PG_VERSION
|
||||
ARG REPOSITORY=neondatabase
|
||||
ARG IMAGE=build-tools
|
||||
ARG IMAGE=rust
|
||||
ARG TAG=pinned
|
||||
ARG BUILD_TAG
|
||||
|
||||
@@ -48,29 +48,7 @@ RUN cd postgres && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/refint.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control && \
|
||||
# We need to grant EXECUTE on pg_stat_statements_reset() to neon_superuser.
|
||||
# In vanilla postgres this function is limited to Postgres role superuser.
|
||||
# In neon we have neon_superuser role that is not a superuser but replaces superuser in some cases.
|
||||
# We could add the additional grant statements to the postgres repository but it would be hard to maintain,
|
||||
# whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork,
|
||||
# so we do it here.
|
||||
old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \
|
||||
# the first loop is for pg_stat_statement extension version <= 1.6
|
||||
for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
|
||||
filename=$(basename "$file"); \
|
||||
if echo "$old_list" | grep -q -F "$filename"; then \
|
||||
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \
|
||||
fi; \
|
||||
done; \
|
||||
# the second loop is for pg_stat_statement extension versions >= 1.7,
|
||||
# where pg_stat_statement_reset() got 3 additional arguments
|
||||
for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
|
||||
filename=$(basename "$file"); \
|
||||
if ! echo "$old_list" | grep -q -F "$filename"; then \
|
||||
echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \
|
||||
fi; \
|
||||
done
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -591,23 +569,6 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-semver-pg-build"
|
||||
# compile pg_semver extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-semver-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \
|
||||
echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_semver-src && cd pg_semver-src && tar xvzf ../pg_semver.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/semver.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-embedding-pg-build"
|
||||
@@ -807,7 +768,6 @@ COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-semver-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
|
||||
COPY pgxn/ pgxn/
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# First transient image to build compute_tools binaries
|
||||
# NB: keep in sync with rust image version in .github/workflows/build_and_test.yml
|
||||
ARG REPOSITORY=neondatabase
|
||||
ARG IMAGE=build-tools
|
||||
ARG IMAGE=rust
|
||||
ARG TAG=pinned
|
||||
ARG BUILD_TAG
|
||||
|
||||
|
||||
@@ -29,14 +29,13 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati
|
||||
```bash
|
||||
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
|
||||
libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \
|
||||
libcurl4-openssl-dev openssl python3-poetry lsof libicu-dev
|
||||
libcurl4-openssl-dev openssl python-poetry lsof libicu-dev
|
||||
```
|
||||
* On Fedora, these packages are needed:
|
||||
```bash
|
||||
dnf install flex bison readline-devel zlib-devel openssl-devel \
|
||||
libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
|
||||
protobuf-devel libcurl-devel openssl poetry lsof libicu-devel libpq-devel python3-devel \
|
||||
libffi-devel
|
||||
protobuf-devel libcurl-devel openssl poetry lsof libicu-devel
|
||||
```
|
||||
* On Arch based systems, these packages are needed:
|
||||
```bash
|
||||
|
||||
@@ -13,7 +13,6 @@ clap.workspace = true
|
||||
flate2.workspace = true
|
||||
futures.workspace = true
|
||||
hyper = { workspace = true, features = ["full"] }
|
||||
nix.workspace = true
|
||||
notify.workspace = true
|
||||
num_cpus.workspace = true
|
||||
opentelemetry.workspace = true
|
||||
@@ -21,7 +20,6 @@ postgres.workspace = true
|
||||
regex.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
signal-hook.workspace = true
|
||||
tar.workspace = true
|
||||
reqwest = { workspace = true, features = ["json"] }
|
||||
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
|
||||
@@ -39,6 +37,5 @@ workspace_hack.workspace = true
|
||||
toml_edit.workspace = true
|
||||
remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
|
||||
vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
|
||||
zstd = "0.13"
|
||||
zstd = "0.12.4"
|
||||
bytes = "1.0"
|
||||
rust-ini = "0.20.0"
|
||||
|
||||
@@ -31,31 +31,25 @@
|
||||
//! -C 'postgresql://cloud_admin@localhost/postgres' \
|
||||
//! -S /var/db/postgres/specs/current.json \
|
||||
//! -b /usr/local/bin/postgres \
|
||||
//! -r http://pg-ext-s3-gateway \
|
||||
//! --pgbouncer-connstr 'host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable'
|
||||
//! --pgbouncer-ini-path /etc/pgbouncer.ini \
|
||||
//! -r http://pg-ext-s3-gateway
|
||||
//! ```
|
||||
//!
|
||||
use std::collections::HashMap;
|
||||
use std::fs::File;
|
||||
use std::path::Path;
|
||||
use std::process::exit;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock};
|
||||
use std::{thread, time::Duration};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::Utc;
|
||||
use clap::Arg;
|
||||
use nix::sys::signal::{kill, Signal};
|
||||
use signal_hook::consts::{SIGQUIT, SIGTERM};
|
||||
use signal_hook::{consts::SIGINT, iterator::Signals};
|
||||
use tracing::{error, info};
|
||||
use url::Url;
|
||||
|
||||
use compute_api::responses::ComputeStatus;
|
||||
|
||||
use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec, PG_PID, SYNC_SAFEKEEPERS_PID};
|
||||
use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
|
||||
use compute_tools::configurator::launch_configurator;
|
||||
use compute_tools::extension_server::get_pg_version;
|
||||
use compute_tools::http::api::launch_http_server;
|
||||
@@ -71,13 +65,6 @@ const BUILD_TAG_DEFAULT: &str = "latest";
|
||||
fn main() -> Result<()> {
|
||||
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
|
||||
|
||||
let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
|
||||
thread::spawn(move || {
|
||||
for sig in signals.forever() {
|
||||
handle_exit_signal(sig);
|
||||
}
|
||||
});
|
||||
|
||||
let build_tag = option_env!("BUILD_TAG")
|
||||
.unwrap_or(BUILD_TAG_DEFAULT)
|
||||
.to_string();
|
||||
@@ -112,9 +99,6 @@ fn main() -> Result<()> {
|
||||
let spec_json = matches.get_one::<String>("spec");
|
||||
let spec_path = matches.get_one::<String>("spec-path");
|
||||
|
||||
let pgbouncer_connstr = matches.get_one::<String>("pgbouncer-connstr");
|
||||
let pgbouncer_ini_path = matches.get_one::<String>("pgbouncer-ini-path");
|
||||
|
||||
// Extract OpenTelemetry context for the startup actions from the
|
||||
// TRACEPARENT and TRACESTATE env variables, and attach it to the current
|
||||
// tracing context.
|
||||
@@ -225,8 +209,6 @@ fn main() -> Result<()> {
|
||||
ext_remote_storage: ext_remote_storage.map(|s| s.to_string()),
|
||||
ext_download_progress: RwLock::new(HashMap::new()),
|
||||
build_tag,
|
||||
pgbouncer_connstr: pgbouncer_connstr.map(|s| s.to_string()),
|
||||
pgbouncer_ini_path: pgbouncer_ini_path.map(|s| s.to_string()),
|
||||
};
|
||||
let compute = Arc::new(compute_node);
|
||||
|
||||
@@ -350,20 +332,13 @@ fn main() -> Result<()> {
|
||||
|
||||
// Wait for the child Postgres process forever. In this state Ctrl+C will
|
||||
// propagate to Postgres and it will be shut down as well.
|
||||
if let Some((mut pg, logs_handle)) = pg {
|
||||
if let Some(mut pg) = pg {
|
||||
// Startup is finished, exit the startup tracing span
|
||||
drop(startup_context_guard);
|
||||
|
||||
let ecode = pg
|
||||
.wait()
|
||||
.expect("failed to start waiting on Postgres process");
|
||||
PG_PID.store(0, Ordering::SeqCst);
|
||||
|
||||
// Process has exited, so we can join the logs thread.
|
||||
let _ = logs_handle
|
||||
.join()
|
||||
.map_err(|e| tracing::error!("log thread panicked: {:?}", e));
|
||||
|
||||
info!("Postgres exited with code {}, shutting down", ecode);
|
||||
exit_code = ecode.code()
|
||||
}
|
||||
@@ -518,41 +493,6 @@ fn cli() -> clap::Command {
|
||||
)
|
||||
.value_name("FILECACHE_CONNSTR"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("pgbouncer-connstr")
|
||||
.long("pgbouncer-connstr")
|
||||
.default_value(
|
||||
"host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable",
|
||||
)
|
||||
.value_name("PGBOUNCER_CONNSTR"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("pgbouncer-ini-path")
|
||||
.long("pgbouncer-ini-path")
|
||||
// Note: this doesn't match current path for pgbouncer.ini.
|
||||
// Until we fix it, we need to pass the path explicitly
|
||||
// or this will be effectively no-op.
|
||||
.default_value("/etc/pgbouncer.ini")
|
||||
.value_name("PGBOUNCER_INI_PATH"),
|
||||
)
|
||||
}
|
||||
|
||||
/// When compute_ctl is killed, send also termination signal to sync-safekeepers
|
||||
/// to prevent leakage. TODO: it is better to convert compute_ctl to async and
|
||||
/// wait for termination which would be easy then.
|
||||
fn handle_exit_signal(sig: i32) {
|
||||
info!("received {sig} termination signal");
|
||||
let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst);
|
||||
if ss_pid != 0 {
|
||||
let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32);
|
||||
kill(ss_pid, Signal::SIGTERM).ok();
|
||||
}
|
||||
let pg_pid = PG_PID.load(Ordering::SeqCst);
|
||||
if pg_pid != 0 {
|
||||
let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
|
||||
kill(pg_pid, Signal::SIGTERM).ok();
|
||||
}
|
||||
exit(1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -6,10 +6,7 @@ use std::os::unix::fs::PermissionsExt;
|
||||
use std::path::Path;
|
||||
use std::process::{Command, Stdio};
|
||||
use std::str::FromStr;
|
||||
use std::sync::atomic::AtomicU32;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::{Condvar, Mutex, RwLock};
|
||||
use std::thread;
|
||||
use std::time::Instant;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
@@ -31,15 +28,11 @@ use utils::measured_stream::MeasuredReader;
|
||||
use remote_storage::{DownloadError, RemotePath};
|
||||
|
||||
use crate::checker::create_availability_check_data;
|
||||
use crate::logger::inlinify;
|
||||
use crate::pg_helpers::*;
|
||||
use crate::spec::*;
|
||||
use crate::sync_sk::{check_if_synced, ping_safekeeper};
|
||||
use crate::{config, extension_server};
|
||||
|
||||
pub static SYNC_SAFEKEEPERS_PID: AtomicU32 = AtomicU32::new(0);
|
||||
pub static PG_PID: AtomicU32 = AtomicU32::new(0);
|
||||
|
||||
/// Compute node info shared across several `compute_ctl` threads.
|
||||
pub struct ComputeNode {
|
||||
// Url type maintains proper escaping
|
||||
@@ -71,10 +64,6 @@ pub struct ComputeNode {
|
||||
// key: ext_archive_name, value: started download time, download_completed?
|
||||
pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
|
||||
pub build_tag: String,
|
||||
// connection string to pgbouncer to change settings
|
||||
pub pgbouncer_connstr: Option<String>,
|
||||
// path to pgbouncer.ini to change settings
|
||||
pub pgbouncer_ini_path: Option<String>,
|
||||
}
|
||||
|
||||
// store some metrics about download size that might impact startup time
|
||||
@@ -280,7 +269,7 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
|
||||
$$;"#,
|
||||
roles_decl, database_decl,
|
||||
);
|
||||
info!("Neon superuser created:\n{}", inlinify(&query));
|
||||
info!("Neon superuser created:\n{}", &query);
|
||||
client
|
||||
.simple_query(&query)
|
||||
.map_err(|e| anyhow::anyhow!(e).context(query))?;
|
||||
@@ -496,7 +485,7 @@ impl ComputeNode {
|
||||
pub fn sync_safekeepers(&self, storage_auth_token: Option<String>) -> Result<Lsn> {
|
||||
let start_time = Utc::now();
|
||||
|
||||
let mut sync_handle = maybe_cgexec(&self.pgbin)
|
||||
let sync_handle = maybe_cgexec(&self.pgbin)
|
||||
.args(["--sync-safekeepers"])
|
||||
.env("PGDATA", &self.pgdata) // we cannot use -D in this mode
|
||||
.envs(if let Some(storage_auth_token) = &storage_auth_token {
|
||||
@@ -505,29 +494,15 @@ impl ComputeNode {
|
||||
vec![]
|
||||
})
|
||||
.stdout(Stdio::piped())
|
||||
.stderr(Stdio::piped())
|
||||
.spawn()
|
||||
.expect("postgres --sync-safekeepers failed to start");
|
||||
SYNC_SAFEKEEPERS_PID.store(sync_handle.id(), Ordering::SeqCst);
|
||||
|
||||
// `postgres --sync-safekeepers` will print all log output to stderr and
|
||||
// final LSN to stdout. So we leave stdout to collect LSN, while stderr logs
|
||||
// will be collected in a child thread.
|
||||
let stderr = sync_handle
|
||||
.stderr
|
||||
.take()
|
||||
.expect("stderr should be captured");
|
||||
let logs_handle = handle_postgres_logs(stderr);
|
||||
|
||||
// final LSN to stdout. So we pipe only stdout, while stderr will be automatically
|
||||
// redirected to the caller output.
|
||||
let sync_output = sync_handle
|
||||
.wait_with_output()
|
||||
.expect("postgres --sync-safekeepers failed");
|
||||
SYNC_SAFEKEEPERS_PID.store(0, Ordering::SeqCst);
|
||||
|
||||
// Process has exited, so we can join the logs thread.
|
||||
let _ = logs_handle
|
||||
.join()
|
||||
.map_err(|e| tracing::error!("log thread panicked: {:?}", e));
|
||||
|
||||
if !sync_output.status.success() {
|
||||
anyhow::bail!(
|
||||
@@ -665,12 +640,11 @@ impl ComputeNode {
|
||||
|
||||
/// Start Postgres as a child process and manage DBs/roles.
|
||||
/// After that this will hang waiting on the postmaster process to exit.
|
||||
/// Returns a handle to the child process and a handle to the logs thread.
|
||||
#[instrument(skip_all)]
|
||||
pub fn start_postgres(
|
||||
&self,
|
||||
storage_auth_token: Option<String>,
|
||||
) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> {
|
||||
) -> Result<std::process::Child> {
|
||||
let pgdata_path = Path::new(&self.pgdata);
|
||||
|
||||
// Run postgres as a child process.
|
||||
@@ -681,18 +655,12 @@ impl ComputeNode {
|
||||
} else {
|
||||
vec![]
|
||||
})
|
||||
.stderr(Stdio::piped())
|
||||
.spawn()
|
||||
.expect("cannot start postgres process");
|
||||
PG_PID.store(pg.id(), Ordering::SeqCst);
|
||||
|
||||
// Start a thread to collect logs from stderr.
|
||||
let stderr = pg.stderr.take().expect("stderr should be captured");
|
||||
let logs_handle = handle_postgres_logs(stderr);
|
||||
|
||||
wait_for_postgres(&mut pg, pgdata_path)?;
|
||||
|
||||
Ok((pg, logs_handle))
|
||||
Ok(pg)
|
||||
}
|
||||
|
||||
/// Do initial configuration of the already started Postgres.
|
||||
@@ -769,31 +737,6 @@ impl ComputeNode {
|
||||
pub fn reconfigure(&self) -> Result<()> {
|
||||
let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec;
|
||||
|
||||
if let Some(connstr) = &self.pgbouncer_connstr {
|
||||
info!("tuning pgbouncer with connstr: {:?}", connstr);
|
||||
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("failed to create rt");
|
||||
|
||||
// Spawn a thread to do the tuning,
|
||||
// so that we don't block the main thread that starts Postgres.
|
||||
let pgbouncer_settings = spec.pgbouncer_settings.clone();
|
||||
let connstr_clone = connstr.clone();
|
||||
let pgbouncer_ini_path = self.pgbouncer_ini_path.clone();
|
||||
let _handle = thread::spawn(move || {
|
||||
let res = rt.block_on(tune_pgbouncer(
|
||||
pgbouncer_settings,
|
||||
&connstr_clone,
|
||||
pgbouncer_ini_path,
|
||||
));
|
||||
if let Err(err) = res {
|
||||
error!("error while tuning pgbouncer: {err:?}");
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Write new config
|
||||
let pgdata_path = Path::new(&self.pgdata);
|
||||
let postgresql_conf_path = pgdata_path.join("postgresql.conf");
|
||||
@@ -837,10 +780,7 @@ impl ComputeNode {
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
pub fn start_compute(
|
||||
&self,
|
||||
extension_server_port: u16,
|
||||
) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> {
|
||||
pub fn start_compute(&self, extension_server_port: u16) -> Result<std::process::Child> {
|
||||
let compute_state = self.state.lock().unwrap().clone();
|
||||
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
info!(
|
||||
@@ -851,32 +791,6 @@ impl ComputeNode {
|
||||
pspec.timeline_id,
|
||||
);
|
||||
|
||||
// tune pgbouncer
|
||||
if let Some(connstr) = &self.pgbouncer_connstr {
|
||||
info!("tuning pgbouncer with connstr: {:?}", connstr);
|
||||
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("failed to create rt");
|
||||
|
||||
// Spawn a thread to do the tuning,
|
||||
// so that we don't block the main thread that starts Postgres.
|
||||
let pgbouncer_settings = pspec.spec.pgbouncer_settings.clone();
|
||||
let connstr_clone = connstr.clone();
|
||||
let pgbouncer_ini_path = self.pgbouncer_ini_path.clone();
|
||||
let _handle = thread::spawn(move || {
|
||||
let res = rt.block_on(tune_pgbouncer(
|
||||
pgbouncer_settings,
|
||||
&connstr_clone,
|
||||
pgbouncer_ini_path,
|
||||
));
|
||||
if let Err(err) = res {
|
||||
error!("error while tuning pgbouncer: {err:?}");
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
info!(
|
||||
"start_compute spec.remote_extensions {:?}",
|
||||
pspec.spec.remote_extensions
|
||||
@@ -911,7 +825,7 @@ impl ComputeNode {
|
||||
self.prepare_pgdata(&compute_state, extension_server_port)?;
|
||||
|
||||
let start_time = Utc::now();
|
||||
let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?;
|
||||
let pg = self.start_postgres(pspec.storage_auth_token.clone())?;
|
||||
|
||||
let config_time = Utc::now();
|
||||
if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
|
||||
@@ -961,7 +875,7 @@ impl ComputeNode {
|
||||
};
|
||||
info!(?metrics, "compute start finished");
|
||||
|
||||
Ok(pg_process)
|
||||
Ok(pg)
|
||||
}
|
||||
|
||||
// Look for core dumps and collect backtraces.
|
||||
|
||||
@@ -38,9 +38,3 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Replace all newline characters with a special character to make it
|
||||
/// easier to grep for log messages.
|
||||
pub fn inlinify(s: &str) -> String {
|
||||
s.replace('\n', "\u{200B}")
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::{thread, time::Duration};
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use postgres::{Client, NoTls};
|
||||
use tracing::{debug, info, warn};
|
||||
use tracing::{debug, info};
|
||||
|
||||
use crate::compute::ComputeNode;
|
||||
|
||||
@@ -84,29 +84,6 @@ fn watch_compute_activity(compute: &ComputeNode) {
|
||||
}
|
||||
}
|
||||
|
||||
// If there are existing (logical) walsenders, do not suspend.
|
||||
//
|
||||
// walproposer doesn't currently show up in pg_stat_replication,
|
||||
// but protect if it will be
|
||||
let ws_count_query = "select count(*) from pg_stat_replication where application_name != 'walproposer';";
|
||||
match cli.query_one(ws_count_query, &[]) {
|
||||
Ok(r) => match r.try_get::<&str, i64>("count") {
|
||||
Ok(num_ws) => {
|
||||
if num_ws > 0 {
|
||||
last_active = Some(Utc::now());
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("failed to parse ws count: {:?}", e);
|
||||
continue;
|
||||
}
|
||||
},
|
||||
Err(e) => {
|
||||
warn!("failed to get list of walsenders: {:?}", e);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// Update the last activity in the shared state if we got a more recent one.
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
// NB: `Some(<DateTime>)` is always greater than `None`.
|
||||
|
||||
@@ -6,17 +6,12 @@ use std::io::{BufRead, BufReader};
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
use std::path::Path;
|
||||
use std::process::Child;
|
||||
use std::thread::JoinHandle;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use anyhow::{bail, Result};
|
||||
use ini::Ini;
|
||||
use notify::{RecursiveMode, Watcher};
|
||||
use postgres::{Client, Transaction};
|
||||
use tokio::io::AsyncBufReadExt;
|
||||
use tokio::time::timeout;
|
||||
use tokio_postgres::NoTls;
|
||||
use tracing::{debug, error, info, instrument};
|
||||
use tracing::{debug, instrument};
|
||||
|
||||
use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};
|
||||
|
||||
@@ -364,137 +359,3 @@ pub fn create_pgdata(pgdata: &str) -> Result<()> {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Update pgbouncer.ini with provided options
|
||||
pub fn update_pgbouncer_ini(
|
||||
pgbouncer_config: HashMap<String, String>,
|
||||
pgbouncer_ini_path: &str,
|
||||
) -> Result<()> {
|
||||
let mut conf = Ini::load_from_file(pgbouncer_ini_path)?;
|
||||
let section = conf.section_mut(Some("pgbouncer")).unwrap();
|
||||
|
||||
for (option_name, value) in pgbouncer_config.iter() {
|
||||
section.insert(option_name, value);
|
||||
}
|
||||
|
||||
conf.write_to_file(pgbouncer_ini_path)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Tune pgbouncer.
|
||||
/// 1. Apply new config using pgbouncer admin console
|
||||
/// 2. Add new values to pgbouncer.ini to preserve them after restart
|
||||
pub async fn tune_pgbouncer(
|
||||
pgbouncer_settings: Option<HashMap<String, String>>,
|
||||
pgbouncer_connstr: &str,
|
||||
pgbouncer_ini_path: Option<String>,
|
||||
) -> Result<()> {
|
||||
if let Some(pgbouncer_config) = pgbouncer_settings {
|
||||
// Apply new config
|
||||
let connect_result = tokio_postgres::connect(pgbouncer_connstr, NoTls).await;
|
||||
let (client, connection) = connect_result.unwrap();
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = connection.await {
|
||||
eprintln!("connection error: {}", e);
|
||||
}
|
||||
});
|
||||
|
||||
for (option_name, value) in pgbouncer_config.iter() {
|
||||
info!(
|
||||
"Applying pgbouncer setting change: {} = {}",
|
||||
option_name, value
|
||||
);
|
||||
let query = format!("SET {} = {}", option_name, value);
|
||||
|
||||
let result = client.simple_query(&query).await;
|
||||
|
||||
info!("Applying pgbouncer setting change: {}", query);
|
||||
info!("pgbouncer setting change result: {:?}", result);
|
||||
|
||||
if let Err(err) = result {
|
||||
// Don't fail on error, just print it into log
|
||||
error!(
|
||||
"Failed to apply pgbouncer setting change: {}, {}",
|
||||
query, err
|
||||
);
|
||||
};
|
||||
}
|
||||
|
||||
// save values to pgbouncer.ini
|
||||
// so that they are preserved after pgbouncer restart
|
||||
if let Some(pgbouncer_ini_path) = pgbouncer_ini_path {
|
||||
update_pgbouncer_ini(pgbouncer_config, &pgbouncer_ini_path)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Spawn a thread that will read Postgres logs from `stderr`, join multiline logs
|
||||
/// and send them to the logger. In the future we may also want to add context to
|
||||
/// these logs.
|
||||
pub fn handle_postgres_logs(stderr: std::process::ChildStderr) -> JoinHandle<()> {
|
||||
std::thread::spawn(move || {
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("failed to build tokio runtime");
|
||||
|
||||
let res = runtime.block_on(async move {
|
||||
let stderr = tokio::process::ChildStderr::from_std(stderr)?;
|
||||
handle_postgres_logs_async(stderr).await
|
||||
});
|
||||
if let Err(e) = res {
|
||||
tracing::error!("error while processing postgres logs: {}", e);
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
/// Read Postgres logs from `stderr` until EOF. Buffer is flushed on one of the following conditions:
|
||||
/// - next line starts with timestamp
|
||||
/// - EOF
|
||||
/// - no new lines were written for the last second
|
||||
async fn handle_postgres_logs_async(stderr: tokio::process::ChildStderr) -> Result<()> {
|
||||
let mut lines = tokio::io::BufReader::new(stderr).lines();
|
||||
let timeout_duration = Duration::from_secs(1);
|
||||
let ts_regex =
|
||||
regex::Regex::new(r"^\d+-\d{2}-\d{2} \d{2}:\d{2}:\d{2}").expect("regex is valid");
|
||||
|
||||
let mut buf = vec![];
|
||||
loop {
|
||||
let next_line = timeout(timeout_duration, lines.next_line()).await;
|
||||
|
||||
// we should flush lines from the buffer if we cannot continue reading multiline message
|
||||
let should_flush_buf = match next_line {
|
||||
// Flushing if new line starts with timestamp
|
||||
Ok(Ok(Some(ref line))) => ts_regex.is_match(line),
|
||||
// Flushing on EOF, timeout or error
|
||||
_ => true,
|
||||
};
|
||||
|
||||
if !buf.is_empty() && should_flush_buf {
|
||||
// join multiline message into a single line, separated by unicode Zero Width Space.
|
||||
// "PG:" suffix is used to distinguish postgres logs from other logs.
|
||||
let combined = format!("PG:{}\n", buf.join("\u{200B}"));
|
||||
buf.clear();
|
||||
|
||||
// sync write to stderr to avoid interleaving with other logs
|
||||
use std::io::Write;
|
||||
let res = std::io::stderr().lock().write_all(combined.as_bytes());
|
||||
if let Err(e) = res {
|
||||
tracing::error!("error while writing to stderr: {}", e);
|
||||
}
|
||||
}
|
||||
|
||||
// if not timeout, append line to the buffer
|
||||
if next_line.is_ok() {
|
||||
match next_line?? {
|
||||
Some(line) => buf.push(line),
|
||||
// EOF
|
||||
None => break,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -9,7 +9,6 @@ use reqwest::StatusCode;
|
||||
use tracing::{error, info, info_span, instrument, span_enabled, warn, Level};
|
||||
|
||||
use crate::config;
|
||||
use crate::logger::inlinify;
|
||||
use crate::params::PG_HBA_ALL_MD5;
|
||||
use crate::pg_helpers::*;
|
||||
|
||||
@@ -371,49 +370,33 @@ pub fn handle_role_deletions(spec: &ComputeSpec, connstr: &str, client: &mut Cli
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn reassign_owned_objects_in_one_db(
|
||||
conf: Config,
|
||||
role_name: &PgIdent,
|
||||
db_owner: &PgIdent,
|
||||
) -> Result<()> {
|
||||
let mut client = conf.connect(NoTls)?;
|
||||
|
||||
// This will reassign all dependent objects to the db owner
|
||||
let reassign_query = format!(
|
||||
"REASSIGN OWNED BY {} TO {}",
|
||||
role_name.pg_quote(),
|
||||
db_owner.pg_quote()
|
||||
);
|
||||
info!(
|
||||
"reassigning objects owned by '{}' in db '{}' to '{}'",
|
||||
role_name,
|
||||
conf.get_dbname().unwrap_or(""),
|
||||
db_owner
|
||||
);
|
||||
client.simple_query(&reassign_query)?;
|
||||
|
||||
// This now will only drop privileges of the role
|
||||
let drop_query = format!("DROP OWNED BY {}", role_name.pg_quote());
|
||||
client.simple_query(&drop_query)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Reassign all owned objects in all databases to the owner of the database.
|
||||
fn reassign_owned_objects(spec: &ComputeSpec, connstr: &str, role_name: &PgIdent) -> Result<()> {
|
||||
for db in &spec.cluster.databases {
|
||||
if db.owner != *role_name {
|
||||
let mut conf = Config::from_str(connstr)?;
|
||||
conf.dbname(&db.name);
|
||||
reassign_owned_objects_in_one_db(conf, role_name, &db.owner)?;
|
||||
|
||||
let mut client = conf.connect(NoTls)?;
|
||||
|
||||
// This will reassign all dependent objects to the db owner
|
||||
let reassign_query = format!(
|
||||
"REASSIGN OWNED BY {} TO {}",
|
||||
role_name.pg_quote(),
|
||||
db.owner.pg_quote()
|
||||
);
|
||||
info!(
|
||||
"reassigning objects owned by '{}' in db '{}' to '{}'",
|
||||
role_name, &db.name, &db.owner
|
||||
);
|
||||
client.simple_query(&reassign_query)?;
|
||||
|
||||
// This now will only drop privileges of the role
|
||||
let drop_query = format!("DROP OWNED BY {}", role_name.pg_quote());
|
||||
client.simple_query(&drop_query)?;
|
||||
}
|
||||
}
|
||||
|
||||
// Also handle case when there are no databases in the spec.
|
||||
// In this case we need to reassign objects in the default database.
|
||||
let conf = Config::from_str(connstr)?;
|
||||
let db_owner = PgIdent::from_str("cloud_admin")?;
|
||||
reassign_owned_objects_in_one_db(conf, role_name, &db_owner)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -663,11 +646,7 @@ pub fn handle_grants(spec: &ComputeSpec, client: &mut Client, connstr: &str) ->
|
||||
$$;"
|
||||
.to_string();
|
||||
|
||||
info!(
|
||||
"grant query for db {} : {}",
|
||||
&db.name,
|
||||
inlinify(&grant_query)
|
||||
);
|
||||
info!("grant query for db {} : {}", &db.name, &grant_query);
|
||||
db_client.simple_query(&grant_query)?;
|
||||
}
|
||||
|
||||
|
||||
@@ -6,11 +6,11 @@
|
||||
//! rely on `neon_local` to set up the environment for each test.
|
||||
//!
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum};
|
||||
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
|
||||
use compute_api::spec::ComputeMode;
|
||||
use control_plane::attachment_service::AttachmentService;
|
||||
use control_plane::endpoint::ComputeControlPlane;
|
||||
use control_plane::local_env::{InitForceMode, LocalEnv};
|
||||
use control_plane::local_env::LocalEnv;
|
||||
use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
|
||||
use control_plane::safekeeper::SafekeeperNode;
|
||||
use control_plane::tenant_migration::migrate_tenant;
|
||||
@@ -338,7 +338,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
|
||||
|
||||
let mut env =
|
||||
LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?;
|
||||
let force = init_match.get_one("force").expect("we set a default value");
|
||||
let force = init_match.get_flag("force");
|
||||
env.init(pg_version, force)
|
||||
.context("Failed to initialize neon repository")?;
|
||||
|
||||
@@ -1266,15 +1266,9 @@ fn cli() -> Command {
|
||||
.required(false);
|
||||
|
||||
let force_arg = Arg::new("force")
|
||||
.value_parser(value_parser!(InitForceMode))
|
||||
.value_parser(value_parser!(bool))
|
||||
.long("force")
|
||||
.default_value(
|
||||
InitForceMode::MustNotExist
|
||||
.to_possible_value()
|
||||
.unwrap()
|
||||
.get_name()
|
||||
.to_owned(),
|
||||
)
|
||||
.action(ArgAction::SetTrue)
|
||||
.help("Force initialization even if the repository is not empty")
|
||||
.required(false);
|
||||
|
||||
|
||||
@@ -46,8 +46,6 @@ use std::time::Duration;
|
||||
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use compute_api::spec::RemoteExtSpec;
|
||||
use nix::sys::signal::kill;
|
||||
use nix::sys::signal::Signal;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
|
||||
@@ -441,14 +439,11 @@ impl Endpoint {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn wait_for_compute_ctl_to_exit(&self, send_sigterm: bool) -> Result<()> {
|
||||
fn wait_for_compute_ctl_to_exit(&self) -> Result<()> {
|
||||
// TODO use background_process::stop_process instead
|
||||
let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
|
||||
let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
|
||||
let pid = nix::unistd::Pid::from_raw(pid as i32);
|
||||
if send_sigterm {
|
||||
kill(pid, Signal::SIGTERM).ok();
|
||||
}
|
||||
crate::background_process::wait_until_stopped("compute_ctl", pid)?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -542,7 +537,6 @@ impl Endpoint {
|
||||
safekeeper_connstrings,
|
||||
storage_auth_token: auth_token.clone(),
|
||||
remote_extensions,
|
||||
pgbouncer_settings: None,
|
||||
};
|
||||
let spec_path = self.endpoint_path().join("spec.json");
|
||||
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
|
||||
@@ -738,15 +732,10 @@ impl Endpoint {
|
||||
&None,
|
||||
)?;
|
||||
|
||||
// Also wait for the compute_ctl process to die. It might have some
|
||||
// cleanup work to do after postgres stops, like syncing safekeepers,
|
||||
// etc.
|
||||
// Also wait for the compute_ctl process to die. It might have some cleanup
|
||||
// work to do after postgres stops, like syncing safekeepers, etc.
|
||||
//
|
||||
// If destroying, send it SIGTERM before waiting. Sometimes we do *not*
|
||||
// want this cleanup: tests intentionally do stop when majority of
|
||||
// safekeepers is down, so sync-safekeepers would hang otherwise. This
|
||||
// could be a separate flag though.
|
||||
self.wait_for_compute_ctl_to_exit(destroy)?;
|
||||
self.wait_for_compute_ctl_to_exit()?;
|
||||
if destroy {
|
||||
println!(
|
||||
"Destroying postgres data directory '{}'",
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
|
||||
use clap::ValueEnum;
|
||||
use postgres_backend::AuthType;
|
||||
use reqwest::Url;
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -163,31 +162,6 @@ impl Default for SafekeeperConf {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
pub enum InitForceMode {
|
||||
MustNotExist,
|
||||
EmptyDirOk,
|
||||
RemoveAllContents,
|
||||
}
|
||||
|
||||
impl ValueEnum for InitForceMode {
|
||||
fn value_variants<'a>() -> &'a [Self] {
|
||||
&[
|
||||
Self::MustNotExist,
|
||||
Self::EmptyDirOk,
|
||||
Self::RemoveAllContents,
|
||||
]
|
||||
}
|
||||
|
||||
fn to_possible_value(&self) -> Option<clap::builder::PossibleValue> {
|
||||
Some(clap::builder::PossibleValue::new(match self {
|
||||
InitForceMode::MustNotExist => "must-not-exist",
|
||||
InitForceMode::EmptyDirOk => "empty-dir-ok",
|
||||
InitForceMode::RemoveAllContents => "remove-all-contents",
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
impl SafekeeperConf {
|
||||
/// Compute is served by port on which only tenant scoped tokens allowed, if
|
||||
/// it is configured.
|
||||
@@ -410,7 +384,7 @@ impl LocalEnv {
|
||||
//
|
||||
// Initialize a new Neon repository
|
||||
//
|
||||
pub fn init(&mut self, pg_version: u32, force: &InitForceMode) -> anyhow::Result<()> {
|
||||
pub fn init(&mut self, pg_version: u32, force: bool) -> anyhow::Result<()> {
|
||||
// check if config already exists
|
||||
let base_path = &self.base_data_dir;
|
||||
ensure!(
|
||||
@@ -419,34 +393,25 @@ impl LocalEnv {
|
||||
);
|
||||
|
||||
if base_path.exists() {
|
||||
match force {
|
||||
InitForceMode::MustNotExist => {
|
||||
bail!(
|
||||
"directory '{}' already exists. Perhaps already initialized?",
|
||||
base_path.display()
|
||||
);
|
||||
}
|
||||
InitForceMode::EmptyDirOk => {
|
||||
if let Some(res) = std::fs::read_dir(base_path)?.next() {
|
||||
res.context("check if directory is empty")?;
|
||||
anyhow::bail!("directory not empty: {base_path:?}");
|
||||
}
|
||||
}
|
||||
InitForceMode::RemoveAllContents => {
|
||||
println!("removing all contents of '{}'", base_path.display());
|
||||
// instead of directly calling `remove_dir_all`, we keep the original dir but removing
|
||||
// all contents inside. This helps if the developer symbol links another directory (i.e.,
|
||||
// S3 local SSD) to the `.neon` base directory.
|
||||
for entry in std::fs::read_dir(base_path)? {
|
||||
let entry = entry?;
|
||||
let path = entry.path();
|
||||
if path.is_dir() {
|
||||
fs::remove_dir_all(&path)?;
|
||||
} else {
|
||||
fs::remove_file(&path)?;
|
||||
}
|
||||
if force {
|
||||
println!("removing all contents of '{}'", base_path.display());
|
||||
// instead of directly calling `remove_dir_all`, we keep the original dir but removing
|
||||
// all contents inside. This helps if the developer symbol links another directory (i.e.,
|
||||
// S3 local SSD) to the `.neon` base directory.
|
||||
for entry in std::fs::read_dir(base_path)? {
|
||||
let entry = entry?;
|
||||
let path = entry.path();
|
||||
if path.is_dir() {
|
||||
fs::remove_dir_all(&path)?;
|
||||
} else {
|
||||
fs::remove_file(&path)?;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
bail!(
|
||||
"directory '{}' already exists. Perhaps already initialized? (Hint: use --force to remove all contents)",
|
||||
base_path.display()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -485,13 +485,6 @@ impl PageServerNode {
|
||||
Ok(self.http_client.list_timelines(*tenant_id).await?)
|
||||
}
|
||||
|
||||
pub async fn tenant_secondary_download(&self, tenant_id: &TenantShardId) -> anyhow::Result<()> {
|
||||
Ok(self
|
||||
.http_client
|
||||
.tenant_secondary_download(*tenant_id)
|
||||
.await?)
|
||||
}
|
||||
|
||||
pub async fn timeline_create(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
|
||||
@@ -11,7 +11,6 @@ use crate::{
|
||||
use pageserver_api::models::{
|
||||
LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
|
||||
};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::collections::HashMap;
|
||||
use std::time::Duration;
|
||||
use utils::{
|
||||
@@ -41,9 +40,9 @@ async fn await_lsn(
|
||||
loop {
|
||||
let latest = match get_lsns(tenant_id, pageserver).await {
|
||||
Ok(l) => l,
|
||||
Err(_e) => {
|
||||
Err(e) => {
|
||||
println!(
|
||||
"🕑 Waiting for pageserver {} to activate...",
|
||||
"🕑 Can't get LSNs on pageserver {} yet, waiting ({e})",
|
||||
pageserver.conf.id
|
||||
);
|
||||
std::thread::sleep(Duration::from_millis(500));
|
||||
@@ -90,7 +89,7 @@ pub async fn migrate_tenant(
|
||||
tenant_id: TenantId,
|
||||
dest_ps: PageServerNode,
|
||||
) -> anyhow::Result<()> {
|
||||
println!("🤔 Checking existing status...");
|
||||
// Get a new generation
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
|
||||
fn build_location_config(
|
||||
@@ -136,20 +135,6 @@ pub async fn migrate_tenant(
|
||||
baseline_lsns = Some(get_lsns(tenant_id, &origin_ps).await?);
|
||||
}
|
||||
|
||||
println!(
|
||||
"🔁 Downloading latest layers to destination pageserver {}",
|
||||
dest_ps.conf.id
|
||||
);
|
||||
match dest_ps
|
||||
.tenant_secondary_download(&TenantShardId::unsharded(tenant_id))
|
||||
.await
|
||||
{
|
||||
Ok(()) => {}
|
||||
Err(_) => {
|
||||
println!(" (skipping, destination wasn't in secondary mode)")
|
||||
}
|
||||
}
|
||||
|
||||
let gen = attachment_service
|
||||
.attach_hook(tenant_id, dest_ps.conf.id)
|
||||
.await?;
|
||||
|
||||
@@ -35,7 +35,6 @@ allow = [
|
||||
"Artistic-2.0",
|
||||
"BSD-2-Clause",
|
||||
"BSD-3-Clause",
|
||||
"CC0-1.0",
|
||||
"ISC",
|
||||
"MIT",
|
||||
"MPL-2.0",
|
||||
|
||||
@@ -1,197 +0,0 @@
|
||||
# Per-Tenant GetPage@LSN Throttling
|
||||
|
||||
Author: Christian Schwarz
|
||||
Date: Oct 24, 2023
|
||||
|
||||
## Summary
|
||||
|
||||
This RFC proposes per-tenant throttling of GetPage@LSN requests inside Pageserver
|
||||
and the interactions with its client, i.e., the neon_smgr component in Compute.
|
||||
|
||||
The result of implementing & executing this RFC will be a fleet-wide upper limit for
|
||||
**"the highest GetPage/second that Pageserver can support for a single tenant/shard"**.
|
||||
|
||||
## Background
|
||||
|
||||
### GetPage@LSN Request Flow
|
||||
|
||||
Pageserver exposes its `page_service.rs` as a libpq listener.
|
||||
The Computes' `neon_smgr` module connects to that libpq listener.
|
||||
Once a connection is established, the protocol allows Compute to request page images at a given LSN.
|
||||
We call these requests GetPage@LSN requests, or GetPage requests for short.
|
||||
Other request types can be sent, but these are low traffic compared to GetPage requests
|
||||
and are not the concern of this RFC.
|
||||
|
||||
Pageserver associates one libpq connection with one tokio task.
|
||||
|
||||
Per connection/task, the pq protocol is handled by the common `postgres_backend` crate.
|
||||
Its `run_message_loop` function invokes the `page_service` specific `impl<IO> postgres_backend::Handler<IO> for PageServerHandler`.
|
||||
Requests are processed in the order in which they arrive via the TCP-based pq protocol.
|
||||
So, there is no concurrent request processing within one connection/task.
|
||||
|
||||
There is a degree of natural pipelining:
|
||||
Compute can "fill the pipe" by sending more than one GetPage request into the libpq TCP stream.
|
||||
And Pageserver can fill the pipe with responses in the other direction.
|
||||
Both directions are subject to the limit of tx/rx buffers, nodelay, TCP flow control, etc.
|
||||
|
||||
### GetPage@LSN Access Pattern
|
||||
|
||||
The Compute has its own hierarchy of caches, specifically `shared_buffers` and the `local file cache` (LFC).
|
||||
Compute only issues GetPage requests to Pageserver if it encounters a miss in these caches.
|
||||
|
||||
If the working set stops fitting into Compute's caches, requests to Pageserver increase sharply -- the Compute starts *thrashing*.
|
||||
|
||||
## Motivation
|
||||
|
||||
In INC-69, a tenant issued 155k GetPage/second for a period of 10 minutes and 60k GetPage/second for a period of 3h,
|
||||
then dropping to ca 18k GetPage/second for a period of 9h.
|
||||
|
||||
We noticed this because of an internal GetPage latency SLO burn rate alert, i.e.,
|
||||
the request latency profile during this period significantly exceeded what was acceptable according to the internal SLO.
|
||||
|
||||
Sadly, we do not have the observability data to determine the impact of this tenant on other tenants on the same pageserver.
|
||||
|
||||
However, here are some illustrative data points for the 155k period:
|
||||
The tenant was responsible for >= 99% of the GetPage traffic and, frankly, the overall activity on this Pageserver instance.
|
||||
We were serving pages at 10 Gb/s (`155k x 8 kbyte (PAGE_SZ) per second is 1.12GiB/s = 9.4Gb/s.`)
|
||||
The CPU utilization of the instance was 75% user+system.
|
||||
Pageserver page cache served 1.75M accesses/second at a hit rate of ca 90%.
|
||||
The hit rate for materialized pages was ca. 40%.
|
||||
Curiously, IOPS to the Instance Store NVMe were very low, rarely exceeding 100.
|
||||
|
||||
The fact that the IOPS were so low / the materialized page cache hit rate was so high suggests that **this tenant's compute's caches were thrashing**.
|
||||
The compute was of type `k8s-pod`; hence, auto-scaling could/would not have helped remediate the thrashing by provisioning more RAM.
|
||||
The consequence was that the **thrashing translated into excessive GetPage requests against Pageserver**.
|
||||
|
||||
My claim is that it was **unhealthy to serve this workload at the pace we did**:
|
||||
* it is likely that other tenants were/would have experienced high latencies (again, we sadly don't have per-tenant latency data to confirm this)
|
||||
* more importantly, it was **unsustainable** to serve traffic at this pace for multiple reasons:
|
||||
* **predictability of performance**: when the working set grows, the pageserver materialized page cache hit rate drops.
|
||||
At some point, we're bound by the EC2 Instance Store NVMe drive's IOPS limit.
|
||||
The result is an **uneven** performance profile from the Compute perspective.
|
||||
|
||||
* **economics**: Neon currently does not charge for IOPS, only capacity.
|
||||
**We cannot afford to undercut the market in IOPS/$ this drastically; it leads to adverse selection and perverse incentives.**
|
||||
For example, the 155k IOPS, which we served for 10min, would cost ca. 6.5k$/month when provisioned as an io2 EBS volume.
|
||||
Even the 18k IOPS, which we served for 9h, would cost ca. 1.1k$/month when provisioned as an io2 EBS volume.
|
||||
We charge 0$.
|
||||
It could be economically advantageous to keep using a low-DRAM compute because Pageserver IOPS are fast enough and free.
|
||||
|
||||
|
||||
Note: It is helpful to think of Pageserver as a disk, because it's precisely where `neon_smgr` sits:
|
||||
vanilla Postgres gets its pages from disk, Neon Postgres gets them from Pageserver.
|
||||
So, regarding the above performance & economic arguments, it is fair to say that we currently provide an "as-fast-as-possible-IOPS" disk that we charge for only by capacity.
|
||||
|
||||
## Solution: Throttling GetPage Requests
|
||||
|
||||
**The consequence of the above analysis must be that Pageserver throttles GetPage@LSN requests**.
|
||||
That is, unless we want to start charging for provisioned GetPage@LSN/second.
|
||||
Throttling sets the correct incentive for a thrashing Compute to scale up its DRAM to the working set size.
|
||||
Neon Autoscaling will make this easy, [eventually](https://github.com/neondatabase/neon/pull/3913).
|
||||
|
||||
## The Design Space
|
||||
|
||||
What remains is the question of *policy* and *mechanism*:
|
||||
|
||||
**Policy** concerns itself with the question of what limit applies to a given connection|timeline|tenant.
|
||||
Candidates are:
|
||||
|
||||
* hard limit, same limit value per connection|timeline|tenant
|
||||
* Per-tenant will provide an upper bound for the impact of a tenant on a given Pageserver instance.
|
||||
This is a major operational pain point / risk right now.
|
||||
* hard limit, configurable per connection|timeline|tenant
|
||||
* This outsources policy to console/control plane, with obvious advantages for flexible structuring of what service we offer to customers.
|
||||
* Note that this is not a mechanism to guarantee a minimum provisioned rate, i.e., this is not a mechanism to guarantee a certain QoS for a tenant.
|
||||
* fair share among active connections|timelines|tenants per instance
|
||||
* example: each connection|timeline|tenant gets a fair fraction of the machine's GetPage/second capacity
|
||||
* NB: needs definition of "active", and knowledge of available GetPage/second capacity in advance
|
||||
* ...
|
||||
|
||||
|
||||
Regarding **mechanism**, it's clear that **backpressure** is the way to go.
|
||||
However, we must choose between
|
||||
* **implicit** backpressure through pq/TCP and
|
||||
* **explicit** rejection of requests + retries with exponential backoff
|
||||
|
||||
Further, there is the question of how throttling GetPage@LSN will affect the **internal GetPage latency SLO**:
|
||||
where do we measure the SLI for Pageserver's internal getpage latency SLO? Before or after the throttling?
|
||||
|
||||
And when we eventually move the measurement point into the Computes (to avoid coordinated omission),
|
||||
how do we avoid counting throttling-induced latency toward the internal getpage latency SLI/SLO?
|
||||
|
||||
## Scope Of This RFC

**This RFC proposes introducing a hard GetPage@LSN/second limit per tenant, with the same value applying to each tenant on a Pageserver**.

This proposal is easy to implement and significantly de-risks operating large Pageservers,
based on the assumption that extremely-high-GetPage-rate-episodes like the one from the "Motivation" section are uncorrelated between tenants.

For example, suppose we pick a limit that allows up to 10 tenants to go at limit rate.
Suppose our Pageserver can serve 100k GetPage/second total at a 100% page cache miss rate.
If each tenant gets a hard limit of 10k GetPage/second, we can serve up to 10 tenants at limit speed without latency degradation.

The mechanism for backpressure will be TCP-based implicit backpressure.
The compute team isn't concerned about prefetch queue depth.
Pageserver will implement it by delaying the reading of requests from the libpq connection(s).

The rate limit will be implemented using a per-tenant token bucket.
The bucket will be shared among all connections to the tenant.
The bucket implementation supports starvation-preventing `await`ing.
The current candidate for the implementation is [`leaky_bucket`](https://docs.rs/leaky-bucket/).
The getpage@lsn benchmark that's being added in https://github.com/neondatabase/neon/issues/5771
can be used to evaluate the overhead of sharing the bucket among connections of a tenant.
A possible technique to mitigate the impact of sharing the bucket would be to maintain a buffer of a few tokens per connection handler.
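
To make the mechanism concrete, here is a minimal sketch of a shared per-tenant throttle, assuming a tokio runtime. It is illustrative only: the names (`TenantThrottle`, `acquire`) are invented for this sketch, it uses a plain mutex-guarded bucket rather than the `leaky_bucket` crate, and it omits the fair, starvation-preventing wakeup order and the per-connection token buffer described above.

```rust
use std::sync::Arc;
use tokio::sync::Mutex;
use tokio::time::{sleep, Duration, Instant};

/// A token bucket shared by all page_service connections of one tenant.
/// (A real implementation would want fair, FIFO wakeups so that one
/// connection cannot starve the others; `leaky_bucket` provides that.)
pub struct TenantThrottle {
    bucket: Mutex<Bucket>,
    rate_per_sec: f64, // steady-state GetPage/second limit
    burst: f64,        // bucket capacity
}

struct Bucket {
    tokens: f64,
    last_refill: Instant,
}

impl TenantThrottle {
    pub fn new(rate_per_sec: f64, burst: f64) -> Arc<Self> {
        Arc::new(Self {
            bucket: Mutex::new(Bucket {
                tokens: burst,
                last_refill: Instant::now(),
            }),
            rate_per_sec,
            burst,
        })
    }

    /// Take `n` tokens, sleeping until they are available.
    /// A connection handler calls this before reading the next GetPage
    /// request off the libpq socket, so the delay surfaces to the compute
    /// as ordinary TCP backpressure.
    pub async fn acquire(&self, n: f64) {
        loop {
            let wait = {
                let mut b = self.bucket.lock().await;
                let now = Instant::now();
                // Refill proportionally to elapsed time, capped at the burst size.
                let elapsed = now.duration_since(b.last_refill).as_secs_f64();
                b.tokens = (b.tokens + elapsed * self.rate_per_sec).min(self.burst);
                b.last_refill = now;
                if b.tokens >= n {
                    b.tokens -= n;
                    return;
                }
                // Not enough tokens: wait for the deficit to accrue, then retry.
                Duration::from_secs_f64((n - b.tokens) / self.rate_per_sec)
            };
            sleep(wait).await; // the lock is released while we sleep
        }
    }
}
```

A connection handler would look up the `Arc<TenantThrottle>` for its tenant and call `throttle.acquire(1.0).await` before reading the next request; acquiring a small batch of tokens at once and spending them locally is one way to implement the per-connection buffer mentioned above.
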
Regarding metrics / the internal GetPage latency SLO:
we will measure the GetPage latency SLO _after_ the throttler and introduce a new metric to measure the amount of throttling, quantified by:
- histogram that records the tenants' observations of queue depth before they start waiting (one such histogram per pageserver)
- histogram that records the tenants' observations of time spent waiting (one such histogram per pageserver)

Further observability measures:
- an INFO log message at frequency 1/min if the tenant/timeline/connection was throttled in that last minute.
  The message will identify the tenant/timeline/connection to allow correlation with compute logs/stats.

Rollout will happen as follows:
- deploy 1: implementation + config: disabled by default, ability to enable it per tenant through tenant_conf
- experimentation in staging and later production to study impact & interaction with auto-scaling
- determination of a sensible global default value
  - the value will be chosen as high as possible ...
  - ... but low enough to work towards this RFC's goal that one tenant should not be able to dominate a pageserver instance.
- deploy 2: implementation fixes if any + config: enabled by default with the aforementioned global default
  - reset of the experimental per-tenant overrides
- gain experience & lower the limit over time
  - we stop lowering the limit as soon as this RFC's goal is achieved, i.e.,
    once we decide that in practice the chosen value sufficiently de-risks operating large pageservers

The per-tenant override will remain for emergencies and testing.
But since Console doesn't preserve it during tenant migrations, it isn't durably configurable for the tenant.

Toward the upper layers of the Neon stack, the resulting limit will be
**"the highest GetPage/second that Pageserver can support for a single tenant"**.

### Rationale

We decided against error + retry because of worries about starvation.

## Future Work

Enable per-tenant emergency override of the limit via Console.
Should be part of a more general framework to specify tenant config overrides.
**NB:** this is **not** the right mechanism to _sell_ different max GetPage/second levels to users,
or _auto-scale_ the GetPage/second levels. Such functionality will require a separate RFC that
concerns itself with GetPage/second capacity planning.

Compute-side metrics for GetPage latency.

Back-channel to inform Compute/Autoscaling/ControlPlane that the project is being throttled.

Compute-side neon_smgr improvements to avoid sending the same GetPage request multiple times if multiple backends experience a cache miss.

Dealing with read-only endpoints: users use read-only endpoints to scale reads for a single tenant.
Possibly there are also assumptions around read-only endpoints not affecting the primary read-write endpoint's performance.
With per-tenant rate limiting, we will not meet that expectation.
However, we can currently only scale per tenant.
Soon, we will have sharding (#5505), which will apply the throttling on a per-shard basis.
But, that's orthogonal to scaling reads: if many endpoints hit one shard, they share the same throttling limit.
To solve this properly, I think we'll need replicas for tenants / shard.
To performance-isolate a tenant's endpoints from each other, we'd then route them to different replicas.
@@ -1,142 +0,0 @@

# Vectored Timeline Get

Created on: 2024-01-02
Author: Christian Schwarz

# Summary

A brief RFC / GitHub Epic describing a vectored version of the `Timeline::get` method that is at the heart of Pageserver.

# Motivation

During basebackup, we issue many `Timeline::get` calls for SLRU pages that are *adjacent* in key space.
For an example, see
https://github.com/neondatabase/neon/blob/5c88213eaf1b1e29c610a078d0b380f69ed49a7e/pageserver/src/basebackup.rs#L281-L302.

Each of these `Timeline::get` calls must traverse the layer map to gather reconstruct data (`Timeline::get_reconstruct_data`) for the requested page number (`blknum` in the example).
For each layer visited by layer map traversal, we do a `DiskBtree` point lookup.
If it's negative (no entry), we resume layer map traversal.
If it's positive, we collect the result in our reconstruct data bag.
If the reconstruct data bag contents suffice to reconstruct the page, we're done with `get_reconstruct_data` and move on to walredo.
Otherwise, we resume layer map traversal.

Doing this many `Timeline::get` calls is quite inefficient because:

1. We do the layer map traversal repeatedly, even if, e.g., all the data sits in the same image layer at the bottom of the stack.
2. We may visit many DiskBtree inner pages multiple times for point lookup of different keys.
   This is likely particularly bad for L0s, which span the whole key space and hence must be visited by layer map traversal, but
   may not contain the data we're looking for.
3. Anecdotally, keys adjacent in keyspace and written simultaneously also end up physically adjacent in the layer files [^1].
   So, to provide the reconstruct data for N adjacent keys, we would actually only _need_ to issue a single large read to the filesystem, instead of the N reads we currently do.
   The filesystem, in turn, ideally stores the layer file physically contiguously, so our large read will turn into one IOP toward the disk.

[^1]: https://www.notion.so/neondatabase/Christian-Investigation-Slow-Basebackups-Early-2023-12-34ea5c7dcdc1485d9ac3731da4d2a6fc?pvs=4#15ee4e143392461fa64590679c8f54c9

# Solution

We should have a vectored aka batched aka scatter-gather style alternative API for `Timeline::get`. Having such an API unlocks:

* more efficient basebackup
* batched IO during compaction (useful for strides of unchanged pages)
* page_service: expose vectored get_page_at_lsn for compute (=> good for seqscan / prefetch)
* if [on-demand SLRU downloads](https://github.com/neondatabase/neon/pull/6151) land before vectored Timeline::get, on-demand SLRU downloads will still benefit from this API

# DoD

There is a new variant of `Timeline::get`, called `Timeline::get_vectored`.
It takes as arguments an `lsn: Lsn` and a `src: &[KeyVec]`, where `struct KeyVec { base: Key, count: usize }`.

It is up to the implementor to figure out a suitable and efficient way to return the reconstructed page images.
It is sufficient to simply return a `Vec<Bytes>`, but more efficient solutions can likely be found after studying all the callers of `Timeline::get`.

Functionally, the behavior of `Timeline::get_vectored` is equivalent to

```rust
// Pseudocode: expand each KeyVec into its contiguous key range, then issue
// one single-key Timeline::get per key.
let keys_iter: impl Iterator<Item = Key> =
    src.iter().flat_map(|KeyVec { base, count }| (base..base + count));
let mut out = Vec::new();
for key in keys_iter {
    let data = Timeline::get(key, lsn)?;
    out.push(data);
}
return out;
```

However, unlike above, an ideal solution will

* Visit each `struct Layer` at most once.
* For each visited layer, call `Layer::get_value_reconstruct_data` at most once.
  * This means, read each `DiskBtree` page at most once.
* Facilitate merging of the reads we issue to the OS and eventually NVMe.

Each of the items above represents a significant amount of work.

## Performance

Ideally, the **base performance** of a vectored get of a single page should be identical to the current `Timeline::get`.
A reasonable constant overhead over current `Timeline::get` is acceptable.

The performance improvement for the vectored use case is demonstrated in some way, e.g., using the `pagebench` basebackup benchmark against a tenant with a lot of SLRU segments.

# Implementation

High-level set of tasks / changes to be made:

- **Get clarity on API**:
  - Define a naive `Timeline::get_vectored` implementation & adopt it across pageserver.
  - The tricky thing here will be the return type (e.g. `Vec<Bytes>` vs `impl Stream`).
  - Start with something simple to explore the different usages of the API.
    Then iterate with peers until we have something that is good enough.
- **Vectored Layer Map traversal**
  - Vectored `LayerMap::search` (take 1 LSN and N `Key`s instead of just 1 LSN and 1 `Key`)
  - Refactor `Timeline::get_reconstruct_data` to hold & return state for N `Key`s instead of 1
    - The slightly tricky part here is what to do about `cont_lsn` [after we've found some reconstruct data for some keys](https://github.com/neondatabase/neon/blob/d066dad84b076daf3781cdf9a692098889d3974e/pageserver/src/tenant/timeline.rs#L2378-L2385)
      but need more.
      Likely we'll need to keep track of `cont_lsn` per key and continue the next iteration at `max(cont_lsn)` of all keys that still need data (see the first sketch after this list).
- **Vectored `Layer::get_value_reconstruct_data` / `DiskBtree`**
  - Current code calls it [here](https://github.com/neondatabase/neon/blob/d066dad84b076daf3781cdf9a692098889d3974e/pageserver/src/tenant/timeline.rs#L2378-L2384).
  - Delta layers use `DiskBtreeReader::visit()` to collect the `(offset,len)` pairs for delta record blobs to load.
  - Image layers use `DiskBtreeReader::get` to get the offset of the image blob to load. Underneath, that's just a `::visit()` call.
  - What needs to happen to `DiskBtree::visit()`?
    * Minimally (see the second sketch after this list):
      * take a single `KeyVec` instead of a single `Key` as argument, i.e., take a single contiguous key range to visit.
      * Change the visit code to invoke the callback for all values in the `KeyVec`'s key range.
      * This should be good enough for what we've seen when investigating basebackup slowness, because there, the key ranges are contiguous.
    * Ideally:
      * Take a `&[KeyVec]`, sort it;
      * during Btree traversal, peek at the next `KeyVec` range to determine whether we need to descend or back out.
      * NB: this should be a straightforward extension of the minimal solution above, as we'll already be checking for "is there more key range in the requested `KeyVec`".
- **Facilitate merging of the reads we issue to the OS and eventually NVMe.** (See the third sketch after this list.)
  - `DiskBtree::visit` produces a set of offsets which we then read from a `VirtualFile` [here](https://github.com/neondatabase/neon/blob/292281c9dfb24152b728b1a846cc45105dac7fe0/pageserver/src/tenant/storage_layer/delta_layer.rs#L772-L804)
    - [Delta layer reads](https://github.com/neondatabase/neon/blob/292281c9dfb24152b728b1a846cc45105dac7fe0/pageserver/src/tenant/storage_layer/delta_layer.rs#L772-L804)
      - We hit (and rely on) `PageCache` and `VirtualFile` here (not great under pressure)
    - [Image layer reads](https://github.com/neondatabase/neon/blob/292281c9dfb24152b728b1a846cc45105dac7fe0/pageserver/src/tenant/storage_layer/image_layer.rs#L429-L435)
  - What needs to happen is the **vectorization of the `blob_io` interface and then the `VirtualFile` API**.
  - That is tricky because
    - the `VirtualFile` API, which sits underneath `blob_io`, is being touched by ongoing [io_uring work](https://github.com/neondatabase/neon/pull/5824)
    - there's the question of how IO buffers will be managed; currently this area relies heavily on `PageCache`, but there's controversy around the future of `PageCache`.
      - The guiding principle here should be to avoid coupling this work to the `PageCache`.
      - I.e., treat `PageCache` as an extra hop in the I/O chain, rather than as an integral part of buffer management.
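
First, a sketch of the per-key `cont_lsn` bookkeeping mentioned under "Vectored Layer Map traversal"; `Key` and `Lsn` are placeholder aliases here, not the real pageserver types:

```rust
use std::collections::HashMap;

// Placeholder aliases; the real types live in the pageserver crates.
type Key = u128;
type Lsn = u64;

/// Per-key traversal state for a vectored get_reconstruct_data.
struct VectoredTraversalState {
    /// Keys that still need reconstruct data, each with the LSN below which
    /// the search for that key has to continue.
    cont_lsn_by_key: HashMap<Key, Lsn>,
}

impl VectoredTraversalState {
    /// The next traversal iteration can start at the maximum cont_lsn over
    /// all keys that still need data; layers entirely above it are irrelevant.
    fn next_cont_lsn(&self) -> Option<Lsn> {
        self.cont_lsn_by_key.values().copied().max()
    }

    /// `key` got partial data from the current layer and must continue below `lsn`.
    fn continue_below(&mut self, key: Key, lsn: Lsn) {
        self.cont_lsn_by_key.insert(key, lsn);
    }

    /// `key` has all the reconstruct data it needs.
    fn complete(&mut self, key: Key) {
        self.cont_lsn_by_key.remove(&key);
    }

    fn all_done(&self) -> bool {
        self.cont_lsn_by_key.is_empty()
    }
}
```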
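
Next, the "minimal" `DiskBtree::visit()` variant, sketched against an in-memory `BTreeMap` as a stand-in for the on-disk B-tree: one descent serves a whole contiguous key range instead of one point lookup per key.

```rust
use std::collections::BTreeMap;
use std::ops::Range;

/// Stand-in for a range-based DiskBtree visit: invoke the callback for every
/// entry whose key falls into the KeyVec's [base, base + count) range.
fn visit_key_range<V>(
    index: &BTreeMap<u128, V>, // stand-in for the on-disk index
    key_range: Range<u128>,
    mut callback: impl FnMut(u128, &V),
) {
    for (key, value) in index.range(key_range) {
        callback(*key, value);
    }
}
```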
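
Finally, a sketch of the read-merging step: sort the `(offset, len)` pairs produced by the index visit and coalesce neighbors into fewer, larger reads before handing them to `VirtualFile`. The `max_gap` knob is an assumption for illustration.

```rust
/// Merge (offset, len) reads that overlap or lie within `max_gap` bytes of
/// each other into larger reads.
fn coalesce_reads(mut reads: Vec<(u64, u64)>, max_gap: u64) -> Vec<(u64, u64)> {
    reads.sort_unstable_by_key(|&(offset, _)| offset);
    let mut merged: Vec<(u64, u64)> = Vec::new();
    for (offset, len) in reads {
        match merged.last_mut() {
            // The new read starts inside or shortly after the previous one:
            // extend the previous read to also cover it.
            Some((prev_offset, prev_len)) if offset <= *prev_offset + *prev_len + max_gap => {
                *prev_len = (*prev_len).max(offset + len - *prev_offset);
            }
            _ => merged.push((offset, len)),
        }
    }
    merged
}
```
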
Let's see how much we can improve by doing the first three items in the list above, then revisit.

## Rollout / Feature Flags

No feature flags are required for this epic.

At the end of this epic, `Timeline::get` forwards to `Timeline::get_vectored`, i.e., it's an all-or-nothing type of change.

It is encouraged to deliver this feature incrementally, i.e., as many small PRs over multiple weeks.
That will help isolate performance regressions across weekly releases.

# Interaction With Sharding

[Sharding](https://github.com/neondatabase/neon/pull/5432) splits up the key space; see the functions `is_key_local` / `key_to_shard_number`.

Just as with `Timeline::get`, callers of `Timeline::get_vectored` are responsible for ensuring that they only ask for blocks of the given `struct Timeline`'s shard.

Given that this is already the case, there shouldn't be significant interaction/interference with sharding.

However, let's have a safety check for this constraint (error or assertion), because there are currently few affordances at the higher layers of Pageserver for sharding<=>keyspace interaction.
For example, `KeySpace` is not broken up by shard stripe, so if someone naively converted the compaction code to issue a vectored get for a keyspace range, it would violate this constraint.
A sketch of such a check follows.
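
A sketch of that safety check; `is_key_local` is the existing helper named above, while the stub types around it are illustrative only:

```rust
// Illustrative stubs; the real Key / ShardIdentity live in the pageserver crates.
#[derive(Debug, Clone, Copy)]
struct Key(u128);

struct ShardIdentity {
    number: u32,
    count: u32,
}

impl ShardIdentity {
    fn is_key_local(&self, key: &Key) -> bool {
        // Stand-in stripe mapping; the real code goes through key_to_shard_number.
        self.count <= 1 || (key.0 % self.count as u128) == self.number as u128
    }
}

/// Reject vectored gets that ask for keys outside this timeline's shard.
fn check_keys_belong_to_shard(shard: &ShardIdentity, keys: &[Key]) -> anyhow::Result<()> {
    for key in keys {
        anyhow::ensure!(
            shard.is_key_local(key),
            "vectored get requested {key:?}, which does not belong to this shard"
        );
    }
    Ok(())
}
```
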
@@ -129,13 +129,13 @@ Run `poetry shell` to activate the virtual environment.
Alternatively, use `poetry run` to run a single command in the venv, e.g. `poetry run pytest`.

### Obligatory checks
We force code formatting via `ruff`, and type hints via `mypy`.
We force code formatting via `black`, `ruff`, and type hints via `mypy`.
Run the following commands in the repository's root (next to `pyproject.toml`):

```bash
poetry run ruff format . # All code is reformatted
poetry run ruff check . # Python linter
poetry run mypy . # Ensure there are no typing errors
poetry run black . # All code is reformatted
poetry run ruff . # Python linter
poetry run mypy . # Ensure there are no typing errors
```

**WARNING**: do not run `mypy` from a directory other than the root of the repository.

@@ -73,8 +73,6 @@ pub struct ComputeSpec {
|
||||
|
||||
// information about available remote extensions
|
||||
pub remote_extensions: Option<RemoteExtSpec>,
|
||||
|
||||
pub pgbouncer_settings: Option<HashMap<String, String>>,
|
||||
}
|
||||
|
||||
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
|
||||
|
||||
@@ -243,9 +243,5 @@
|
||||
"public_extensions": [
|
||||
"postgis"
|
||||
]
|
||||
},
|
||||
"pgbouncer_settings": {
|
||||
"default_pool_size": "42",
|
||||
"pool_mode": "session"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -141,9 +141,8 @@ impl Key {
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn is_rel_block_key(key: &Key) -> bool {
|
||||
key.field1 == 0x00 && key.field4 != 0 && key.field6 != 0xffffffff
|
||||
key.field1 == 0x00 && key.field4 != 0
|
||||
}
|
||||
|
||||
impl std::str::FromStr for Key {
|
||||
|
||||
@@ -114,21 +114,16 @@ impl KeySpaceAccum {
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn add_key(&mut self, key: Key) {
|
||||
self.add_range(singleton_range(key))
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn add_range(&mut self, range: Range<Key>) {
|
||||
match self.accum.as_mut() {
|
||||
Some(accum) => {
|
||||
if range.start == accum.end {
|
||||
accum.end = range.end;
|
||||
} else {
|
||||
// TODO: to efficiently support small sharding stripe sizes, we should avoid starting
|
||||
// a new range here if the skipped region was all keys that don't belong on this shard.
|
||||
// (https://github.com/neondatabase/neon/issues/6247)
|
||||
assert!(range.start > accum.end);
|
||||
self.ranges.push(accum.clone());
|
||||
*accum = range;
|
||||
|
||||
@@ -2,7 +2,6 @@ pub mod partitioning;
|
||||
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
io::{BufRead, Read},
|
||||
num::{NonZeroU64, NonZeroUsize},
|
||||
time::SystemTime,
|
||||
};
|
||||
@@ -20,7 +19,7 @@ use utils::{
|
||||
|
||||
use crate::{reltag::RelTag, shard::TenantShardId};
|
||||
use anyhow::bail;
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
|
||||
/// The state of a tenant in this pageserver.
|
||||
///
|
||||
@@ -370,14 +369,6 @@ pub struct TenantInfo {
|
||||
pub attachment_status: TenantAttachmentStatus,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct TenantDetails {
|
||||
#[serde(flatten)]
|
||||
pub tenant_info: TenantInfo,
|
||||
|
||||
pub timelines: Vec<TimelineId>,
|
||||
}
|
||||
|
||||
/// This represents the output of the "timeline_detail" and "timeline_list" API calls.
|
||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
pub struct TimelineInfo {
|
||||
@@ -557,6 +548,19 @@ pub enum DownloadRemoteLayersTaskState {
|
||||
ShutDown,
|
||||
}
|
||||
|
||||
pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
|
||||
|
||||
/// Information for configuring a single fail point
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct FailpointConfig {
|
||||
/// Name of the fail point
|
||||
pub name: String,
|
||||
/// List of actions to take, using the format described in `fail::cfg`
|
||||
///
|
||||
/// We also support `actions = "exit"` to cause the fail point to immediately exit.
|
||||
pub actions: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct TimelineGcRequest {
|
||||
pub gc_horizon: Option<u64>,
|
||||
@@ -572,7 +576,6 @@ pub enum PagestreamFeMessage {
|
||||
}
|
||||
|
||||
// Wrapped in libpq CopyData
|
||||
#[derive(strum_macros::EnumProperty)]
|
||||
pub enum PagestreamBeMessage {
|
||||
Exists(PagestreamExistsResponse),
|
||||
Nblocks(PagestreamNblocksResponse),
|
||||
@@ -581,29 +584,6 @@ pub enum PagestreamBeMessage {
|
||||
DbSize(PagestreamDbSizeResponse),
|
||||
}
|
||||
|
||||
// Keep in sync with `pagestore_client.h`
|
||||
#[repr(u8)]
|
||||
enum PagestreamBeMessageTag {
|
||||
Exists = 100,
|
||||
Nblocks = 101,
|
||||
GetPage = 102,
|
||||
Error = 103,
|
||||
DbSize = 104,
|
||||
}
|
||||
impl TryFrom<u8> for PagestreamBeMessageTag {
|
||||
type Error = u8;
|
||||
fn try_from(value: u8) -> Result<Self, u8> {
|
||||
match value {
|
||||
100 => Ok(PagestreamBeMessageTag::Exists),
|
||||
101 => Ok(PagestreamBeMessageTag::Nblocks),
|
||||
102 => Ok(PagestreamBeMessageTag::GetPage),
|
||||
103 => Ok(PagestreamBeMessageTag::Error),
|
||||
104 => Ok(PagestreamBeMessageTag::DbSize),
|
||||
_ => Err(value),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub struct PagestreamExistsRequest {
|
||||
pub latest: bool,
|
||||
@@ -759,92 +739,35 @@ impl PagestreamBeMessage {
|
||||
pub fn serialize(&self) -> Bytes {
|
||||
let mut bytes = BytesMut::new();
|
||||
|
||||
use PagestreamBeMessageTag as Tag;
|
||||
match self {
|
||||
Self::Exists(resp) => {
|
||||
bytes.put_u8(Tag::Exists as u8);
|
||||
bytes.put_u8(100); /* tag from pagestore_client.h */
|
||||
bytes.put_u8(resp.exists as u8);
|
||||
}
|
||||
|
||||
Self::Nblocks(resp) => {
|
||||
bytes.put_u8(Tag::Nblocks as u8);
|
||||
bytes.put_u8(101); /* tag from pagestore_client.h */
|
||||
bytes.put_u32(resp.n_blocks);
|
||||
}
|
||||
|
||||
Self::GetPage(resp) => {
|
||||
bytes.put_u8(Tag::GetPage as u8);
|
||||
bytes.put_u8(102); /* tag from pagestore_client.h */
|
||||
bytes.put(&resp.page[..]);
|
||||
}
|
||||
|
||||
Self::Error(resp) => {
|
||||
bytes.put_u8(Tag::Error as u8);
|
||||
bytes.put_u8(103); /* tag from pagestore_client.h */
|
||||
bytes.put(resp.message.as_bytes());
|
||||
bytes.put_u8(0); // null terminator
|
||||
}
|
||||
Self::DbSize(resp) => {
|
||||
bytes.put_u8(Tag::DbSize as u8);
|
||||
bytes.put_u8(104); /* tag from pagestore_client.h */
|
||||
bytes.put_i64(resp.db_size);
|
||||
}
|
||||
}
|
||||
|
||||
bytes.into()
|
||||
}
|
||||
|
||||
pub fn deserialize(buf: Bytes) -> anyhow::Result<Self> {
|
||||
let mut buf = buf.reader();
|
||||
let msg_tag = buf.read_u8()?;
|
||||
|
||||
use PagestreamBeMessageTag as Tag;
|
||||
let ok =
|
||||
match Tag::try_from(msg_tag).map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? {
|
||||
Tag::Exists => {
|
||||
let exists = buf.read_u8()?;
|
||||
Self::Exists(PagestreamExistsResponse {
|
||||
exists: exists != 0,
|
||||
})
|
||||
}
|
||||
Tag::Nblocks => {
|
||||
let n_blocks = buf.read_u32::<BigEndian>()?;
|
||||
Self::Nblocks(PagestreamNblocksResponse { n_blocks })
|
||||
}
|
||||
Tag::GetPage => {
|
||||
let mut page = vec![0; 8192]; // TODO: use MaybeUninit
|
||||
buf.read_exact(&mut page)?;
|
||||
PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page: page.into() })
|
||||
}
|
||||
Tag::Error => {
|
||||
let mut msg = Vec::new();
|
||||
buf.read_until(0, &mut msg)?;
|
||||
let cstring = std::ffi::CString::from_vec_with_nul(msg)?;
|
||||
let rust_str = cstring.to_str()?;
|
||||
PagestreamBeMessage::Error(PagestreamErrorResponse {
|
||||
message: rust_str.to_owned(),
|
||||
})
|
||||
}
|
||||
Tag::DbSize => {
|
||||
let db_size = buf.read_i64::<BigEndian>()?;
|
||||
Self::DbSize(PagestreamDbSizeResponse { db_size })
|
||||
}
|
||||
};
|
||||
let remaining = buf.into_inner();
|
||||
if !remaining.is_empty() {
|
||||
anyhow::bail!(
|
||||
"remaining bytes in msg with tag={msg_tag}: {}",
|
||||
remaining.len()
|
||||
);
|
||||
}
|
||||
Ok(ok)
|
||||
}
|
||||
|
||||
pub fn kind(&self) -> &'static str {
|
||||
match self {
|
||||
Self::Exists(_) => "Exists",
|
||||
Self::Nblocks(_) => "Nblocks",
|
||||
Self::GetPage(_) => "GetPage",
|
||||
Self::Error(_) => "Error",
|
||||
Self::DbSize(_) => "DbSize",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -81,10 +81,6 @@ impl TenantShardId {
|
||||
pub fn is_zero(&self) -> bool {
|
||||
self.shard_number == ShardNumber(0)
|
||||
}
|
||||
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
|
||||
}
|
||||
}
|
||||
|
||||
/// Formatting helper
|
||||
@@ -163,7 +159,7 @@ impl From<[u8; 18]> for TenantShardId {
|
||||
/// shard we're dealing with, but do not need to know the full ShardIdentity (because
|
||||
/// we won't be doing any page->shard mapping), and do not need to know the fully qualified
|
||||
/// TenantShardId.
|
||||
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy)]
|
||||
pub struct ShardIndex {
|
||||
pub shard_number: ShardNumber,
|
||||
pub shard_count: ShardCount,
|
||||
@@ -422,21 +418,6 @@ impl ShardIdentity {
|
||||
}
|
||||
}
|
||||
|
||||
/// Return true if the key should be discarded if found in this shard's
|
||||
/// data store, e.g. during compaction after a split
|
||||
pub fn is_key_disposable(&self, key: &Key) -> bool {
|
||||
if key_is_shard0(key) {
|
||||
// Q: Why can't we dispose of shard0 content if we're not shard 0?
|
||||
// A: because the WAL ingestion logic currently ingests some shard 0
|
||||
// content on all shards, even though it's only read on shard 0. If we
|
||||
// dropped it, then subsequent WAL ingest to these keys would encounter
|
||||
// an error.
|
||||
false
|
||||
} else {
|
||||
!self.is_key_local(key)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn shard_slug(&self) -> String {
|
||||
if self.count > ShardCount(0) {
|
||||
format!("-{:02x}{:02x}", self.number.0, self.count.0)
|
||||
@@ -530,7 +511,12 @@ fn key_is_shard0(key: &Key) -> bool {
|
||||
// relation pages are distributed to shards other than shard zero. Everything else gets
|
||||
// stored on shard 0. This guarantees that shard 0 can independently serve basebackup
|
||||
// requests, and any request other than those for particular blocks in relations.
|
||||
!is_rel_block_key(key)
|
||||
//
|
||||
// In this condition:
|
||||
// - is_rel_block_key includes only relations, i.e. excludes SLRU data and
|
||||
// all metadata.
|
||||
// - field6 is set to -1 for relation size pages.
|
||||
!(is_rel_block_key(key) && key.field6 != 0xffffffff)
|
||||
}
|
||||
|
||||
/// Provide the same result as the function in postgres `hashfn.h` with the same name
|
||||
|
||||
@@ -35,12 +35,6 @@ pub enum QueryError {
|
||||
/// We were instructed to shutdown while processing the query
|
||||
#[error("Shutting down")]
|
||||
Shutdown,
|
||||
/// Query handler indicated that client should reconnect
|
||||
#[error("Server requested reconnect")]
|
||||
Reconnect,
|
||||
/// Query named an entity that was not found
|
||||
#[error("Not found: {0}")]
|
||||
NotFound(std::borrow::Cow<'static, str>),
|
||||
/// Authentication failure
|
||||
#[error("Unauthorized: {0}")]
|
||||
Unauthorized(std::borrow::Cow<'static, str>),
|
||||
@@ -60,9 +54,9 @@ impl From<io::Error> for QueryError {
|
||||
impl QueryError {
|
||||
pub fn pg_error_code(&self) -> &'static [u8; 5] {
|
||||
match self {
|
||||
Self::Disconnected(_) | Self::SimulatedConnectionError | Self::Reconnect => b"08006", // connection failure
|
||||
Self::Disconnected(_) | Self::SimulatedConnectionError => b"08006", // connection failure
|
||||
Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN,
|
||||
Self::Unauthorized(_) | Self::NotFound(_) => SQLSTATE_INTERNAL_ERROR,
|
||||
Self::Unauthorized(_) => SQLSTATE_INTERNAL_ERROR,
|
||||
Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
|
||||
}
|
||||
}
|
||||
@@ -431,11 +425,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
info!("Stopped due to shutdown");
|
||||
Ok(())
|
||||
}
|
||||
Err(QueryError::Reconnect) => {
|
||||
// Dropping out of this loop implicitly disconnects
|
||||
info!("Stopped due to handler reconnect request");
|
||||
Ok(())
|
||||
}
|
||||
Err(QueryError::Disconnected(e)) => {
|
||||
info!("Disconnected ({e:#})");
|
||||
// Disconnection is not an error: we just use it that way internally to drop
|
||||
@@ -985,9 +974,7 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'a, I
|
||||
pub fn short_error(e: &QueryError) -> String {
|
||||
match e {
|
||||
QueryError::Disconnected(connection_error) => connection_error.to_string(),
|
||||
QueryError::Reconnect => "reconnect".to_string(),
|
||||
QueryError::Shutdown => "shutdown".to_string(),
|
||||
QueryError::NotFound(_) => "not found".to_string(),
|
||||
QueryError::Unauthorized(_e) => "JWT authentication error".to_string(),
|
||||
QueryError::SimulatedConnectionError => "simulated connection error".to_string(),
|
||||
QueryError::Other(e) => format!("{e:#}"),
|
||||
@@ -1009,15 +996,9 @@ fn log_query_error(query: &str, e: &QueryError) {
|
||||
QueryError::SimulatedConnectionError => {
|
||||
error!("query handler for query '{query}' failed due to a simulated connection error")
|
||||
}
|
||||
QueryError::Reconnect => {
|
||||
info!("query handler for '{query}' requested client to reconnect")
|
||||
}
|
||||
QueryError::Shutdown => {
|
||||
info!("query handler for '{query}' cancelled during tenant shutdown")
|
||||
}
|
||||
QueryError::NotFound(reason) => {
|
||||
info!("query handler for '{query}' entity not found: {reason}")
|
||||
}
|
||||
QueryError::Unauthorized(e) => {
|
||||
warn!("query handler for '{query}' failed with authentication error: {e}");
|
||||
}
|
||||
|
||||
@@ -117,8 +117,6 @@ impl AzureBlobStorage {
|
||||
) -> Result<Download, DownloadError> {
|
||||
let mut response = builder.into_stream();
|
||||
|
||||
let mut etag = None;
|
||||
let mut last_modified = None;
|
||||
let mut metadata = HashMap::new();
|
||||
// TODO give proper streaming response instead of buffering into RAM
|
||||
// https://github.com/neondatabase/neon/issues/5563
|
||||
@@ -126,13 +124,6 @@ impl AzureBlobStorage {
|
||||
let mut bufs = Vec::new();
|
||||
while let Some(part) = response.next().await {
|
||||
let part = part.map_err(to_download_error)?;
|
||||
let etag_str: &str = part.blob.properties.etag.as_ref();
|
||||
if etag.is_none() {
|
||||
etag = Some(etag.unwrap_or_else(|| etag_str.to_owned()));
|
||||
}
|
||||
if last_modified.is_none() {
|
||||
last_modified = Some(part.blob.properties.last_modified.into());
|
||||
}
|
||||
if let Some(blob_meta) = part.blob.metadata {
|
||||
metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
|
||||
}
|
||||
@@ -145,8 +136,6 @@ impl AzureBlobStorage {
|
||||
}
|
||||
Ok(Download {
|
||||
download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
|
||||
etag,
|
||||
last_modified,
|
||||
metadata: Some(StorageMetadata(metadata)),
|
||||
})
|
||||
}
|
||||
@@ -322,12 +311,6 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn copy(&self, _from: &RemotePath, _to: &RemotePath) -> anyhow::Result<()> {
|
||||
Err(anyhow::anyhow!(
|
||||
"copy for azure blob storage is not implemented"
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
pin_project_lite::pin_project! {
|
||||
|
||||
@@ -14,9 +14,7 @@ mod local_fs;
|
||||
mod s3_bucket;
|
||||
mod simulate_failures;
|
||||
|
||||
use std::{
|
||||
collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc, time::SystemTime,
|
||||
};
|
||||
use std::{collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc};
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
@@ -207,18 +205,10 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>;
|
||||
|
||||
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>;
|
||||
|
||||
/// Copy a remote object inside a bucket from one path to another.
|
||||
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()>;
|
||||
}
|
||||
|
||||
pub type DownloadStream = Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>;
|
||||
pub struct Download {
|
||||
pub download_stream: DownloadStream,
|
||||
/// The last time the file was modified (`last-modified` HTTP header)
|
||||
pub last_modified: Option<SystemTime>,
|
||||
/// A way to identify this specific version of the resource (`etag` HTTP header)
|
||||
pub etag: Option<String>,
|
||||
pub download_stream: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>,
|
||||
/// Extra key-value data, associated with the current remote file.
|
||||
pub metadata: Option<StorageMetadata>,
|
||||
}
|
||||
@@ -377,15 +367,6 @@ impl GenericRemoteStorage {
|
||||
Self::Unreliable(s) => s.delete_objects(paths).await,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn copy_object(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.copy(from, to).await,
|
||||
Self::AwsS3(s) => s.copy(from, to).await,
|
||||
Self::AzureBlob(s) => s.copy(from, to).await,
|
||||
Self::Unreliable(s) => s.copy(from, to).await,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl GenericRemoteStorage {
|
||||
@@ -672,7 +653,6 @@ impl ConcurrencyLimiter {
|
||||
RequestKind::Put => &self.write,
|
||||
RequestKind::List => &self.read,
|
||||
RequestKind::Delete => &self.write,
|
||||
RequestKind::Copy => &self.write,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -18,7 +18,7 @@ use tokio_util::io::ReaderStream;
|
||||
use tracing::*;
|
||||
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
|
||||
|
||||
use crate::{Download, DownloadError, DownloadStream, Listing, ListingMode, RemotePath};
|
||||
use crate::{Download, DownloadError, Listing, ListingMode, RemotePath};
|
||||
|
||||
use super::{RemoteStorage, StorageMetadata};
|
||||
|
||||
@@ -331,8 +331,6 @@ impl RemoteStorage for LocalFs {
|
||||
.map_err(DownloadError::Other)?;
|
||||
Ok(Download {
|
||||
metadata,
|
||||
last_modified: None,
|
||||
etag: None,
|
||||
download_stream: Box::pin(source),
|
||||
})
|
||||
} else {
|
||||
@@ -374,17 +372,17 @@ impl RemoteStorage for LocalFs {
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let download_stream: DownloadStream = match end_exclusive {
|
||||
Some(end_exclusive) => Box::pin(ReaderStream::new(
|
||||
source.take(end_exclusive - start_inclusive),
|
||||
)),
|
||||
None => Box::pin(ReaderStream::new(source)),
|
||||
};
|
||||
Ok(Download {
|
||||
metadata,
|
||||
last_modified: None,
|
||||
etag: None,
|
||||
download_stream,
|
||||
Ok(match end_exclusive {
|
||||
Some(end_exclusive) => Download {
|
||||
metadata,
|
||||
download_stream: Box::pin(ReaderStream::new(
|
||||
source.take(end_exclusive - start_inclusive),
|
||||
)),
|
||||
},
|
||||
None => Download {
|
||||
metadata,
|
||||
download_stream: Box::pin(ReaderStream::new(source)),
|
||||
},
|
||||
})
|
||||
} else {
|
||||
Err(DownloadError::NotFound)
|
||||
@@ -409,20 +407,6 @@ impl RemoteStorage for LocalFs {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
|
||||
let from_path = from.with_base(&self.storage_root);
|
||||
let to_path = to.with_base(&self.storage_root);
|
||||
create_target_directory(&to_path).await?;
|
||||
fs::copy(&from_path, &to_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to copy file from '{from_path}' to '{to_path}'",
|
||||
from_path = from_path,
|
||||
to_path = to_path
|
||||
)
|
||||
})?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
|
||||
|
||||
@@ -16,7 +16,6 @@ use aws_config::{
|
||||
environment::credentials::EnvironmentVariableCredentialsProvider,
|
||||
imds::credentials::ImdsCredentialsProvider,
|
||||
meta::credentials::CredentialsProviderChain,
|
||||
profile::ProfileFileCredentialsProvider,
|
||||
provider_config::ProviderConfig,
|
||||
retry::{RetryConfigBuilder, RetryMode},
|
||||
web_identity_token::WebIdentityTokenCredentialsProvider,
|
||||
@@ -75,29 +74,20 @@ impl S3Bucket {
|
||||
|
||||
let region = Some(Region::new(aws_config.bucket_region.clone()));
|
||||
|
||||
let provider_conf = ProviderConfig::without_region().with_region(region.clone());
|
||||
|
||||
let credentials_provider = {
|
||||
// uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
|
||||
CredentialsProviderChain::first_try(
|
||||
"env",
|
||||
EnvironmentVariableCredentialsProvider::new(),
|
||||
)
|
||||
// uses "AWS_PROFILE" / `aws sso login --profile <profile>`
|
||||
.or_else(
|
||||
"profile-sso",
|
||||
ProfileFileCredentialsProvider::builder()
|
||||
.configure(&provider_conf)
|
||||
.build(),
|
||||
)
|
||||
// uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
|
||||
// needed to access remote extensions bucket
|
||||
.or_else(
|
||||
"token",
|
||||
.or_else("token", {
|
||||
let provider_conf = ProviderConfig::without_region().with_region(region.clone());
|
||||
WebIdentityTokenCredentialsProvider::builder()
|
||||
.configure(&provider_conf)
|
||||
.build(),
|
||||
)
|
||||
.build()
|
||||
})
|
||||
// uses imds v2
|
||||
.or_else("imds", ImdsCredentialsProvider::builder().build())
|
||||
};
|
||||
@@ -228,11 +218,17 @@ impl S3Bucket {
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
|
||||
if get_object.is_err() {
|
||||
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
kind,
|
||||
AttemptOutcome::Err,
|
||||
started_at,
|
||||
);
|
||||
}
|
||||
|
||||
match get_object {
|
||||
Ok(object_output) => {
|
||||
let metadata = object_output.metadata().cloned().map(StorageMetadata);
|
||||
let etag = object_output.e_tag.clone();
|
||||
let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok());
|
||||
|
||||
let body = object_output.body;
|
||||
let body = ByteStreamAsStream::from(body);
|
||||
@@ -241,33 +237,15 @@ impl S3Bucket {
|
||||
|
||||
Ok(Download {
|
||||
metadata,
|
||||
etag,
|
||||
last_modified,
|
||||
download_stream: Box::pin(body),
|
||||
})
|
||||
}
|
||||
Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
|
||||
// Count this in the AttemptOutcome::Ok bucket, because 404 is not
|
||||
// an error: we expect to sometimes fetch an object and find it missing,
|
||||
// e.g. when probing for timeline indices.
|
||||
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
kind,
|
||||
AttemptOutcome::Ok,
|
||||
started_at,
|
||||
);
|
||||
Err(DownloadError::NotFound)
|
||||
}
|
||||
Err(e) => {
|
||||
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
kind,
|
||||
AttemptOutcome::Err,
|
||||
started_at,
|
||||
);
|
||||
|
||||
Err(DownloadError::Other(
|
||||
anyhow::Error::new(e).context("download s3 object"),
|
||||
))
|
||||
}
|
||||
Err(e) => Err(DownloadError::Other(
|
||||
anyhow::Error::new(e).context("download s3 object"),
|
||||
)),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -493,38 +471,6 @@ impl RemoteStorage for S3Bucket {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
|
||||
let kind = RequestKind::Copy;
|
||||
let _guard = self.permit(kind).await;
|
||||
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
// we need to specify bucket_name as a prefix
|
||||
let copy_source = format!(
|
||||
"{}/{}",
|
||||
self.bucket_name,
|
||||
self.relative_path_to_s3_object(from)
|
||||
);
|
||||
|
||||
let res = self
|
||||
.client
|
||||
.copy_object()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.key(self.relative_path_to_s3_object(to))
|
||||
.copy_source(copy_source)
|
||||
.send()
|
||||
.await;
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &res, started_at);
|
||||
|
||||
res?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
|
||||
// if prefix is not none then download file `prefix/from`
|
||||
// if prefix is none then download file `from`
|
||||
|
||||
@@ -11,7 +11,6 @@ pub(crate) enum RequestKind {
|
||||
Put = 1,
|
||||
Delete = 2,
|
||||
List = 3,
|
||||
Copy = 4,
|
||||
}
|
||||
|
||||
use RequestKind::*;
|
||||
@@ -23,7 +22,6 @@ impl RequestKind {
|
||||
Put => "put_object",
|
||||
Delete => "delete_object",
|
||||
List => "list_objects",
|
||||
Copy => "copy_object",
|
||||
}
|
||||
}
|
||||
const fn as_index(&self) -> usize {
|
||||
@@ -31,7 +29,7 @@ impl RequestKind {
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) struct RequestTyped<C>([C; 5]);
|
||||
pub(super) struct RequestTyped<C>([C; 4]);
|
||||
|
||||
impl<C> RequestTyped<C> {
|
||||
pub(super) fn get(&self, kind: RequestKind) -> &C {
|
||||
@@ -40,8 +38,8 @@ impl<C> RequestTyped<C> {
|
||||
|
||||
fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
|
||||
use RequestKind::*;
|
||||
let mut it = [Get, Put, Delete, List, Copy].into_iter();
|
||||
let arr = std::array::from_fn::<C, 5, _>(|index| {
|
||||
let mut it = [Get, Put, Delete, List].into_iter();
|
||||
let arr = std::array::from_fn::<C, 4, _>(|index| {
|
||||
let next = it.next().unwrap();
|
||||
assert_eq!(index, next.as_index());
|
||||
f(next)
|
||||
|
||||
@@ -162,11 +162,4 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
|
||||
// copy is equivalent to download + upload
|
||||
self.attempt(RemoteOp::Download(from.clone()))?;
|
||||
self.attempt(RemoteOp::Upload(to.clone()))?;
|
||||
self.inner.copy_object(from, to).await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,200 +0,0 @@
|
||||
use std::collections::HashSet;
|
||||
use std::ops::ControlFlow;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::Context;
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8Path;
|
||||
use futures::stream::Stream;
|
||||
use once_cell::sync::OnceCell;
|
||||
use remote_storage::{Download, GenericRemoteStorage, RemotePath};
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::{debug, error, info};
|
||||
|
||||
static LOGGING_DONE: OnceCell<()> = OnceCell::new();
|
||||
|
||||
pub(crate) fn upload_stream(
|
||||
content: std::borrow::Cow<'static, [u8]>,
|
||||
) -> (
|
||||
impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
usize,
|
||||
) {
|
||||
use std::borrow::Cow;
|
||||
|
||||
let content = match content {
|
||||
Cow::Borrowed(x) => Bytes::from_static(x),
|
||||
Cow::Owned(vec) => Bytes::from(vec),
|
||||
};
|
||||
wrap_stream(content)
|
||||
}
|
||||
|
||||
pub(crate) fn wrap_stream(
|
||||
content: bytes::Bytes,
|
||||
) -> (
|
||||
impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
usize,
|
||||
) {
|
||||
let len = content.len();
|
||||
let content = futures::future::ready(Ok(content));
|
||||
|
||||
(futures::stream::once(content), len)
|
||||
}
|
||||
|
||||
pub(crate) async fn download_to_vec(dl: Download) -> anyhow::Result<Vec<u8>> {
|
||||
let mut buf = Vec::new();
|
||||
tokio::io::copy_buf(
|
||||
&mut tokio_util::io::StreamReader::new(dl.download_stream),
|
||||
&mut buf,
|
||||
)
|
||||
.await?;
|
||||
Ok(buf)
|
||||
}
|
||||
|
||||
// Uploads files `folder{j}/blob{i}.txt`. See test description for more details.
|
||||
pub(crate) async fn upload_simple_remote_data(
|
||||
client: &Arc<GenericRemoteStorage>,
|
||||
upload_tasks_count: usize,
|
||||
) -> ControlFlow<HashSet<RemotePath>, HashSet<RemotePath>> {
|
||||
info!("Creating {upload_tasks_count} remote files");
|
||||
let mut upload_tasks = JoinSet::new();
|
||||
for i in 1..upload_tasks_count + 1 {
|
||||
let task_client = Arc::clone(client);
|
||||
upload_tasks.spawn(async move {
|
||||
let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
|
||||
let blob_path = RemotePath::new(
|
||||
Utf8Path::from_path(blob_path.as_path()).expect("must be valid blob path"),
|
||||
)
|
||||
.with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
|
||||
debug!("Creating remote item {i} at path {blob_path:?}");
|
||||
|
||||
let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
|
||||
task_client.upload(data, len, &blob_path, None).await?;
|
||||
|
||||
Ok::<_, anyhow::Error>(blob_path)
|
||||
});
|
||||
}
|
||||
|
||||
let mut upload_tasks_failed = false;
|
||||
let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
|
||||
while let Some(task_run_result) = upload_tasks.join_next().await {
|
||||
match task_run_result
|
||||
.context("task join failed")
|
||||
.and_then(|task_result| task_result.context("upload task failed"))
|
||||
{
|
||||
Ok(upload_path) => {
|
||||
uploaded_blobs.insert(upload_path);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Upload task failed: {e:?}");
|
||||
upload_tasks_failed = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if upload_tasks_failed {
|
||||
ControlFlow::Break(uploaded_blobs)
|
||||
} else {
|
||||
ControlFlow::Continue(uploaded_blobs)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn cleanup(
|
||||
client: &Arc<GenericRemoteStorage>,
|
||||
objects_to_delete: HashSet<RemotePath>,
|
||||
) {
|
||||
info!(
|
||||
"Removing {} objects from the remote storage during cleanup",
|
||||
objects_to_delete.len()
|
||||
);
|
||||
let mut delete_tasks = JoinSet::new();
|
||||
for object_to_delete in objects_to_delete {
|
||||
let task_client = Arc::clone(client);
|
||||
delete_tasks.spawn(async move {
|
||||
debug!("Deleting remote item at path {object_to_delete:?}");
|
||||
task_client
|
||||
.delete(&object_to_delete)
|
||||
.await
|
||||
.with_context(|| format!("{object_to_delete:?} removal"))
|
||||
});
|
||||
}
|
||||
|
||||
while let Some(task_run_result) = delete_tasks.join_next().await {
|
||||
match task_run_result {
|
||||
Ok(task_result) => match task_result {
|
||||
Ok(()) => {}
|
||||
Err(e) => error!("Delete task failed: {e:?}"),
|
||||
},
|
||||
Err(join_err) => error!("Delete task did not finish correctly: {join_err}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
pub(crate) struct Uploads {
|
||||
pub(crate) prefixes: HashSet<RemotePath>,
|
||||
pub(crate) blobs: HashSet<RemotePath>,
|
||||
}
|
||||
|
||||
pub(crate) async fn upload_remote_data(
|
||||
client: &Arc<GenericRemoteStorage>,
|
||||
base_prefix_str: &'static str,
|
||||
upload_tasks_count: usize,
|
||||
) -> ControlFlow<Uploads, Uploads> {
|
||||
info!("Creating {upload_tasks_count} remote files");
|
||||
let mut upload_tasks = JoinSet::new();
|
||||
for i in 1..upload_tasks_count + 1 {
|
||||
let task_client = Arc::clone(client);
|
||||
upload_tasks.spawn(async move {
|
||||
let prefix = format!("{base_prefix_str}/sub_prefix_{i}/");
|
||||
let blob_prefix = RemotePath::new(Utf8Path::new(&prefix))
|
||||
.with_context(|| format!("{prefix:?} to RemotePath conversion"))?;
|
||||
let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
|
||||
debug!("Creating remote item {i} at path {blob_path:?}");
|
||||
|
||||
let (data, data_len) =
|
||||
upload_stream(format!("remote blob data {i}").into_bytes().into());
|
||||
task_client.upload(data, data_len, &blob_path, None).await?;
|
||||
|
||||
Ok::<_, anyhow::Error>((blob_prefix, blob_path))
|
||||
});
|
||||
}
|
||||
|
||||
let mut upload_tasks_failed = false;
|
||||
let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count);
|
||||
let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
|
||||
while let Some(task_run_result) = upload_tasks.join_next().await {
|
||||
match task_run_result
|
||||
.context("task join failed")
|
||||
.and_then(|task_result| task_result.context("upload task failed"))
|
||||
{
|
||||
Ok((upload_prefix, upload_path)) => {
|
||||
uploaded_prefixes.insert(upload_prefix);
|
||||
uploaded_blobs.insert(upload_path);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Upload task failed: {e:?}");
|
||||
upload_tasks_failed = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let uploads = Uploads {
|
||||
prefixes: uploaded_prefixes,
|
||||
blobs: uploaded_blobs,
|
||||
};
|
||||
if upload_tasks_failed {
|
||||
ControlFlow::Break(uploads)
|
||||
} else {
|
||||
ControlFlow::Continue(uploads)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn ensure_logging_ready() {
|
||||
LOGGING_DONE.get_or_init(|| {
|
||||
utils::logging::init(
|
||||
utils::logging::LogFormat::Test,
|
||||
utils::logging::TracingErrorLayerEnablement::Disabled,
|
||||
utils::logging::Output::Stdout,
|
||||
)
|
||||
.expect("logging init failed");
|
||||
});
|
||||
}
|
||||
@@ -2,23 +2,23 @@ use std::collections::HashSet;
|
||||
use std::env;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::ops::ControlFlow;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
use std::time::UNIX_EPOCH;
|
||||
|
||||
use anyhow::Context;
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8Path;
|
||||
use futures::stream::Stream;
|
||||
use once_cell::sync::OnceCell;
|
||||
use remote_storage::{
|
||||
AzureConfig, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
|
||||
AzureConfig, Download, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
|
||||
};
|
||||
use test_context::{test_context, AsyncTestContext};
|
||||
use tracing::{debug, info};
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::{debug, error, info};
|
||||
|
||||
mod common;
|
||||
|
||||
use common::{
|
||||
cleanup, download_to_vec, ensure_logging_ready, upload_remote_data, upload_simple_remote_data,
|
||||
upload_stream, wrap_stream,
|
||||
};
|
||||
static LOGGING_DONE: OnceCell<()> = OnceCell::new();
|
||||
|
||||
const ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_AZURE_REMOTE_STORAGE";
|
||||
|
||||
@@ -30,7 +30,7 @@ const BASE_PREFIX: &str = "test";
|
||||
/// If real Azure tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the
|
||||
/// deafult test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
|
||||
///
|
||||
/// First, the test creates a set of Azure blobs with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_remote_data`]
|
||||
/// First, the test creates a set of Azure blobs with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_azure_data`]
|
||||
/// where
|
||||
/// * `random_prefix_part` is set for the entire Azure client during the Azure client creation in [`create_azure_client`], to avoid multiple test runs interference
|
||||
/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
|
||||
@@ -97,7 +97,7 @@ async fn azure_pagination_should_work(
|
||||
/// Uses real Azure and requires [`ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME`] and related Azure cred env vars specified. Test will skip real code and pass if env vars not set.
|
||||
/// See `Azure_pagination_should_work` for more information.
|
||||
///
|
||||
/// First, create a set of Azure objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
|
||||
/// First, create a set of Azure objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_azure_data`]
|
||||
/// Then performs the following queries:
|
||||
/// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
|
||||
/// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
|
||||
@@ -218,9 +218,18 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res
|
||||
|
||||
ctx.client.upload(data, len, &path, None).await?;
|
||||
|
||||
async fn download_and_compare(dl: Download) -> anyhow::Result<Vec<u8>> {
|
||||
let mut buf = Vec::new();
|
||||
tokio::io::copy_buf(
|
||||
&mut tokio_util::io::StreamReader::new(dl.download_stream),
|
||||
&mut buf,
|
||||
)
|
||||
.await?;
|
||||
Ok(buf)
|
||||
}
|
||||
// Normal download request
|
||||
let dl = ctx.client.download(&path).await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
let buf = download_and_compare(dl).await?;
|
||||
assert_eq!(&buf, &orig);
|
||||
|
||||
// Full range (end specified)
|
||||
@@ -228,12 +237,12 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res
|
||||
.client
|
||||
.download_byte_range(&path, 0, Some(len as u64))
|
||||
.await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
let buf = download_and_compare(dl).await?;
|
||||
assert_eq!(&buf, &orig);
|
||||
|
||||
// partial range (end specified)
|
||||
let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
let buf = download_and_compare(dl).await?;
|
||||
assert_eq!(&buf, &orig[4..10]);
|
||||
|
||||
// partial range (end beyond real end)
|
||||
@@ -241,17 +250,17 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res
|
||||
.client
|
||||
.download_byte_range(&path, 8, Some(len as u64 * 100))
|
||||
.await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
let buf = download_and_compare(dl).await?;
|
||||
assert_eq!(&buf, &orig[8..]);
|
||||
|
||||
// Partial range (end unspecified)
|
||||
let dl = ctx.client.download_byte_range(&path, 4, None).await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
let buf = download_and_compare(dl).await?;
|
||||
assert_eq!(&buf, &orig[4..]);
|
||||
|
||||
// Full range (end unspecified)
|
||||
let dl = ctx.client.download_byte_range(&path, 0, None).await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
let buf = download_and_compare(dl).await?;
|
||||
assert_eq!(&buf, &orig);
|
||||
|
||||
debug!("Cleanup: deleting file at path {path:?}");
|
||||
@@ -263,6 +272,17 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn ensure_logging_ready() {
|
||||
LOGGING_DONE.get_or_init(|| {
|
||||
utils::logging::init(
|
||||
utils::logging::LogFormat::Test,
|
||||
utils::logging::TracingErrorLayerEnablement::Disabled,
|
||||
utils::logging::Output::Stdout,
|
||||
)
|
||||
.expect("logging init failed");
|
||||
});
|
||||
}
|
||||
|
||||
struct EnabledAzure {
|
||||
client: Arc<GenericRemoteStorage>,
|
||||
base_prefix: &'static str,
|
||||
@@ -332,7 +352,7 @@ impl AsyncTestContext for MaybeEnabledAzureWithTestBlobs {
|
||||
|
||||
let enabled = EnabledAzure::setup(Some(max_keys_in_list_response)).await;
|
||||
|
||||
match upload_remote_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await {
|
||||
match upload_azure_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await {
|
||||
ControlFlow::Continue(uploads) => {
|
||||
info!("Remote objects created successfully");
|
||||
|
||||
@@ -394,7 +414,7 @@ impl AsyncTestContext for MaybeEnabledAzureWithSimpleTestBlobs {
|
||||
|
||||
let enabled = EnabledAzure::setup(Some(max_keys_in_list_response)).await;
|
||||
|
||||
match upload_simple_remote_data(&enabled.client, upload_tasks_count).await {
|
||||
match upload_simple_azure_data(&enabled.client, upload_tasks_count).await {
|
||||
ControlFlow::Continue(uploads) => {
|
||||
info!("Remote objects created successfully");
|
||||
|
||||
@@ -458,3 +478,166 @@ fn create_azure_client(
|
||||
GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
|
||||
))
|
||||
}
|
||||
|
||||
struct Uploads {
|
||||
prefixes: HashSet<RemotePath>,
|
||||
blobs: HashSet<RemotePath>,
|
||||
}
|
||||
|
||||
async fn upload_azure_data(
|
||||
client: &Arc<GenericRemoteStorage>,
|
||||
base_prefix_str: &'static str,
|
||||
upload_tasks_count: usize,
|
||||
) -> ControlFlow<Uploads, Uploads> {
|
||||
info!("Creating {upload_tasks_count} Azure files");
|
||||
let mut upload_tasks = JoinSet::new();
|
||||
for i in 1..upload_tasks_count + 1 {
|
||||
let task_client = Arc::clone(client);
|
||||
upload_tasks.spawn(async move {
|
||||
let prefix = format!("{base_prefix_str}/sub_prefix_{i}/");
|
||||
let blob_prefix = RemotePath::new(Utf8Path::new(&prefix))
|
||||
.with_context(|| format!("{prefix:?} to RemotePath conversion"))?;
|
||||
let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
|
||||
debug!("Creating remote item {i} at path {blob_path:?}");
|
||||
|
||||
let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
|
||||
task_client.upload(data, len, &blob_path, None).await?;
|
||||
|
||||
Ok::<_, anyhow::Error>((blob_prefix, blob_path))
|
||||
});
|
||||
}
|
||||
|
||||
let mut upload_tasks_failed = false;
|
||||
let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count);
|
||||
let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
|
||||
while let Some(task_run_result) = upload_tasks.join_next().await {
|
||||
match task_run_result
|
||||
.context("task join failed")
|
||||
.and_then(|task_result| task_result.context("upload task failed"))
|
||||
{
|
||||
Ok((upload_prefix, upload_path)) => {
|
||||
uploaded_prefixes.insert(upload_prefix);
|
||||
uploaded_blobs.insert(upload_path);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Upload task failed: {e:?}");
|
||||
upload_tasks_failed = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let uploads = Uploads {
|
||||
prefixes: uploaded_prefixes,
|
||||
blobs: uploaded_blobs,
|
||||
};
|
||||
if upload_tasks_failed {
|
||||
ControlFlow::Break(uploads)
|
||||
} else {
|
||||
ControlFlow::Continue(uploads)
|
||||
}
|
||||
}
|
||||
|
||||
async fn cleanup(client: &Arc<GenericRemoteStorage>, objects_to_delete: HashSet<RemotePath>) {
|
||||
info!(
|
||||
"Removing {} objects from the remote storage during cleanup",
|
||||
objects_to_delete.len()
|
||||
);
|
||||
let mut delete_tasks = JoinSet::new();
|
||||
for object_to_delete in objects_to_delete {
|
||||
let task_client = Arc::clone(client);
|
||||
delete_tasks.spawn(async move {
|
||||
debug!("Deleting remote item at path {object_to_delete:?}");
|
||||
task_client
|
||||
.delete(&object_to_delete)
|
||||
.await
|
||||
.with_context(|| format!("{object_to_delete:?} removal"))
|
||||
});
|
||||
}
|
||||
|
||||
while let Some(task_run_result) = delete_tasks.join_next().await {
|
||||
match task_run_result {
|
||||
Ok(task_result) => match task_result {
|
||||
Ok(()) => {}
|
||||
Err(e) => error!("Delete task failed: {e:?}"),
|
||||
},
|
||||
Err(join_err) => error!("Delete task did not finish correctly: {join_err}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Uploads files `folder{j}/blob{i}.txt`. See test description for more details.
|
||||
async fn upload_simple_azure_data(
|
||||
client: &Arc<GenericRemoteStorage>,
|
||||
upload_tasks_count: usize,
|
||||
) -> ControlFlow<HashSet<RemotePath>, HashSet<RemotePath>> {
|
||||
info!("Creating {upload_tasks_count} Azure files");
|
||||
let mut upload_tasks = JoinSet::new();
|
||||
for i in 1..upload_tasks_count + 1 {
|
||||
let task_client = Arc::clone(client);
|
||||
upload_tasks.spawn(async move {
|
||||
let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
|
||||
let blob_path = RemotePath::new(
|
||||
Utf8Path::from_path(blob_path.as_path()).expect("must be valid blob path"),
|
||||
)
|
||||
.with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
|
||||
debug!("Creating remote item {i} at path {blob_path:?}");
|
||||
|
||||
let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
|
||||
task_client.upload(data, len, &blob_path, None).await?;
|
||||
|
||||
Ok::<_, anyhow::Error>(blob_path)
|
||||
});
|
||||
}
|
||||
|
||||
let mut upload_tasks_failed = false;
|
||||
let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
|
||||
while let Some(task_run_result) = upload_tasks.join_next().await {
|
||||
match task_run_result
|
||||
.context("task join failed")
|
||||
.and_then(|task_result| task_result.context("upload task failed"))
|
||||
{
|
||||
Ok(upload_path) => {
|
||||
uploaded_blobs.insert(upload_path);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Upload task failed: {e:?}");
|
||||
upload_tasks_failed = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if upload_tasks_failed {
|
||||
ControlFlow::Break(uploaded_blobs)
|
||||
} else {
|
||||
ControlFlow::Continue(uploaded_blobs)
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: copypasted from test_real_s3, can't remember how to share a module which is not compiled
|
||||
// to binary
|
||||
fn upload_stream(
|
||||
content: std::borrow::Cow<'static, [u8]>,
|
||||
) -> (
|
||||
impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
usize,
|
||||
) {
|
||||
use std::borrow::Cow;
|
||||
|
||||
let content = match content {
|
||||
Cow::Borrowed(x) => Bytes::from_static(x),
|
||||
Cow::Owned(vec) => Bytes::from(vec),
|
||||
};
|
||||
wrap_stream(content)
|
||||
}
|
||||
|
||||
fn wrap_stream(
|
||||
content: bytes::Bytes,
|
||||
) -> (
|
||||
impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
usize,
|
||||
) {
|
||||
let len = content.len();
|
||||
let content = futures::future::ready(Ok(content));
|
||||
|
||||
(futures::stream::once(content), len)
|
||||
}
|
||||
|
||||
@@ -2,23 +2,23 @@ use std::collections::HashSet;
|
||||
use std::env;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::ops::ControlFlow;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
use std::time::UNIX_EPOCH;
|
||||
|
||||
use anyhow::Context;
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8Path;
|
||||
use futures::stream::Stream;
|
||||
use once_cell::sync::OnceCell;
|
||||
use remote_storage::{
|
||||
GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
|
||||
};
|
||||
use test_context::{test_context, AsyncTestContext};
|
||||
use tracing::{debug, info};
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::{debug, error, info};
|
||||
|
||||
mod common;
|
||||
|
||||
use common::{
|
||||
cleanup, download_to_vec, ensure_logging_ready, upload_remote_data, upload_simple_remote_data,
|
||||
upload_stream, wrap_stream,
|
||||
};
|
||||
static LOGGING_DONE: OnceCell<()> = OnceCell::new();
|
||||
|
||||
const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
|
||||
|
||||
@@ -30,7 +30,7 @@ const BASE_PREFIX: &str = "test";
|
||||
/// If real S3 tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test as ignored at runtime with the
|
||||
/// default test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
|
||||
///
|
||||
/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_remote_data`]
|
||||
/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_s3_data`]
|
||||
/// where
|
||||
/// * `random_prefix_part` is set for the entire S3 client during the S3 client creation in [`create_s3_client`], to avoid interference between multiple test runs
|
||||
/// * `base_prefix_str` is a common prefix to use in the client requests: we want to ensure that the client is able to list nested prefixes inside the bucket
|
||||
@@ -95,7 +95,7 @@ async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3WithTestBlobs) -> any
|
||||
/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. Test will skip real code and pass if env vars not set.
|
||||
/// See `s3_pagination_should_work` for more information.
|
||||
///
|
||||
/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
|
||||
/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_s3_data`]
|
||||
/// Then it performs the following queries:
|
||||
/// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
|
||||
/// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
|
||||
@@ -198,65 +198,15 @@ async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()>
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test_context(MaybeEnabledS3)]
|
||||
#[tokio::test]
|
||||
async fn s3_upload_download_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
|
||||
let MaybeEnabledS3::Enabled(ctx) = ctx else {
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
let orig = bytes::Bytes::from_static("remote blob data here".as_bytes());
|
||||
|
||||
let (data, len) = wrap_stream(orig.clone());
|
||||
|
||||
ctx.client.upload(data, len, &path, None).await?;
|
||||
|
||||
// Normal download request
|
||||
let dl = ctx.client.download(&path).await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
assert_eq!(&buf, &orig);
|
||||
|
||||
// Full range (end specified)
|
||||
let dl = ctx
|
||||
.client
|
||||
.download_byte_range(&path, 0, Some(len as u64))
|
||||
.await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
assert_eq!(&buf, &orig);
|
||||
|
||||
// Partial range (end specified)
|
||||
let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
assert_eq!(&buf, &orig[4..10]);
|
||||
|
||||
// Partial range (end beyond real end)
|
||||
let dl = ctx
|
||||
.client
|
||||
.download_byte_range(&path, 8, Some(len as u64 * 100))
|
||||
.await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
assert_eq!(&buf, &orig[8..]);
|
||||
|
||||
// Partial range (end unspecified)
|
||||
let dl = ctx.client.download_byte_range(&path, 4, None).await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
assert_eq!(&buf, &orig[4..]);
|
||||
|
||||
// Full range (end unspecified)
|
||||
let dl = ctx.client.download_byte_range(&path, 0, None).await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
assert_eq!(&buf, &orig);
|
||||
|
||||
debug!("Cleanup: deleting file at path {path:?}");
|
||||
ctx.client
|
||||
.delete(&path)
|
||||
.await
|
||||
.with_context(|| format!("{path:?} removal"))?;
|
||||
|
||||
Ok(())
|
||||
fn ensure_logging_ready() {
|
||||
LOGGING_DONE.get_or_init(|| {
|
||||
utils::logging::init(
|
||||
utils::logging::LogFormat::Test,
|
||||
utils::logging::TracingErrorLayerEnablement::Disabled,
|
||||
utils::logging::Output::Stdout,
|
||||
)
|
||||
.expect("logging init failed");
|
||||
});
|
||||
}
|
||||
|
||||
struct EnabledS3 {
|
||||
@@ -328,7 +278,7 @@ impl AsyncTestContext for MaybeEnabledS3WithTestBlobs {
|
||||
|
||||
let enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await;
|
||||
|
||||
match upload_remote_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await {
|
||||
match upload_s3_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await {
|
||||
ControlFlow::Continue(uploads) => {
|
||||
info!("Remote objects created successfully");
|
||||
|
||||
@@ -390,7 +340,7 @@ impl AsyncTestContext for MaybeEnabledS3WithSimpleTestBlobs {
|
||||
|
||||
let enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await;
|
||||
|
||||
match upload_simple_remote_data(&enabled.client, upload_tasks_count).await {
|
||||
match upload_simple_s3_data(&enabled.client, upload_tasks_count).await {
|
||||
ControlFlow::Continue(uploads) => {
|
||||
info!("Remote objects created successfully");
|
||||
|
||||
@@ -453,3 +403,166 @@ fn create_s3_client(
|
||||
GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
|
||||
))
|
||||
}
|
||||
|
||||
struct Uploads {
|
||||
prefixes: HashSet<RemotePath>,
|
||||
blobs: HashSet<RemotePath>,
|
||||
}
|
||||
|
||||
async fn upload_s3_data(
|
||||
client: &Arc<GenericRemoteStorage>,
|
||||
base_prefix_str: &'static str,
|
||||
upload_tasks_count: usize,
|
||||
) -> ControlFlow<Uploads, Uploads> {
|
||||
info!("Creating {upload_tasks_count} S3 files");
|
||||
let mut upload_tasks = JoinSet::new();
|
||||
for i in 1..upload_tasks_count + 1 {
|
||||
let task_client = Arc::clone(client);
|
||||
upload_tasks.spawn(async move {
|
||||
let prefix = format!("{base_prefix_str}/sub_prefix_{i}/");
|
||||
let blob_prefix = RemotePath::new(Utf8Path::new(&prefix))
|
||||
.with_context(|| format!("{prefix:?} to RemotePath conversion"))?;
|
||||
let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
|
||||
debug!("Creating remote item {i} at path {blob_path:?}");
|
||||
|
||||
let (data, data_len) =
|
||||
upload_stream(format!("remote blob data {i}").into_bytes().into());
|
||||
task_client.upload(data, data_len, &blob_path, None).await?;
|
||||
|
||||
Ok::<_, anyhow::Error>((blob_prefix, blob_path))
|
||||
});
|
||||
}
|
||||
|
||||
let mut upload_tasks_failed = false;
|
||||
let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count);
|
||||
let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
|
||||
while let Some(task_run_result) = upload_tasks.join_next().await {
|
||||
match task_run_result
|
||||
.context("task join failed")
|
||||
.and_then(|task_result| task_result.context("upload task failed"))
|
||||
{
|
||||
Ok((upload_prefix, upload_path)) => {
|
||||
uploaded_prefixes.insert(upload_prefix);
|
||||
uploaded_blobs.insert(upload_path);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Upload task failed: {e:?}");
|
||||
upload_tasks_failed = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let uploads = Uploads {
|
||||
prefixes: uploaded_prefixes,
|
||||
blobs: uploaded_blobs,
|
||||
};
|
||||
if upload_tasks_failed {
|
||||
ControlFlow::Break(uploads)
|
||||
} else {
|
||||
ControlFlow::Continue(uploads)
|
||||
}
|
||||
}
|
||||
|
||||
async fn cleanup(client: &Arc<GenericRemoteStorage>, objects_to_delete: HashSet<RemotePath>) {
|
||||
info!(
|
||||
"Removing {} objects from the remote storage during cleanup",
|
||||
objects_to_delete.len()
|
||||
);
|
||||
let mut delete_tasks = JoinSet::new();
|
||||
for object_to_delete in objects_to_delete {
|
||||
let task_client = Arc::clone(client);
|
||||
delete_tasks.spawn(async move {
|
||||
debug!("Deleting remote item at path {object_to_delete:?}");
|
||||
task_client
|
||||
.delete(&object_to_delete)
|
||||
.await
|
||||
.with_context(|| format!("{object_to_delete:?} removal"))
|
||||
});
|
||||
}
|
||||
|
||||
while let Some(task_run_result) = delete_tasks.join_next().await {
|
||||
match task_run_result {
|
||||
Ok(task_result) => match task_result {
|
||||
Ok(()) => {}
|
||||
Err(e) => error!("Delete task failed: {e:?}"),
|
||||
},
|
||||
Err(join_err) => error!("Delete task did not finish correctly: {join_err}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Uploads files `folder{j}/blob_{i}.txt`. See the test description for more details.
|
||||
async fn upload_simple_s3_data(
|
||||
client: &Arc<GenericRemoteStorage>,
|
||||
upload_tasks_count: usize,
|
||||
) -> ControlFlow<HashSet<RemotePath>, HashSet<RemotePath>> {
|
||||
info!("Creating {upload_tasks_count} S3 files");
|
||||
let mut upload_tasks = JoinSet::new();
|
||||
for i in 1..upload_tasks_count + 1 {
|
||||
let task_client = Arc::clone(client);
|
||||
upload_tasks.spawn(async move {
|
||||
let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
|
||||
let blob_path = RemotePath::new(
|
||||
Utf8Path::from_path(blob_path.as_path()).expect("must be valid blob path"),
|
||||
)
|
||||
.with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
|
||||
debug!("Creating remote item {i} at path {blob_path:?}");
|
||||
|
||||
let (data, data_len) =
|
||||
upload_stream(format!("remote blob data {i}").into_bytes().into());
|
||||
task_client.upload(data, data_len, &blob_path, None).await?;
|
||||
|
||||
Ok::<_, anyhow::Error>(blob_path)
|
||||
});
|
||||
}
|
||||
|
||||
let mut upload_tasks_failed = false;
|
||||
let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
|
||||
while let Some(task_run_result) = upload_tasks.join_next().await {
|
||||
match task_run_result
|
||||
.context("task join failed")
|
||||
.and_then(|task_result| task_result.context("upload task failed"))
|
||||
{
|
||||
Ok(upload_path) => {
|
||||
uploaded_blobs.insert(upload_path);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Upload task failed: {e:?}");
|
||||
upload_tasks_failed = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if upload_tasks_failed {
|
||||
ControlFlow::Break(uploaded_blobs)
|
||||
} else {
|
||||
ControlFlow::Continue(uploaded_blobs)
|
||||
}
|
||||
}
|
||||
|
||||
fn upload_stream(
|
||||
content: std::borrow::Cow<'static, [u8]>,
|
||||
) -> (
|
||||
impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
usize,
|
||||
) {
|
||||
use std::borrow::Cow;
|
||||
|
||||
let content = match content {
|
||||
Cow::Borrowed(x) => Bytes::from_static(x),
|
||||
Cow::Owned(vec) => Bytes::from(vec),
|
||||
};
|
||||
wrap_stream(content)
|
||||
}
|
||||
|
||||
fn wrap_stream(
|
||||
content: bytes::Bytes,
|
||||
) -> (
|
||||
impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
usize,
|
||||
) {
|
||||
let len = content.len();
|
||||
let content = futures::future::ready(Ok(content));
|
||||
|
||||
(futures::stream::once(content), len)
|
||||
}
|
||||
|
||||
@@ -51,9 +51,3 @@ pub struct SkTimelineInfo {
|
||||
#[serde(default)]
|
||||
pub http_connstr: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct TimelineCopyRequest {
|
||||
pub target_timeline_id: TimelineId,
|
||||
pub until_lsn: Lsn,
|
||||
}
|
||||
|
||||
@@ -4,12 +4,6 @@ version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[features]
|
||||
default = []
|
||||
# Enables test-only APIs, including failpoints. In particular, enables the `fail_point!` macro,
|
||||
# which adds some runtime cost; used to run tests that simulate outage conditions
|
||||
testing = ["fail/failpoints"]
|
||||
|
||||
[dependencies]
|
||||
arc-swap.workspace = true
|
||||
sentry.workspace = true
|
||||
@@ -22,7 +16,6 @@ chrono.workspace = true
|
||||
heapless.workspace = true
|
||||
hex = { workspace = true, features = ["serde"] }
|
||||
hyper = { workspace = true, features = ["full"] }
|
||||
fail.workspace = true
|
||||
futures = { workspace = true}
|
||||
jsonwebtoken.workspace = true
|
||||
nix.workspace = true
|
||||
|
||||
@@ -1,177 +0,0 @@
|
||||
//! Failpoint support code shared between pageserver and safekeepers.
|
||||
|
||||
use crate::http::{
|
||||
error::ApiError,
|
||||
json::{json_request, json_response},
|
||||
};
|
||||
use hyper::{Body, Request, Response, StatusCode};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
|
||||
/// use with fail::cfg("$name", "return(2000)")
|
||||
///
|
||||
/// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
|
||||
/// specified time (in milliseconds). The main difference is that we use the async
|
||||
/// tokio sleep function. Another difference is that we print lines to the log,
|
||||
/// which can be useful in tests to check that the failpoint was hit.
|
||||
///
|
||||
/// Optionally pass a cancellation token, and this failpoint will drop out of
|
||||
/// its sleep when the cancellation token fires. This is useful for testing
|
||||
/// cases where we would like to block something, but test its clean shutdown behavior.
|
||||
#[macro_export]
|
||||
macro_rules! __failpoint_sleep_millis_async {
|
||||
($name:literal) => {{
|
||||
// If the failpoint is used with a "return" action, set should_sleep to the
|
||||
// returned value (as string). Otherwise it's set to None.
|
||||
let should_sleep = (|| {
|
||||
::fail::fail_point!($name, |x| x);
|
||||
::std::option::Option::None
|
||||
})();
|
||||
|
||||
// Sleep if the action was a returned value
|
||||
if let ::std::option::Option::Some(duration_str) = should_sleep {
|
||||
$crate::failpoint_support::failpoint_sleep_helper($name, duration_str).await
|
||||
}
|
||||
}};
|
||||
($name:literal, $cancel:expr) => {{
|
||||
// If the failpoint is used with a "return" action, set should_sleep to the
|
||||
// returned value (as string). Otherwise it's set to None.
|
||||
let should_sleep = (|| {
|
||||
::fail::fail_point!($name, |x| x);
|
||||
::std::option::Option::None
|
||||
})();
|
||||
|
||||
// Sleep if the action was a returned value
|
||||
if let ::std::option::Option::Some(duration_str) = should_sleep {
|
||||
$crate::failpoint_support::failpoint_sleep_cancellable_helper(
|
||||
$name,
|
||||
duration_str,
|
||||
$cancel,
|
||||
)
|
||||
.await
|
||||
}
|
||||
}};
|
||||
}
|
||||
pub use __failpoint_sleep_millis_async as sleep_millis_async;
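// A hedged usage sketch (not from the original file; the failpoint name is made up):
// a test first arms the failpoint with `fail::cfg("illustrative-pause", "return(2000)")`,
// and then async code pauses at it like this:
//
//     sleep_millis_async!("illustrative-pause");
//     // or, to drop out of the sleep early on shutdown:
//     sleep_millis_async!("illustrative-pause", &cancel);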
|
||||
|
||||
// Helper function used by the macro. (A function has nicer scoping so we
|
||||
// don't need to decorate everything with "::")
|
||||
#[doc(hidden)]
|
||||
pub async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
|
||||
let millis = duration_str.parse::<u64>().unwrap();
|
||||
let d = std::time::Duration::from_millis(millis);
|
||||
|
||||
tracing::info!("failpoint {:?}: sleeping for {:?}", name, d);
|
||||
tokio::time::sleep(d).await;
|
||||
tracing::info!("failpoint {:?}: sleep done", name);
|
||||
}
|
||||
|
||||
// Helper function used by the macro. (A function has nicer scoping so we
|
||||
// don't need to decorate everything with "::")
|
||||
#[doc(hidden)]
|
||||
pub async fn failpoint_sleep_cancellable_helper(
|
||||
name: &'static str,
|
||||
duration_str: String,
|
||||
cancel: &CancellationToken,
|
||||
) {
|
||||
let millis = duration_str.parse::<u64>().unwrap();
|
||||
let d = std::time::Duration::from_millis(millis);
|
||||
|
||||
tracing::info!("failpoint {:?}: sleeping for {:?}", name, d);
|
||||
tokio::time::timeout(d, cancel.cancelled()).await.ok();
|
||||
tracing::info!("failpoint {:?}: sleep done", name);
|
||||
}
|
||||
|
||||
pub fn init() -> fail::FailScenario<'static> {
|
||||
// The failpoints lib provides support for parsing the `FAILPOINTS` env var.
|
||||
// We want non-default behavior for `exit`, though, so we handle it separately.
|
||||
//
|
||||
// Format for FAILPOINTS is "name=actions" separated by ";".
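// A hedged illustration (failpoint names are made up): a value such as
//     FAILPOINTS="before-upload=sleep(2000);after-compaction=return"
// arms two failpoints at startup; the special "exit" action is handled by
// apply_failpoint() below instead of being passed straight to the library.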
|
||||
let actions = std::env::var("FAILPOINTS");
|
||||
if actions.is_ok() {
|
||||
std::env::remove_var("FAILPOINTS");
|
||||
} else {
|
||||
// let the library handle the non-UTF-8 case; do nothing if the variable is not present
|
||||
}
|
||||
|
||||
let scenario = fail::FailScenario::setup();
|
||||
|
||||
if let Ok(val) = actions {
|
||||
val.split(';')
|
||||
.enumerate()
|
||||
.map(|(i, s)| s.split_once('=').ok_or((i, s)))
|
||||
.for_each(|res| {
|
||||
let (name, actions) = match res {
|
||||
Ok(t) => t,
|
||||
Err((i, s)) => {
|
||||
panic!(
|
||||
"startup failpoints: missing action on the {}th failpoint; try `{s}=return`",
|
||||
i + 1,
|
||||
);
|
||||
}
|
||||
};
|
||||
if let Err(e) = apply_failpoint(name, actions) {
|
||||
panic!("startup failpoints: failed to apply failpoint {name}={actions}: {e}");
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
scenario
|
||||
}
|
||||
|
||||
pub fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> {
|
||||
if actions == "exit" {
|
||||
fail::cfg_callback(name, exit_failpoint)
|
||||
} else {
|
||||
fail::cfg(name, actions)
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(never)]
|
||||
fn exit_failpoint() {
|
||||
tracing::info!("Exit requested by failpoint");
|
||||
std::process::exit(1);
|
||||
}
|
||||
|
||||
pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
|
||||
|
||||
/// Information for configuring a single fail point
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct FailpointConfig {
|
||||
/// Name of the fail point
|
||||
pub name: String,
|
||||
/// List of actions to take, using the format described in `fail::cfg`
|
||||
///
|
||||
/// We also support `actions = "exit"` to cause the fail point to immediately exit.
|
||||
pub actions: String,
|
||||
}
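//
// A hedged example of the JSON body accepted by `failpoints_handler` below
// (failpoint names are illustrative):
//
//     [
//         { "name": "illustrative-pause", "actions": "sleep(2000)" },
//         { "name": "illustrative-crash", "actions": "exit" }
//     ]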
|
||||
|
||||
/// Configure failpoints through http.
|
||||
pub async fn failpoints_handler(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
if !fail::has_failpoints() {
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"Cannot manage failpoints because storage was compiled without failpoints support"
|
||||
)));
|
||||
}
|
||||
|
||||
let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?;
|
||||
for fp in failpoints {
|
||||
info!("cfg failpoint: {} {}", fp.name, fp.actions);
|
||||
|
||||
// We recognize one extra "action" that's not natively recognized
|
||||
// by the failpoints crate: exit, to immediately kill the process
|
||||
let cfg_result = apply_failpoint(&fp.name, &fp.actions);
|
||||
|
||||
if let Err(err_msg) = cfg_result {
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"Failed to configure failpoints: {err_msg}"
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
@@ -31,9 +31,6 @@ pub enum ApiError {
|
||||
#[error("Shutting down")]
|
||||
ShuttingDown,
|
||||
|
||||
#[error("Timeout")]
|
||||
Timeout(Cow<'static, str>),
|
||||
|
||||
#[error(transparent)]
|
||||
InternalServerError(anyhow::Error),
|
||||
}
|
||||
@@ -70,10 +67,6 @@ impl ApiError {
|
||||
err.to_string(),
|
||||
StatusCode::SERVICE_UNAVAILABLE,
|
||||
),
|
||||
ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status(
|
||||
err.to_string(),
|
||||
StatusCode::REQUEST_TIMEOUT,
|
||||
),
|
||||
ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
|
||||
err.to_string(),
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
|
||||
@@ -83,10 +83,6 @@ pub mod timeout;
|
||||
|
||||
pub mod sync;
|
||||
|
||||
pub mod failpoint_support;
|
||||
|
||||
pub mod yielding_loop;
|
||||
|
||||
/// This is a shortcut to embed the git SHA into binaries and avoid copying the same build script to all packages
|
||||
///
|
||||
/// we have several cases:
|
||||
|
||||
@@ -366,49 +366,6 @@ impl MonotonicCounter<Lsn> for RecordLsn {
|
||||
}
|
||||
}
|
||||
|
||||
/// Implements [`rand::distributions::uniform::UniformSampler`] so we can sample [`Lsn`]s.
|
||||
///
|
||||
/// This is used by the `pagebench` pageserver benchmarking tool.
|
||||
pub struct LsnSampler(<u64 as rand::distributions::uniform::SampleUniform>::Sampler);
|
||||
|
||||
impl rand::distributions::uniform::SampleUniform for Lsn {
|
||||
type Sampler = LsnSampler;
|
||||
}
|
||||
|
||||
impl rand::distributions::uniform::UniformSampler for LsnSampler {
|
||||
type X = Lsn;
|
||||
|
||||
fn new<B1, B2>(low: B1, high: B2) -> Self
|
||||
where
|
||||
B1: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
|
||||
B2: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
|
||||
{
|
||||
Self(
|
||||
<u64 as rand::distributions::uniform::SampleUniform>::Sampler::new(
|
||||
low.borrow().0,
|
||||
high.borrow().0,
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
fn new_inclusive<B1, B2>(low: B1, high: B2) -> Self
|
||||
where
|
||||
B1: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
|
||||
B2: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
|
||||
{
|
||||
Self(
|
||||
<u64 as rand::distributions::uniform::SampleUniform>::Sampler::new_inclusive(
|
||||
low.borrow().0,
|
||||
high.borrow().0,
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
fn sample<R: rand::prelude::Rng + ?Sized>(&self, rng: &mut R) -> Self::X {
|
||||
Lsn(self.0.sample(rng))
|
||||
}
|
||||
}
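//
// Hedged usage sketch (bounds are illustrative): with the sampler above in place,
// the standard `rand` range API works directly on `Lsn` values, e.g.
//
//     let lsn = rng.gen_range(Lsn(0x1000)..Lsn(0x2000));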
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::bin_ser::BeSer;
|
||||
|
||||
@@ -15,12 +15,6 @@ pub struct Gate {
|
||||
name: String,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for Gate {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "Gate<{}>", self.name)
|
||||
}
|
||||
}
|
||||
|
||||
/// RAII guard for a [`Gate`]: as long as this exists, calls to [`Gate::close`] will
|
||||
/// not complete.
|
||||
#[derive(Debug)]
|
||||
|
||||
@@ -1,35 +0,0 @@
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum YieldingLoopError {
|
||||
#[error("Cancelled")]
|
||||
Cancelled,
|
||||
}
|
||||
|
||||
/// Helper for long synchronous loops, e.g. over all tenants in the system. Periodically
|
||||
/// yields to avoid blocking the executor, and after resuming checks the provided
|
||||
/// cancellation token to drop out promptly on shutdown.
|
||||
#[inline(always)]
|
||||
pub async fn yielding_loop<I, T, F>(
|
||||
interval: usize,
|
||||
cancel: &CancellationToken,
|
||||
iter: I,
|
||||
mut visitor: F,
|
||||
) -> Result<(), YieldingLoopError>
|
||||
where
|
||||
I: Iterator<Item = T>,
|
||||
F: FnMut(T),
|
||||
{
|
||||
for (i, item) in iter.enumerate() {
|
||||
visitor(item);
|
||||
|
||||
// Note the parentheses: `%` binds tighter than `+`, so `i + 1 % interval` would not do what is intended.
if (i + 1) % interval == 0 {
|
||||
tokio::task::yield_now().await;
|
||||
if cancel.is_cancelled() {
|
||||
return Err(YieldingLoopError::Cancelled);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
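//
// Hedged usage sketch (the interval, iterator, and visitor are illustrative):
// yield to the executor every 1000 items and bail out early on shutdown.
//
//     yielding_loop(1000, &cancel, tenants.iter(), |tenant| {
//         collect_stats(tenant);
//     })
//     .await?;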
|
||||
@@ -446,11 +446,12 @@ impl Runner {
|
||||
if let Some(t) = self.last_upscale_request_at {
|
||||
let elapsed = t.elapsed();
|
||||
if elapsed < Duration::from_secs(1) {
|
||||
// *Ideally* we'd like to log here that we're ignoring the fact that the
|
||||
// memory stats are too high, but in practice this can result in
|
||||
// spamming the logs with repetitive messages about ignoring the signal
|
||||
//
|
||||
// See https://github.com/neondatabase/neon/issues/5865 for more.
|
||||
info!(
|
||||
elapsed_millis = elapsed.as_millis(),
|
||||
avg_non_reclaimable = bytes_to_mebibytes(cgroup_mem_stat.avg_non_reclaimable),
|
||||
threshold = bytes_to_mebibytes(cgroup.threshold),
|
||||
"cgroup memory stats are high enough to upscale but too soon to forward the request, ignoring",
|
||||
);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,2 +1 @@
|
||||
#include "postgres.h"
|
||||
#include "walproposer.h"
|
||||
|
||||
@@ -8,12 +8,12 @@ use std::ffi::CString;
|
||||
|
||||
use crate::bindings::uint32;
|
||||
use crate::bindings::walproposer_api;
|
||||
use crate::bindings::NeonWALReadResult;
|
||||
use crate::bindings::PGAsyncReadResult;
|
||||
use crate::bindings::PGAsyncWriteResult;
|
||||
use crate::bindings::Safekeeper;
|
||||
use crate::bindings::Size;
|
||||
use crate::bindings::StringInfoData;
|
||||
use crate::bindings::TimeLineID;
|
||||
use crate::bindings::TimestampTz;
|
||||
use crate::bindings::WalProposer;
|
||||
use crate::bindings::WalProposerConnStatusType;
|
||||
@@ -178,11 +178,31 @@ extern "C" fn conn_blocking_write(
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bool {
|
||||
extern "C" fn recovery_download(
|
||||
sk: *mut Safekeeper,
|
||||
_timeline: TimeLineID,
|
||||
startpos: XLogRecPtr,
|
||||
endpos: XLogRecPtr,
|
||||
) -> bool {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).recovery_download(&mut (*wp), &mut (*sk))
|
||||
(*api).recovery_download(&mut (*sk), startpos, endpos)
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::unnecessary_cast)]
|
||||
extern "C" fn wal_read(
|
||||
sk: *mut Safekeeper,
|
||||
buf: *mut ::std::os::raw::c_char,
|
||||
startptr: XLogRecPtr,
|
||||
count: Size,
|
||||
) {
|
||||
unsafe {
|
||||
let buf = std::slice::from_raw_parts_mut(buf as *mut u8, count);
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).wal_read(&mut (*sk), buf, startptr)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -194,28 +214,11 @@ extern "C" fn wal_reader_allocate(sk: *mut Safekeeper) {
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::unnecessary_cast)]
|
||||
extern "C" fn wal_read(
|
||||
sk: *mut Safekeeper,
|
||||
buf: *mut ::std::os::raw::c_char,
|
||||
startptr: XLogRecPtr,
|
||||
count: Size,
|
||||
_errmsg: *mut *mut ::std::os::raw::c_char,
|
||||
) -> NeonWALReadResult {
|
||||
extern "C" fn free_event_set(wp: *mut WalProposer) {
|
||||
unsafe {
|
||||
let buf = std::slice::from_raw_parts_mut(buf as *mut u8, count);
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
// TODO: errmsg is not forwarded
|
||||
(*api).wal_read(&mut (*sk), buf, startptr)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn wal_reader_events(sk: *mut Safekeeper) -> uint32 {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).wal_reader_events(&mut (*sk))
|
||||
(*api).free_event_set(&mut (*wp));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -235,14 +238,6 @@ extern "C" fn update_event_set(sk: *mut Safekeeper, events: uint32) {
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn active_state_update_event_set(sk: *mut Safekeeper) {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).active_state_update_event_set(&mut (*sk));
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn add_safekeeper_event_set(sk: *mut Safekeeper, events: uint32) {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
@@ -251,14 +246,6 @@ extern "C" fn add_safekeeper_event_set(sk: *mut Safekeeper, events: uint32) {
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn rm_safekeeper_event_set(sk: *mut Safekeeper) {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).rm_safekeeper_event_set(&mut (*sk));
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn wait_event_set(
|
||||
wp: *mut WalProposer,
|
||||
timeout: ::std::os::raw::c_long,
|
||||
@@ -326,6 +313,14 @@ extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, commit_lsn: XLog
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn confirm_wal_streamed(wp: *mut WalProposer, lsn: XLogRecPtr) {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).confirm_wal_streamed(&mut (*wp), lsn)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn log_internal(
|
||||
wp: *mut WalProposer,
|
||||
level: ::std::os::raw::c_int,
|
||||
@@ -340,6 +335,14 @@ extern "C" fn log_internal(
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn after_election(wp: *mut WalProposer) {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).after_election(&mut (*wp))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum Level {
|
||||
Debug5,
|
||||
@@ -398,20 +401,20 @@ pub(crate) fn create_api() -> walproposer_api {
|
||||
conn_async_write: Some(conn_async_write),
|
||||
conn_blocking_write: Some(conn_blocking_write),
|
||||
recovery_download: Some(recovery_download),
|
||||
wal_reader_allocate: Some(wal_reader_allocate),
|
||||
wal_read: Some(wal_read),
|
||||
wal_reader_events: Some(wal_reader_events),
|
||||
wal_reader_allocate: Some(wal_reader_allocate),
|
||||
free_event_set: Some(free_event_set),
|
||||
init_event_set: Some(init_event_set),
|
||||
update_event_set: Some(update_event_set),
|
||||
active_state_update_event_set: Some(active_state_update_event_set),
|
||||
add_safekeeper_event_set: Some(add_safekeeper_event_set),
|
||||
rm_safekeeper_event_set: Some(rm_safekeeper_event_set),
|
||||
wait_event_set: Some(wait_event_set),
|
||||
strong_random: Some(strong_random),
|
||||
get_redo_start_lsn: Some(get_redo_start_lsn),
|
||||
finish_sync_safekeepers: Some(finish_sync_safekeepers),
|
||||
process_safekeeper_feedback: Some(process_safekeeper_feedback),
|
||||
confirm_wal_streamed: Some(confirm_wal_streamed),
|
||||
log_internal: Some(log_internal),
|
||||
after_election: Some(after_election),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -6,8 +6,8 @@ use utils::id::TenantTimelineId;
|
||||
use crate::{
|
||||
api_bindings::{create_api, take_vec_u8, Level},
|
||||
bindings::{
|
||||
NeonWALReadResult, Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate,
|
||||
WalProposerFree, WalProposerStart,
|
||||
Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate, WalProposerFree,
|
||||
WalProposerStart,
|
||||
},
|
||||
};
|
||||
|
||||
@@ -86,19 +86,19 @@ pub trait ApiImpl {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn recovery_download(&self, _wp: &mut WalProposer, _sk: &mut Safekeeper) -> bool {
|
||||
fn recovery_download(&self, _sk: &mut Safekeeper, _startpos: u64, _endpos: u64) -> bool {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn wal_reader_allocate(&self, _sk: &mut Safekeeper) -> NeonWALReadResult {
|
||||
fn wal_read(&self, _sk: &mut Safekeeper, _buf: &mut [u8], _startpos: u64) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn wal_read(&self, _sk: &mut Safekeeper, _buf: &mut [u8], _startpos: u64) -> NeonWALReadResult {
|
||||
fn wal_reader_allocate(&self, _sk: &mut Safekeeper) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn wal_reader_events(&self, _sk: &mut Safekeeper) -> u32 {
|
||||
fn free_event_set(&self, _wp: &mut WalProposer) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
@@ -110,18 +110,10 @@ pub trait ApiImpl {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn active_state_update_event_set(&self, _sk: &mut Safekeeper) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn add_safekeeper_event_set(&self, _sk: &mut Safekeeper, _events_mask: u32) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn rm_safekeeper_event_set(&self, _sk: &mut Safekeeper) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn wait_event_set(&self, _wp: &mut WalProposer, _timeout_millis: i64) -> WaitResult {
|
||||
todo!()
|
||||
}
|
||||
@@ -142,6 +134,10 @@ pub trait ApiImpl {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn confirm_wal_streamed(&self, _wp: &mut WalProposer, _lsn: u64) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn log_internal(&self, _wp: &mut WalProposer, _level: Level, _msg: &str) {
|
||||
todo!()
|
||||
}
|
||||
@@ -244,7 +240,6 @@ impl Drop for Wrapper {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use core::panic;
|
||||
use std::{
|
||||
cell::Cell,
|
||||
sync::{atomic::AtomicUsize, mpsc::sync_channel},
|
||||
@@ -252,7 +247,7 @@ mod tests {
|
||||
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper};
|
||||
use crate::{api_bindings::Level, walproposer::Wrapper};
|
||||
|
||||
use super::ApiImpl;
|
||||
|
||||
@@ -360,17 +355,12 @@ mod tests {
|
||||
true
|
||||
}
|
||||
|
||||
fn recovery_download(
|
||||
&self,
|
||||
_wp: &mut crate::bindings::WalProposer,
|
||||
_sk: &mut crate::bindings::Safekeeper,
|
||||
) -> bool {
|
||||
true
|
||||
fn wal_reader_allocate(&self, _: &mut crate::bindings::Safekeeper) {
|
||||
println!("wal_reader_allocate")
|
||||
}
|
||||
|
||||
fn wal_reader_allocate(&self, _: &mut crate::bindings::Safekeeper) -> NeonWALReadResult {
|
||||
println!("wal_reader_allocate");
|
||||
crate::bindings::NeonWALReadResult_NEON_WALREAD_SUCCESS
|
||||
fn free_event_set(&self, _: &mut crate::bindings::WalProposer) {
|
||||
println!("free_event_set")
|
||||
}
|
||||
|
||||
fn init_event_set(&self, _: &mut crate::bindings::WalProposer) {
|
||||
@@ -393,13 +383,6 @@ mod tests {
|
||||
self.wait_events.set(WaitEventsData { sk, event_mask });
|
||||
}
|
||||
|
||||
fn rm_safekeeper_event_set(&self, sk: &mut crate::bindings::Safekeeper) {
|
||||
println!(
|
||||
"rm_safekeeper_event_set, sk={:?}",
|
||||
sk as *mut crate::bindings::Safekeeper
|
||||
);
|
||||
}
|
||||
|
||||
fn wait_event_set(
|
||||
&self,
|
||||
_: &mut crate::bindings::WalProposer,
|
||||
@@ -425,7 +408,7 @@ mod tests {
|
||||
}
|
||||
|
||||
fn log_internal(&self, _wp: &mut crate::bindings::WalProposer, level: Level, msg: &str) {
|
||||
println!("wp_log[{}] {}", level, msg);
|
||||
println!("walprop_log[{}] {}", level, msg);
|
||||
}
|
||||
|
||||
fn after_election(&self, _wp: &mut crate::bindings::WalProposer) {
|
||||
|
||||
@@ -63,7 +63,6 @@ thiserror.workspace = true
|
||||
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
|
||||
tokio-io-timeout.workspace = true
|
||||
tokio-postgres.workspace = true
|
||||
tokio-stream.workspace = true
|
||||
tokio-util.workspace = true
|
||||
toml_edit = { workspace = true, features = [ "serde" ] }
|
||||
tracing.workspace = true
|
||||
|
||||
@@ -13,7 +13,6 @@ use bytes::{Buf, Bytes};
|
||||
use pageserver::{
|
||||
config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager,
|
||||
};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use utils::{id::TenantId, lsn::Lsn};
|
||||
|
||||
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
|
||||
@@ -27,9 +26,9 @@ fn redo_scenarios(c: &mut Criterion) {
|
||||
|
||||
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
|
||||
let conf = Box::leak(Box::new(conf));
|
||||
let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
|
||||
let tenant_id = TenantId::generate();
|
||||
|
||||
let manager = PostgresRedoManager::new(conf, tenant_shard_id);
|
||||
let manager = PostgresRedoManager::new(conf, tenant_id);
|
||||
|
||||
let manager = Arc::new(manager);
|
||||
|
||||
|
||||
@@ -12,11 +12,3 @@ reqwest.workspace = true
|
||||
utils.workspace = true
|
||||
serde.workspace = true
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
tokio-postgres.workspace = true
|
||||
tokio-stream.workspace = true
|
||||
tokio.workspace = true
|
||||
futures.workspace = true
|
||||
tokio-util.workspace = true
|
||||
anyhow.workspace = true
|
||||
postgres.workspace = true
|
||||
bytes.workspace = true
|
||||
|
||||
@@ -1,2 +1 @@
|
||||
pub mod mgmt_api;
|
||||
pub mod page_service;
|
||||
|
||||
@@ -1,12 +1,10 @@
|
||||
use pageserver_api::{models::*, shard::TenantShardId};
|
||||
use pageserver_api::models::*;
|
||||
use reqwest::{IntoUrl, Method};
|
||||
use utils::{
|
||||
http::error::HttpErrorBody,
|
||||
id::{TenantId, TimelineId},
|
||||
};
|
||||
|
||||
pub mod util;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Client {
|
||||
mgmt_api_endpoint: String,
|
||||
@@ -28,12 +26,14 @@ pub enum Error {
|
||||
|
||||
pub type Result<T> = std::result::Result<T, Error>;
|
||||
|
||||
pub(crate) trait ResponseErrorMessageExt: Sized {
|
||||
#[async_trait::async_trait]
|
||||
pub trait ResponseErrorMessageExt: Sized {
|
||||
async fn error_from_body(self) -> Result<Self>;
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ResponseErrorMessageExt for reqwest::Response {
|
||||
async fn error_from_body(self) -> Result<Self> {
|
||||
async fn error_from_body(mut self) -> Result<Self> {
|
||||
let status = self.status();
|
||||
if !(status.is_client_error() || status.is_server_error()) {
|
||||
return Ok(self);
|
||||
@@ -49,11 +49,6 @@ impl ResponseErrorMessageExt for reqwest::Response {
|
||||
}
|
||||
}
|
||||
|
||||
pub enum ForceAwaitLogicalSize {
|
||||
Yes,
|
||||
No,
|
||||
}
|
||||
|
||||
impl Client {
|
||||
pub fn new(mgmt_api_endpoint: String, jwt: Option<&str>) -> Self {
|
||||
Self {
|
||||
@@ -69,18 +64,6 @@ impl Client {
|
||||
resp.json().await.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn tenant_details(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<pageserver_api::models::TenantDetails> {
|
||||
let uri = format!("{}/v1/tenant/{tenant_id}", self.mgmt_api_endpoint);
|
||||
self.get(uri)
|
||||
.await?
|
||||
.json()
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn list_timelines(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
@@ -97,18 +80,11 @@ impl Client {
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
force_await_logical_size: ForceAwaitLogicalSize,
|
||||
) -> Result<pageserver_api::models::TimelineInfo> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
|
||||
self.mgmt_api_endpoint
|
||||
);
|
||||
|
||||
let uri = match force_await_logical_size {
|
||||
ForceAwaitLogicalSize::Yes => format!("{}?force-await-logical-size={}", uri, true),
|
||||
ForceAwaitLogicalSize::No => uri,
|
||||
};
|
||||
|
||||
self.get(&uri)
|
||||
.await?
|
||||
.json()
|
||||
@@ -174,18 +150,6 @@ impl Client {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn tenant_secondary_download(&self, tenant_id: TenantShardId) -> Result<()> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{}/secondary/download",
|
||||
self.mgmt_api_endpoint, tenant_id
|
||||
);
|
||||
self.request(Method::POST, &uri, ())
|
||||
.await?
|
||||
.error_for_status()
|
||||
.map(|_| ())
|
||||
.map_err(|e| Error::ApiError(format!("{}", e)))
|
||||
}
|
||||
|
||||
pub async fn location_config(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
@@ -221,16 +185,4 @@ impl Client {
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{}/reset",
|
||||
self.mgmt_api_endpoint, tenant_shard_id
|
||||
);
|
||||
self.request(Method::POST, &uri, ())
|
||||
.await?
|
||||
.json()
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,49 +0,0 @@
|
||||
//! Helpers to do common higher-level tasks with the [`Client`].
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use tokio::task::JoinSet;
|
||||
use utils::id::{TenantId, TenantTimelineId};
|
||||
|
||||
use super::Client;
|
||||
|
||||
/// Retrieve a list of all of the pageserver's timelines.
|
||||
///
|
||||
/// Fails if there are sharded tenants present on the pageserver.
|
||||
pub async fn get_pageserver_tenant_timelines_unsharded(
|
||||
api_client: &Arc<Client>,
|
||||
) -> anyhow::Result<Vec<TenantTimelineId>> {
|
||||
let mut timelines: Vec<TenantTimelineId> = Vec::new();
|
||||
let mut tenants: Vec<TenantId> = Vec::new();
|
||||
for ti in api_client.list_tenants().await? {
|
||||
if !ti.id.is_unsharded() {
|
||||
anyhow::bail!(
|
||||
"only unsharded tenants are supported at this time: {}",
|
||||
ti.id
|
||||
);
|
||||
}
|
||||
tenants.push(ti.id.tenant_id)
|
||||
}
|
||||
let mut js = JoinSet::new();
|
||||
for tenant_id in tenants {
|
||||
js.spawn({
|
||||
let mgmt_api_client = Arc::clone(api_client);
|
||||
async move {
|
||||
(
|
||||
tenant_id,
|
||||
mgmt_api_client.tenant_details(tenant_id).await.unwrap(),
|
||||
)
|
||||
}
|
||||
});
|
||||
}
|
||||
while let Some(res) = js.join_next().await {
|
||||
let (tenant_id, details) = res.unwrap();
|
||||
for timeline_id in details.timelines {
|
||||
timelines.push(TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
});
|
||||
}
|
||||
}
|
||||
Ok(timelines)
|
||||
}
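//
// Hedged usage sketch (endpoint and JWT variables are illustrative):
//
//     let api_client = Arc::new(Client::new(mgmt_api_endpoint, jwt.as_deref()));
//     let timelines = get_pageserver_tenant_timelines_unsharded(&api_client).await?;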
|
||||
@@ -1,144 +0,0 @@
|
||||
use std::pin::Pin;
|
||||
|
||||
use futures::SinkExt;
|
||||
use pageserver_api::{
|
||||
models::{
|
||||
PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest,
|
||||
PagestreamGetPageResponse,
|
||||
},
|
||||
reltag::RelTag,
|
||||
};
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio_postgres::CopyOutStream;
|
||||
use tokio_stream::StreamExt;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
pub struct Client {
|
||||
client: tokio_postgres::Client,
|
||||
cancel_on_client_drop: Option<tokio_util::sync::DropGuard>,
|
||||
conn_task: JoinHandle<()>,
|
||||
}
|
||||
|
||||
pub struct BasebackupRequest {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub lsn: Option<Lsn>,
|
||||
pub gzip: bool,
|
||||
}
|
||||
|
||||
impl Client {
|
||||
pub async fn new(connstring: String) -> anyhow::Result<Self> {
|
||||
let (client, connection) = tokio_postgres::connect(&connstring, postgres::NoTls).await?;
|
||||
|
||||
let conn_task_cancel = CancellationToken::new();
|
||||
let conn_task = tokio::spawn({
|
||||
let conn_task_cancel = conn_task_cancel.clone();
|
||||
async move {
|
||||
tokio::select! {
|
||||
_ = conn_task_cancel.cancelled() => { }
|
||||
res = connection => {
|
||||
res.unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
Ok(Self {
|
||||
cancel_on_client_drop: Some(conn_task_cancel.drop_guard()),
|
||||
conn_task,
|
||||
client,
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn pagestream(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> anyhow::Result<PagestreamClient> {
|
||||
let copy_both: tokio_postgres::CopyBothDuplex<bytes::Bytes> = self
|
||||
.client
|
||||
.copy_both_simple(&format!("pagestream {tenant_id} {timeline_id}"))
|
||||
.await?;
|
||||
let Client {
|
||||
cancel_on_client_drop,
|
||||
conn_task,
|
||||
client: _,
|
||||
} = self;
|
||||
Ok(PagestreamClient {
|
||||
copy_both: Box::pin(copy_both),
|
||||
conn_task,
|
||||
cancel_on_client_drop,
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn basebackup(&self, req: &BasebackupRequest) -> anyhow::Result<CopyOutStream> {
|
||||
let BasebackupRequest {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
gzip,
|
||||
} = req;
|
||||
let mut args = Vec::with_capacity(5);
|
||||
args.push("basebackup".to_string());
|
||||
args.push(format!("{tenant_id}"));
|
||||
args.push(format!("{timeline_id}"));
|
||||
if let Some(lsn) = lsn {
|
||||
args.push(format!("{lsn}"));
|
||||
}
|
||||
if *gzip {
|
||||
args.push("--gzip".to_string())
|
||||
}
|
||||
Ok(self.client.copy_out(&args.join(" ")).await?)
|
||||
}
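// For reference, a hedged example of the command string built above
// (IDs and LSN are illustrative): "basebackup <tenant_id> <timeline_id> 0/169C3D8 --gzip".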
|
||||
}
|
||||
|
||||
/// Create using [`Client::pagestream`].
|
||||
pub struct PagestreamClient {
|
||||
copy_both: Pin<Box<tokio_postgres::CopyBothDuplex<bytes::Bytes>>>,
|
||||
cancel_on_client_drop: Option<tokio_util::sync::DropGuard>,
|
||||
conn_task: JoinHandle<()>,
|
||||
}
|
||||
|
||||
pub struct RelTagBlockNo {
|
||||
pub rel_tag: RelTag,
|
||||
pub block_no: u32,
|
||||
}
|
||||
|
||||
impl PagestreamClient {
|
||||
pub async fn shutdown(mut self) {
|
||||
let _ = self.cancel_on_client_drop.take();
|
||||
self.conn_task.await.unwrap();
|
||||
}
|
||||
|
||||
pub async fn getpage(
|
||||
&mut self,
|
||||
req: PagestreamGetPageRequest,
|
||||
) -> anyhow::Result<PagestreamGetPageResponse> {
|
||||
let req = PagestreamFeMessage::GetPage(req);
|
||||
let req: bytes::Bytes = req.serialize();
|
||||
// let mut req = tokio_util::io::ReaderStream::new(&req);
|
||||
let mut req = tokio_stream::once(Ok(req));
|
||||
|
||||
self.copy_both.send_all(&mut req).await?;
|
||||
|
||||
let next: Option<Result<bytes::Bytes, _>> = self.copy_both.next().await;
|
||||
let next: bytes::Bytes = next.unwrap()?;
|
||||
|
||||
let msg = PagestreamBeMessage::deserialize(next)?;
|
||||
match msg {
|
||||
PagestreamBeMessage::GetPage(p) => Ok(p),
|
||||
PagestreamBeMessage::Error(e) => anyhow::bail!("Error: {:?}", e),
|
||||
PagestreamBeMessage::Exists(_)
|
||||
| PagestreamBeMessage::Nblocks(_)
|
||||
| PagestreamBeMessage::DbSize(_) => {
|
||||
anyhow::bail!(
|
||||
"unexpected be message kind in response to getpage request: {}",
|
||||
msg.kind()
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
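//
// Hedged usage sketch (request field names are assumed from this vintage of
// `pageserver_api::models` and may differ):
//
//     let mut pagestream = client.pagestream(tenant_id, timeline_id).await?;
//     let page = pagestream
//         .getpage(PagestreamGetPageRequest { latest: true, lsn, rel, blkno })
//         .await?;
//     pagestream.shutdown().await;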
|
||||
@@ -1,28 +0,0 @@
|
||||
[package]
|
||||
name = "pagebench"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
camino.workspace = true
|
||||
clap.workspace = true
|
||||
futures.workspace = true
|
||||
hdrhistogram.workspace = true
|
||||
humantime.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
rand.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
tracing.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-util.workspace = true
|
||||
|
||||
pageserver = { path = ".." }
|
||||
pageserver_client.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
utils = { path = "../../libs/utils/" }
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
@@ -1,275 +0,0 @@
|
||||
use anyhow::Context;
|
||||
use pageserver_client::mgmt_api::ForceAwaitLogicalSize;
|
||||
use pageserver_client::page_service::BasebackupRequest;
|
||||
|
||||
use utils::id::TenantTimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use rand::prelude::*;
|
||||
use tokio::sync::Barrier;
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::{debug, info, instrument};
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::ops::Range;
|
||||
use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::Instant;
|
||||
|
||||
use crate::util::tokio_thread_local_stats::AllThreadLocalStats;
|
||||
use crate::util::{request_stats, tokio_thread_local_stats};
|
||||
|
||||
/// basebackup@LatestLSN
|
||||
#[derive(clap::Parser)]
|
||||
pub(crate) struct Args {
|
||||
#[clap(long, default_value = "http://localhost:9898")]
|
||||
mgmt_api_endpoint: String,
|
||||
#[clap(long, default_value = "localhost:64000")]
|
||||
page_service_host_port: String,
|
||||
#[clap(long)]
|
||||
pageserver_jwt: Option<String>,
|
||||
#[clap(long, default_value = "1")]
|
||||
num_clients: NonZeroUsize,
|
||||
#[clap(long, default_value = "1.0")]
|
||||
gzip_probability: f64,
|
||||
#[clap(long)]
|
||||
runtime: Option<humantime::Duration>,
|
||||
#[clap(long)]
|
||||
limit_to_first_n_targets: Option<usize>,
|
||||
targets: Option<Vec<TenantTimelineId>>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
struct LiveStats {
|
||||
completed_requests: AtomicU64,
|
||||
}
|
||||
|
||||
impl LiveStats {
|
||||
fn inc(&self) {
|
||||
self.completed_requests.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
struct Target {
|
||||
timeline: TenantTimelineId,
|
||||
lsn_range: Option<Range<Lsn>>,
|
||||
}
|
||||
|
||||
#[derive(serde::Serialize)]
|
||||
struct Output {
|
||||
total: request_stats::Output,
|
||||
}
|
||||
|
||||
tokio_thread_local_stats::declare!(STATS: request_stats::Stats);
|
||||
|
||||
pub(crate) fn main(args: Args) -> anyhow::Result<()> {
|
||||
tokio_thread_local_stats::main!(STATS, move |thread_local_stats| {
|
||||
main_impl(args, thread_local_stats)
|
||||
})
|
||||
}
|
||||
|
||||
async fn main_impl(
|
||||
args: Args,
|
||||
all_thread_local_stats: AllThreadLocalStats<request_stats::Stats>,
|
||||
) -> anyhow::Result<()> {
|
||||
let args: &'static Args = Box::leak(Box::new(args));
|
||||
|
||||
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
|
||||
args.mgmt_api_endpoint.clone(),
|
||||
args.pageserver_jwt.as_deref(),
|
||||
));
|
||||
|
||||
// discover targets
|
||||
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
|
||||
&mgmt_api_client,
|
||||
crate::util::cli::targets::Spec {
|
||||
limit_to_first_n_targets: args.limit_to_first_n_targets,
|
||||
targets: args.targets.clone(),
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
let mut js = JoinSet::new();
|
||||
for timeline in &timelines {
|
||||
js.spawn({
|
||||
let timeline = *timeline;
|
||||
let info = mgmt_api_client
|
||||
.timeline_info(
|
||||
timeline.tenant_id,
|
||||
timeline.timeline_id,
|
||||
ForceAwaitLogicalSize::No,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
async move {
|
||||
anyhow::Ok(Target {
|
||||
timeline,
|
||||
// TODO: support lsn_range != latest LSN
|
||||
lsn_range: Some(info.last_record_lsn..(info.last_record_lsn + 1)),
|
||||
})
|
||||
}
|
||||
});
|
||||
}
|
||||
let mut all_targets: Vec<Target> = Vec::new();
|
||||
while let Some(res) = js.join_next().await {
|
||||
all_targets.push(res.unwrap().unwrap());
|
||||
}
|
||||
|
||||
let live_stats = Arc::new(LiveStats::default());
|
||||
|
||||
let num_client_tasks = timelines.len();
|
||||
let num_live_stats_dump = 1;
|
||||
let num_work_sender_tasks = 1;
|
||||
|
||||
let start_work_barrier = Arc::new(tokio::sync::Barrier::new(
|
||||
num_client_tasks + num_live_stats_dump + num_work_sender_tasks,
|
||||
));
|
||||
let all_work_done_barrier = Arc::new(tokio::sync::Barrier::new(num_client_tasks));
|
||||
|
||||
tokio::spawn({
|
||||
let stats = Arc::clone(&live_stats);
|
||||
let start_work_barrier = Arc::clone(&start_work_barrier);
|
||||
async move {
|
||||
start_work_barrier.wait().await;
|
||||
loop {
|
||||
let start = std::time::Instant::now();
|
||||
tokio::time::sleep(std::time::Duration::from_secs(1)).await;
|
||||
let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed);
|
||||
let elapsed = start.elapsed();
|
||||
info!(
|
||||
"RPS: {:.0}",
|
||||
completed_requests as f64 / elapsed.as_secs_f64()
|
||||
);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
let mut work_senders = HashMap::new();
|
||||
let mut tasks = Vec::new();
|
||||
for tl in &timelines {
|
||||
let (sender, receiver) = tokio::sync::mpsc::channel(1); // TODO: not sure what the implications of this are
|
||||
work_senders.insert(tl, sender);
|
||||
tasks.push(tokio::spawn(client(
|
||||
args,
|
||||
*tl,
|
||||
Arc::clone(&start_work_barrier),
|
||||
receiver,
|
||||
Arc::clone(&all_work_done_barrier),
|
||||
Arc::clone(&live_stats),
|
||||
)));
|
||||
}
|
||||
|
||||
let work_sender = async move {
|
||||
start_work_barrier.wait().await;
|
||||
loop {
|
||||
let (timeline, work) = {
|
||||
let mut rng = rand::thread_rng();
|
||||
let target = all_targets.choose(&mut rng).unwrap();
|
||||
let lsn = target.lsn_range.clone().map(|r| rng.gen_range(r));
|
||||
(
|
||||
target.timeline,
|
||||
Work {
|
||||
lsn,
|
||||
gzip: rng.gen_bool(args.gzip_probability),
|
||||
},
|
||||
)
|
||||
};
|
||||
let sender = work_senders.get(&timeline).unwrap();
|
||||
// TODO: what if this blocks?
|
||||
sender.send(work).await.ok().unwrap();
|
||||
}
|
||||
};
|
||||
|
||||
if let Some(runtime) = args.runtime {
|
||||
match tokio::time::timeout(runtime.into(), work_sender).await {
|
||||
Ok(()) => unreachable!("work sender never terminates"),
|
||||
Err(_timeout) => {
|
||||
// this implicitly drops the work_senders, making all the clients exit
|
||||
}
|
||||
}
|
||||
} else {
|
||||
work_sender.await;
|
||||
unreachable!("work sender never terminates");
|
||||
}
|
||||
|
||||
for t in tasks {
|
||||
t.await.unwrap();
|
||||
}
|
||||
|
||||
let output = Output {
|
||||
total: {
|
||||
let mut agg_stats = request_stats::Stats::new();
|
||||
for stats in all_thread_local_stats.lock().unwrap().iter() {
|
||||
let stats = stats.lock().unwrap();
|
||||
agg_stats.add(&stats);
|
||||
}
|
||||
agg_stats.output()
|
||||
},
|
||||
};
|
||||
|
||||
let output = serde_json::to_string_pretty(&output).unwrap();
|
||||
println!("{output}");
|
||||
|
||||
anyhow::Ok(())
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
struct Work {
|
||||
lsn: Option<Lsn>,
|
||||
gzip: bool,
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
async fn client(
|
||||
args: &'static Args,
|
||||
timeline: TenantTimelineId,
|
||||
start_work_barrier: Arc<Barrier>,
|
||||
mut work: tokio::sync::mpsc::Receiver<Work>,
|
||||
all_work_done_barrier: Arc<Barrier>,
|
||||
live_stats: Arc<LiveStats>,
|
||||
) {
|
||||
start_work_barrier.wait().await;
|
||||
|
||||
let client = pageserver_client::page_service::Client::new(crate::util::connstring::connstring(
|
||||
&args.page_service_host_port,
|
||||
args.pageserver_jwt.as_deref(),
|
||||
))
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
while let Some(Work { lsn, gzip }) = work.recv().await {
|
||||
let start = Instant::now();
|
||||
let copy_out_stream = client
|
||||
.basebackup(&BasebackupRequest {
|
||||
tenant_id: timeline.tenant_id,
|
||||
timeline_id: timeline.timeline_id,
|
||||
lsn,
|
||||
gzip,
|
||||
})
|
||||
.await
|
||||
.with_context(|| format!("start basebackup for {timeline}"))
|
||||
.unwrap();
|
||||
|
||||
use futures::StreamExt;
|
||||
let size = Arc::new(AtomicUsize::new(0));
|
||||
copy_out_stream
|
||||
.for_each({
|
||||
|r| {
|
||||
let size = Arc::clone(&size);
|
||||
async move {
|
||||
let size = Arc::clone(&size);
|
||||
size.fetch_add(r.unwrap().len(), Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
})
|
||||
.await;
|
||||
debug!("basebackup size is {} bytes", size.load(Ordering::Relaxed));
|
||||
let elapsed = start.elapsed();
|
||||
live_stats.inc();
|
||||
STATS.with(|stats| {
|
||||
stats.borrow().lock().unwrap().observe(elapsed).unwrap();
|
||||
});
|
||||
}
|
||||
|
||||
all_work_done_barrier.wait().await;
|
||||
}
|
||||
@@ -1,428 +0,0 @@
|
||||
use anyhow::Context;
|
||||
use camino::Utf8PathBuf;
|
||||
use futures::future::join_all;
|
||||
use pageserver::pgdatadir_mapping::key_to_rel_block;
|
||||
use pageserver::repository;
|
||||
use pageserver_api::key::is_rel_block_key;
|
||||
use pageserver_api::keyspace::KeySpaceAccum;
|
||||
use pageserver_api::models::PagestreamGetPageRequest;
|
||||
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::id::TenantTimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use rand::prelude::*;
|
||||
use tokio::sync::Barrier;
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::{info, instrument};
|
||||
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::future::Future;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::pin::Pin;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use crate::util::tokio_thread_local_stats::AllThreadLocalStats;
|
||||
use crate::util::{request_stats, tokio_thread_local_stats};
|
||||
|
||||
/// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace.
|
||||
#[derive(clap::Parser)]
|
||||
pub(crate) struct Args {
|
||||
#[clap(long, default_value = "http://localhost:9898")]
|
||||
mgmt_api_endpoint: String,
|
||||
#[clap(long, default_value = "postgres://postgres@localhost:64000")]
|
||||
page_service_connstring: String,
|
||||
#[clap(long)]
|
||||
pageserver_jwt: Option<String>,
|
||||
#[clap(long, default_value = "1")]
|
||||
num_clients: NonZeroUsize,
|
||||
#[clap(long)]
|
||||
runtime: Option<humantime::Duration>,
|
||||
#[clap(long)]
|
||||
per_target_rate_limit: Option<usize>,
|
||||
/// Probability for sending `latest=true` in the request (uniform distribution).
|
||||
#[clap(long, default_value = "1")]
|
||||
req_latest_probability: f64,
|
||||
#[clap(long)]
|
||||
limit_to_first_n_targets: Option<usize>,
|
||||
/// For large pageserver installations, enumerating the keyspace takes a lot of time.
|
||||
/// If specified, the given path is used to maintain a cache of the keyspace enumeration result.
|
||||
/// The cache is tagged and auto-invalidated by the tenant/timeline ids only.
|
||||
/// It doesn't get invalidated if the keyspace changes under the hood, e.g., due to new ingested data or compaction.
|
||||
#[clap(long)]
|
||||
keyspace_cache: Option<Utf8PathBuf>,
|
||||
targets: Option<Vec<TenantTimelineId>>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
struct LiveStats {
|
||||
completed_requests: AtomicU64,
|
||||
}
|
||||
|
||||
impl LiveStats {
|
||||
fn inc(&self) {
|
||||
self.completed_requests.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, serde::Serialize, serde::Deserialize)]
|
||||
struct KeyRange {
|
||||
timeline: TenantTimelineId,
|
||||
timeline_lsn: Lsn,
|
||||
start: i128,
|
||||
end: i128,
|
||||
}
|
||||
|
||||
impl KeyRange {
|
||||
fn len(&self) -> i128 {
|
||||
self.end - self.start
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(serde::Serialize)]
|
||||
struct Output {
|
||||
total: request_stats::Output,
|
||||
}
|
||||
|
||||
tokio_thread_local_stats::declare!(STATS: request_stats::Stats);
|
||||
|
||||
pub(crate) fn main(args: Args) -> anyhow::Result<()> {
|
||||
tokio_thread_local_stats::main!(STATS, move |thread_local_stats| {
|
||||
main_impl(args, thread_local_stats)
|
||||
})
|
||||
}
|
||||
|
||||
async fn main_impl(
|
||||
args: Args,
|
||||
all_thread_local_stats: AllThreadLocalStats<request_stats::Stats>,
|
||||
) -> anyhow::Result<()> {
|
||||
let args: &'static Args = Box::leak(Box::new(args));
|
||||
|
||||
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
|
||||
args.mgmt_api_endpoint.clone(),
|
||||
args.pageserver_jwt.as_deref(),
|
||||
));
|
||||
|
||||
// discover targets
|
||||
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
|
||||
&mgmt_api_client,
|
||||
crate::util::cli::targets::Spec {
|
||||
limit_to_first_n_targets: args.limit_to_first_n_targets,
|
||||
targets: args.targets.clone(),
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
#[derive(serde::Deserialize)]
|
||||
struct KeyspaceCacheDe {
|
||||
tag: Vec<TenantTimelineId>,
|
||||
data: Vec<KeyRange>,
|
||||
}
|
||||
#[derive(serde::Serialize)]
|
||||
struct KeyspaceCacheSer<'a> {
|
||||
tag: &'a [TenantTimelineId],
|
||||
data: &'a [KeyRange],
|
||||
}
|
||||
let cache = args
|
||||
.keyspace_cache
|
||||
.as_ref()
|
||||
.map(|keyspace_cache_file| {
|
||||
let contents = match std::fs::read(keyspace_cache_file) {
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
|
||||
return anyhow::Ok(None);
|
||||
}
|
||||
x => x.context("read keyspace cache file")?,
|
||||
};
|
||||
let cache: KeyspaceCacheDe =
|
||||
serde_json::from_slice(&contents).context("deserialize cache file")?;
|
||||
let tag_ok = HashSet::<TenantTimelineId>::from_iter(cache.tag.into_iter())
|
||||
== HashSet::from_iter(timelines.iter().cloned());
|
||||
info!("keyspace cache file matches tag: {tag_ok}");
|
||||
anyhow::Ok(if tag_ok { Some(cache.data) } else { None })
|
||||
})
|
||||
.transpose()?
|
||||
.flatten();
|
||||
let all_ranges: Vec<KeyRange> = if let Some(cached) = cache {
|
||||
info!("using keyspace cache file");
|
||||
cached
|
||||
} else {
|
||||
let mut js = JoinSet::new();
|
||||
for timeline in &timelines {
|
||||
js.spawn({
|
||||
let mgmt_api_client = Arc::clone(&mgmt_api_client);
|
||||
let timeline = *timeline;
|
||||
async move {
|
||||
let partitioning = mgmt_api_client
|
||||
.keyspace(timeline.tenant_id, timeline.timeline_id)
|
||||
.await?;
|
||||
let lsn = partitioning.at_lsn;
|
||||
let start = Instant::now();
|
||||
let mut filtered = KeySpaceAccum::new();
|
||||
// let's hope this is inlined and vectorized...
|
||||
// TODO: turn this loop into an is_rel_block_range() function.
|
||||
for r in partitioning.keys.ranges.iter() {
|
||||
let mut i = r.start;
|
||||
while i != r.end {
|
||||
if is_rel_block_key(&i) {
|
||||
filtered.add_key(i);
|
||||
}
|
||||
i = i.next();
|
||||
}
|
||||
}
|
||||
let filtered = filtered.to_keyspace();
|
||||
let filter_duration = start.elapsed();
|
||||
|
||||
anyhow::Ok((
|
||||
filter_duration,
|
||||
filtered.ranges.into_iter().map(move |r| KeyRange {
|
||||
timeline,
|
||||
timeline_lsn: lsn,
|
||||
start: r.start.to_i128(),
|
||||
end: r.end.to_i128(),
|
||||
}),
|
||||
))
|
||||
}
|
||||
});
|
||||
}
|
||||
let mut total_filter_duration = Duration::from_secs(0);
|
||||
let mut all_ranges: Vec<KeyRange> = Vec::new();
|
||||
while let Some(res) = js.join_next().await {
|
||||
let (filter_duration, range) = res.unwrap().unwrap();
|
||||
all_ranges.extend(range);
|
||||
total_filter_duration += filter_duration;
|
||||
}
|
||||
info!("filter duration: {}", total_filter_duration.as_secs_f64());
|
||||
if let Some(cachefile) = args.keyspace_cache.as_ref() {
|
||||
let cache = KeyspaceCacheSer {
|
||||
tag: &timelines,
|
||||
data: &all_ranges,
|
||||
};
|
||||
let bytes = serde_json::to_vec(&cache).context("serialize keyspace for cache file")?;
|
||||
std::fs::write(cachefile, bytes).context("write keyspace cache file to disk")?;
|
||||
info!("successfully wrote keyspace cache file");
|
||||
}
|
||||
all_ranges
|
||||
};
|
||||
|
||||
let live_stats = Arc::new(LiveStats::default());
|
||||
|
||||
let num_client_tasks = timelines.len();
|
||||
let num_live_stats_dump = 1;
|
||||
let num_work_sender_tasks = 1;
|
||||
let num_main_impl = 1;
|
||||
|
||||
let start_work_barrier = Arc::new(tokio::sync::Barrier::new(
|
||||
num_client_tasks + num_live_stats_dump + num_work_sender_tasks + num_main_impl,
|
||||
));
|
||||
|
||||
tokio::spawn({
|
||||
let stats = Arc::clone(&live_stats);
|
||||
let start_work_barrier = Arc::clone(&start_work_barrier);
|
||||
async move {
|
||||
start_work_barrier.wait().await;
|
||||
loop {
|
||||
let start = std::time::Instant::now();
|
||||
tokio::time::sleep(std::time::Duration::from_secs(1)).await;
|
||||
let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed);
|
||||
let elapsed = start.elapsed();
|
||||
info!(
|
||||
"RPS: {:.0}",
|
||||
completed_requests as f64 / elapsed.as_secs_f64()
|
||||
);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let mut work_senders: HashMap<TenantTimelineId, _> = HashMap::new();
|
||||
let mut tasks = Vec::new();
|
||||
for tl in &timelines {
|
||||
let (sender, receiver) = tokio::sync::mpsc::channel(10); // TODO: not sure what the implications of this are
|
||||
work_senders.insert(*tl, sender);
|
||||
tasks.push(tokio::spawn(client(
|
||||
args,
|
||||
*tl,
|
||||
Arc::clone(&start_work_barrier),
|
||||
receiver,
|
||||
Arc::clone(&live_stats),
|
||||
cancel.clone(),
|
||||
)));
|
||||
}
|
||||
|
||||
let work_sender: Pin<Box<dyn Send + Future<Output = ()>>> = {
|
||||
let start_work_barrier = start_work_barrier.clone();
|
||||
let cancel = cancel.clone();
|
||||
match args.per_target_rate_limit {
|
||||
None => Box::pin(async move {
|
||||
let weights = rand::distributions::weighted::WeightedIndex::new(
|
||||
all_ranges.iter().map(|v| v.len()),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
start_work_barrier.wait().await;
|
||||
|
||||
while !cancel.is_cancelled() {
|
||||
let (timeline, req) = {
|
||||
let mut rng = rand::thread_rng();
|
||||
let r = &all_ranges[weights.sample(&mut rng)];
|
||||
let key: i128 = rng.gen_range(r.start..r.end);
|
||||
let key = repository::Key::from_i128(key);
|
||||
let (rel_tag, block_no) =
|
||||
key_to_rel_block(key).expect("we filter non-rel-block keys out above");
|
||||
(
|
||||
r.timeline,
|
||||
PagestreamGetPageRequest {
|
||||
latest: rng.gen_bool(args.req_latest_probability),
|
||||
lsn: r.timeline_lsn,
|
||||
rel: rel_tag,
|
||||
blkno: block_no,
|
||||
},
|
||||
)
|
||||
};
|
||||
let sender = work_senders.get(&timeline).unwrap();
|
||||
// TODO: what if this blocks?
|
||||
if sender.send(req).await.is_err() {
|
||||
assert!(cancel.is_cancelled(), "client has gone away unexpectedly");
|
||||
}
|
||||
}
|
||||
}),
|
||||
Some(rps_limit) => Box::pin(async move {
|
||||
let period = Duration::from_secs_f64(1.0 / (rps_limit as f64));
|
||||
let make_timeline_task: &dyn Fn(
|
||||
TenantTimelineId,
|
||||
)
|
||||
-> Pin<Box<dyn Send + Future<Output = ()>>> = &|timeline| {
|
||||
let sender = work_senders.get(&timeline).unwrap();
|
||||
let ranges: Vec<KeyRange> = all_ranges
|
||||
.iter()
|
||||
.filter(|r| r.timeline == timeline)
|
||||
.cloned()
|
||||
.collect();
|
||||
let weights = rand::distributions::weighted::WeightedIndex::new(
|
||||
ranges.iter().map(|v| v.len()),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let cancel = cancel.clone();
|
||||
Box::pin(async move {
|
||||
let mut ticker = tokio::time::interval(period);
|
||||
ticker.set_missed_tick_behavior(
|
||||
/* TODO review this choice */
|
||||
tokio::time::MissedTickBehavior::Burst,
|
||||
);
|
||||
while !cancel.is_cancelled() {
|
||||
ticker.tick().await;
|
||||
let req = {
|
||||
let mut rng = rand::thread_rng();
|
||||
let r = &ranges[weights.sample(&mut rng)];
|
||||
let key: i128 = rng.gen_range(r.start..r.end);
|
||||
let key = repository::Key::from_i128(key);
|
||||
assert!(is_rel_block_key(&key));
|
||||
let (rel_tag, block_no) = key_to_rel_block(key)
|
||||
.expect("we filter non-rel-block keys out above");
|
||||
PagestreamGetPageRequest {
|
||||
latest: rng.gen_bool(args.req_latest_probability),
|
||||
lsn: r.timeline_lsn,
|
||||
rel: rel_tag,
|
||||
blkno: block_no,
|
||||
}
|
||||
};
|
||||
if sender.send(req).await.is_err() {
|
||||
assert!(cancel.is_cancelled(), "client has gone away unexpectedly");
|
||||
}
|
||||
}
|
||||
})
|
||||
};
|
||||
|
||||
let tasks: Vec<_> = work_senders
|
||||
.keys()
|
||||
.map(|tl| make_timeline_task(*tl))
|
||||
.collect();
|
||||
|
||||
start_work_barrier.wait().await;
|
||||
|
||||
join_all(tasks).await;
|
||||
}),
|
||||
}
|
||||
};
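
// Illustrative sketch, not part of the original diff: the WeightedIndex built above
// picks a key range with probability proportional to its length, and a key is then
// drawn uniformly within the chosen range. The toy ranges below are made up for
// illustration only.
#[allow(dead_code)]
fn example_weighted_key_sampling() {
    use rand::prelude::*;
    let ranges = [(0i128, 10i128), (10, 1_000)];
    let weights = rand::distributions::weighted::WeightedIndex::new(
        ranges.iter().map(|(start, end)| end - start),
    )
    .unwrap();
    let mut rng = rand::thread_rng();
    // ~1% of samples land in the first range, ~99% in the second.
    let (start, end) = ranges[weights.sample(&mut rng)];
    let _key: i128 = rng.gen_range(start..end);
}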
|
||||
|
||||
let work_sender_task = tokio::spawn(work_sender);
|
||||
|
||||
if let Some(runtime) = args.runtime {
|
||||
info!("waiting for everything to become ready");
|
||||
start_work_barrier.wait().await;
|
||||
info!("work started");
|
||||
tokio::time::sleep(runtime.into()).await;
|
||||
info!("runtime over, signalling cancellation");
|
||||
cancel.cancel();
|
||||
work_sender_task.await.unwrap();
|
||||
info!("work sender exited");
|
||||
} else {
|
||||
work_sender_task.await.unwrap();
|
||||
unreachable!("work sender never terminates");
|
||||
}
|
||||
|
||||
info!("joining clients");
|
||||
for t in tasks {
|
||||
t.await.unwrap();
|
||||
}
|
||||
|
||||
info!("all clients stopped");
|
||||
|
||||
let output = Output {
|
||||
total: {
|
||||
let mut agg_stats = request_stats::Stats::new();
|
||||
for stats in all_thread_local_stats.lock().unwrap().iter() {
|
||||
let stats = stats.lock().unwrap();
|
||||
agg_stats.add(&stats);
|
||||
}
|
||||
agg_stats.output()
|
||||
},
|
||||
};
|
||||
|
||||
let output = serde_json::to_string_pretty(&output).unwrap();
|
||||
println!("{output}");
|
||||
|
||||
anyhow::Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
async fn client(
|
||||
args: &'static Args,
|
||||
timeline: TenantTimelineId,
|
||||
start_work_barrier: Arc<Barrier>,
|
||||
mut work: tokio::sync::mpsc::Receiver<PagestreamGetPageRequest>,
|
||||
live_stats: Arc<LiveStats>,
|
||||
cancel: CancellationToken,
|
||||
) {
|
||||
let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
|
||||
.await
|
||||
.unwrap();
|
||||
let mut client = client
|
||||
.pagestream(timeline.tenant_id, timeline.timeline_id)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
start_work_barrier.wait().await;
|
||||
|
||||
while let Some(req) =
|
||||
tokio::select! { work = work.recv() => { work } , _ = cancel.cancelled() => { return; } }
|
||||
{
|
||||
let start = Instant::now();
|
||||
|
||||
let res = tokio::select! {
|
||||
res = client.getpage(req) => { res },
|
||||
_ = cancel.cancelled() => { return; }
|
||||
};
|
||||
res.with_context(|| format!("getpage for {timeline}"))
|
||||
.unwrap();
|
||||
let elapsed = start.elapsed();
|
||||
live_stats.inc();
|
||||
STATS.with(|stats| {
|
||||
stats.borrow().lock().unwrap().observe(elapsed).unwrap();
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -1,88 +0,0 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use humantime::Duration;
|
||||
use tokio::task::JoinSet;
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
use pageserver_client::mgmt_api::ForceAwaitLogicalSize;
|
||||
|
||||
#[derive(clap::Parser)]
|
||||
pub(crate) struct Args {
|
||||
#[clap(long, default_value = "http://localhost:9898")]
|
||||
mgmt_api_endpoint: String,
|
||||
#[clap(long, default_value = "localhost:64000")]
|
||||
page_service_host_port: String,
|
||||
#[clap(long)]
|
||||
pageserver_jwt: Option<String>,
|
||||
#[clap(
|
||||
long,
|
||||
help = "if specified, poll mgmt api to check whether init logical size calculation has completed"
|
||||
)]
|
||||
poll_for_completion: Option<Duration>,
|
||||
#[clap(long)]
|
||||
limit_to_first_n_targets: Option<usize>,
|
||||
targets: Option<Vec<TenantTimelineId>>,
|
||||
}
|
||||
|
||||
pub(crate) fn main(args: Args) -> anyhow::Result<()> {
|
||||
let rt = tokio::runtime::Builder::new_multi_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let main_task = rt.spawn(main_impl(args));
|
||||
rt.block_on(main_task).unwrap()
|
||||
}
|
||||
|
||||
async fn main_impl(args: Args) -> anyhow::Result<()> {
|
||||
let args: &'static Args = Box::leak(Box::new(args));
|
||||
|
||||
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
|
||||
args.mgmt_api_endpoint.clone(),
|
||||
args.pageserver_jwt.as_deref(),
|
||||
));
|
||||
|
||||
// discover targets
|
||||
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
|
||||
&mgmt_api_client,
|
||||
crate::util::cli::targets::Spec {
|
||||
limit_to_first_n_targets: args.limit_to_first_n_targets,
|
||||
targets: args.targets.clone(),
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
// kick it off
|
||||
|
||||
let mut js = JoinSet::new();
|
||||
for tl in timelines {
|
||||
let mgmt_api_client = Arc::clone(&mgmt_api_client);
|
||||
js.spawn(async move {
|
||||
let info = mgmt_api_client
|
||||
.timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Polling should not be strictly required here since we await
|
||||
// for the initial logical size. However, it's possible for the request
|
||||
// to land before the timeline is initialised. This results in an approximate
|
||||
// logical size.
|
||||
if let Some(period) = args.poll_for_completion {
|
||||
let mut ticker = tokio::time::interval(period.into());
|
||||
ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);
|
||||
let mut info = info;
|
||||
while !info.current_logical_size_is_accurate {
|
||||
ticker.tick().await;
|
||||
info = mgmt_api_client
|
||||
.timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
while let Some(res) = js.join_next().await {
|
||||
let _: () = res.unwrap();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,48 +0,0 @@
|
||||
use clap::Parser;
|
||||
use utils::logging;
|
||||
|
||||
/// Re-usable pieces of code that aren't CLI-specific.
|
||||
mod util {
|
||||
pub(crate) mod connstring;
|
||||
pub(crate) mod request_stats;
|
||||
#[macro_use]
|
||||
pub(crate) mod tokio_thread_local_stats;
|
||||
/// Re-usable pieces of CLI-specific code.
|
||||
pub(crate) mod cli {
|
||||
pub(crate) mod targets;
|
||||
}
|
||||
}
|
||||
|
||||
/// The pagebench CLI sub-commands, dispatched in [`main`] below.
|
||||
mod cmd {
|
||||
pub(super) mod basebackup;
|
||||
pub(super) mod getpage_latest_lsn;
|
||||
pub(super) mod trigger_initial_size_calculation;
|
||||
}
|
||||
|
||||
/// Component-level performance test for pageserver.
|
||||
#[derive(clap::Parser)]
|
||||
enum Args {
|
||||
Basebackup(cmd::basebackup::Args),
|
||||
GetPageLatestLsn(cmd::getpage_latest_lsn::Args),
|
||||
TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
|
||||
}
|
||||
|
||||
fn main() {
|
||||
logging::init(
|
||||
logging::LogFormat::Plain,
|
||||
logging::TracingErrorLayerEnablement::Disabled,
|
||||
logging::Output::Stderr,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let args = Args::parse();
|
||||
match args {
|
||||
Args::Basebackup(args) => cmd::basebackup::main(args),
|
||||
Args::GetPageLatestLsn(args) => cmd::getpage_latest_lsn::main(args),
|
||||
Args::TriggerInitialSizeCalculation(args) => {
|
||||
cmd::trigger_initial_size_calculation::main(args)
|
||||
}
|
||||
}
|
||||
.unwrap()
|
||||
}
|
||||
@@ -1,34 +0,0 @@
use std::sync::Arc;

use pageserver_client::mgmt_api;
use tracing::info;
use utils::id::TenantTimelineId;

pub(crate) struct Spec {
    pub(crate) limit_to_first_n_targets: Option<usize>,
    pub(crate) targets: Option<Vec<TenantTimelineId>>,
}

pub(crate) async fn discover(
    api_client: &Arc<mgmt_api::Client>,
    spec: Spec,
) -> anyhow::Result<Vec<TenantTimelineId>> {
    let mut timelines = if let Some(targets) = spec.targets {
        targets
    } else {
        mgmt_api::util::get_pageserver_tenant_timelines_unsharded(api_client).await?
    };

    if let Some(limit) = spec.limit_to_first_n_targets {
        timelines.sort(); // for determinism
        timelines.truncate(limit);
        if timelines.len() < limit {
            anyhow::bail!("pageserver has fewer than limit_to_first_n_targets={limit} tenants");
        }
    }

    info!("timelines:\n{:?}", timelines);
    info!("number of timelines:\n{:?}", timelines.len());

    Ok(timelines)
}
@@ -1,8 +0,0 @@
pub(crate) fn connstring(host_port: &str, jwt: Option<&str>) -> String {
    let colon_and_jwt = if let Some(jwt) = jwt {
        format!(":{jwt}") // TODO: urlescape
    } else {
        String::new()
    };
    format!("postgres://postgres{colon_and_jwt}@{host_port}")
}
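
// Illustrative sketch, not part of the original diff: the shapes this helper is
// expected to produce. The host/port and token values below are made up.
#[cfg(test)]
mod example {
    #[test]
    fn connstring_shapes() {
        assert_eq!(
            super::connstring("localhost:64000", None),
            "postgres://postgres@localhost:64000"
        );
        assert_eq!(
            super::connstring("localhost:64000", Some("secret")),
            "postgres://postgres:secret@localhost:64000"
        );
    }
}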
@@ -1,88 +0,0 @@
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::Context;
|
||||
|
||||
pub(crate) struct Stats {
|
||||
latency_histo: hdrhistogram::Histogram<u64>,
|
||||
}
|
||||
|
||||
impl Stats {
|
||||
pub(crate) fn new() -> Self {
|
||||
Self {
|
||||
// Initialize with fixed bounds so that we panic at runtime instead of resizing the histogram,
|
||||
// which would skew the benchmark results.
|
||||
latency_histo: hdrhistogram::Histogram::new_with_bounds(1, 1_000_000_000, 3).unwrap(),
|
||||
}
|
||||
}
|
||||
pub(crate) fn observe(&mut self, latency: Duration) -> anyhow::Result<()> {
|
||||
let micros: u64 = latency
|
||||
.as_micros()
|
||||
.try_into()
|
||||
.context("latency greater than u64")?;
|
||||
self.latency_histo
|
||||
.record(micros)
|
||||
.context("add to histogram")?;
|
||||
Ok(())
|
||||
}
|
||||
pub(crate) fn output(&self) -> Output {
|
||||
let latency_percentiles = std::array::from_fn(|idx| {
|
||||
let micros = self
|
||||
.latency_histo
|
||||
.value_at_percentile(LATENCY_PERCENTILES[idx]);
|
||||
Duration::from_micros(micros)
|
||||
});
|
||||
Output {
|
||||
request_count: self.latency_histo.len(),
|
||||
latency_mean: Duration::from_micros(self.latency_histo.mean() as u64),
|
||||
latency_percentiles: LatencyPercentiles {
|
||||
latency_percentiles,
|
||||
},
|
||||
}
|
||||
}
|
||||
pub(crate) fn add(&mut self, other: &Self) {
|
||||
let Self {
|
||||
ref mut latency_histo,
|
||||
} = self;
|
||||
latency_histo.add(&other.latency_histo).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for Stats {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
const LATENCY_PERCENTILES: [f64; 4] = [95.0, 99.00, 99.90, 99.99];
|
||||
|
||||
struct LatencyPercentiles {
|
||||
latency_percentiles: [Duration; 4],
|
||||
}
|
||||
|
||||
impl serde::Serialize for LatencyPercentiles {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
use serde::ser::SerializeMap;
|
||||
let mut ser = serializer.serialize_map(Some(LATENCY_PERCENTILES.len()))?;
|
||||
for (i, p) in LATENCY_PERCENTILES.iter().enumerate() {
|
||||
ser.serialize_entry(
|
||||
&format!("p{p}"),
|
||||
&format!(
|
||||
"{}",
|
||||
&humantime::format_duration(self.latency_percentiles[i])
|
||||
),
|
||||
)?;
|
||||
}
|
||||
ser.end()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(serde::Serialize)]
|
||||
pub(crate) struct Output {
|
||||
request_count: u64,
|
||||
#[serde(with = "humantime_serde")]
|
||||
latency_mean: Duration,
|
||||
latency_percentiles: LatencyPercentiles,
|
||||
}
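
// Illustrative sketch, not part of the original diff: the intended lifecycle of
// `Stats`. Per-request latencies are observed into the histogram, per-thread
// instances are merged with `add`, and `output()` yields the serializable summary.
// The latency values below are arbitrary.
#[allow(dead_code)]
fn example_stats_usage() -> anyhow::Result<()> {
    let mut thread_a = Stats::new();
    let mut thread_b = Stats::new();
    thread_a.observe(Duration::from_micros(250))?;
    thread_b.observe(Duration::from_millis(3))?;

    // Merge the per-thread histograms, then render the summary as JSON.
    let mut total = Stats::new();
    total.add(&thread_a);
    total.add(&thread_b);
    let output = total.output();
    println!("{}", serde_json::to_string_pretty(&output)?);
    Ok(())
}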
|
||||
@@ -1,45 +0,0 @@
|
||||
pub(crate) type ThreadLocalStats<T> = Arc<Mutex<T>>;
|
||||
pub(crate) type AllThreadLocalStats<T> = Arc<Mutex<Vec<ThreadLocalStats<T>>>>;
|
||||
|
||||
macro_rules! declare {
|
||||
($THREAD_LOCAL_NAME:ident: $T:ty) => {
|
||||
thread_local! {
|
||||
pub static $THREAD_LOCAL_NAME: std::cell::RefCell<crate::util::tokio_thread_local_stats::ThreadLocalStats<$T>> = std::cell::RefCell::new(
|
||||
std::sync::Arc::new(std::sync::Mutex::new(Default::default()))
|
||||
);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
pub(crate) use declare;
|
||||
|
||||
macro_rules! main {
|
||||
($THREAD_LOCAL_NAME:ident, $main_impl:expr) => {{
|
||||
let main_impl = $main_impl;
|
||||
let all = Arc::new(Mutex::new(Vec::new()));
|
||||
|
||||
let rt = tokio::runtime::Builder::new_multi_thread()
|
||||
.on_thread_start({
|
||||
let all = Arc::clone(&all);
|
||||
move || {
|
||||
// pre-initialize the thread local stats by accessing them
|
||||
// (some stats like requests_stats::Stats are quite costly to initialize,
|
||||
// we don't want to pay that cost during the measurement period)
|
||||
$THREAD_LOCAL_NAME.with(|stats| {
|
||||
let stats: Arc<_> = Arc::clone(&*stats.borrow());
|
||||
all.lock().unwrap().push(stats);
|
||||
});
|
||||
}
|
||||
})
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let main_task = rt.spawn(main_impl(all));
|
||||
rt.block_on(main_task).unwrap()
|
||||
}};
|
||||
}
|
||||
|
||||
pub(crate) use main;
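
// Illustrative sketch, not part of the original diff: how `declare!` and `main!` are
// combined by a benchmark command (mirroring cmd/getpage_latest_lsn.rs). `MY_STATS`
// and `run_impl` are hypothetical names used only for illustration.
//
// crate::util::tokio_thread_local_stats::declare!(MY_STATS: crate::util::request_stats::Stats);
//
// pub(crate) fn main(args: Args) -> anyhow::Result<()> {
//     // `main!` builds a multi-threaded runtime, registers every worker thread's
//     // thread-local `Stats` in the shared Vec, and blocks on the provided future.
//     crate::util::tokio_thread_local_stats::main!(MY_STATS, move |all_thread_local_stats| {
//         run_impl(args, all_thread_local_stats)
//     })
// }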
|
||||
@@ -23,7 +23,6 @@ use tracing::*;
|
||||
use tokio_tar::{Builder, EntryType, Header};
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use crate::pgdatadir_mapping::Version;
|
||||
use crate::tenant::Timeline;
|
||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||
|
||||
@@ -175,7 +174,7 @@ where
|
||||
] {
|
||||
for segno in self
|
||||
.timeline
|
||||
.list_slru_segments(kind, Version::Lsn(self.lsn), self.ctx)
|
||||
.list_slru_segments(kind, self.lsn, self.ctx)
|
||||
.await?
|
||||
{
|
||||
self.add_slru_segment(kind, segno).await?;
|
||||
@@ -193,7 +192,7 @@ where
|
||||
// Otherwise only include init forks of unlogged relations.
|
||||
let rels = self
|
||||
.timeline
|
||||
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
|
||||
.list_rels(spcnode, dbnode, self.lsn, self.ctx)
|
||||
.await?;
|
||||
for &rel in rels.iter() {
|
||||
// Send init fork as main fork to provide well formed empty
|
||||
@@ -268,7 +267,7 @@ where
|
||||
async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> {
|
||||
let nblocks = self
|
||||
.timeline
|
||||
.get_rel_size(src, Version::Lsn(self.lsn), false, self.ctx)
|
||||
.get_rel_size(src, self.lsn, false, self.ctx)
|
||||
.await?;
|
||||
|
||||
// If the relation is empty, create an empty file
|
||||
@@ -289,7 +288,7 @@ where
|
||||
for blknum in startblk..endblk {
|
||||
let img = self
|
||||
.timeline
|
||||
.get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), false, self.ctx)
|
||||
.get_rel_page_at_lsn(src, blknum, self.lsn, false, self.ctx)
|
||||
.await?;
|
||||
segment_data.extend_from_slice(&img[..]);
|
||||
}
|
||||
@@ -311,7 +310,7 @@ where
|
||||
async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
|
||||
let nblocks = self
|
||||
.timeline
|
||||
.get_slru_segment_size(slru, segno, Version::Lsn(self.lsn), self.ctx)
|
||||
.get_slru_segment_size(slru, segno, self.lsn, self.ctx)
|
||||
.await?;
|
||||
|
||||
let mut slru_buf: Vec<u8> = Vec::with_capacity(nblocks as usize * BLCKSZ as usize);
|
||||
@@ -353,7 +352,7 @@ where
|
||||
let relmap_img = if has_relmap_file {
|
||||
let img = self
|
||||
.timeline
|
||||
.get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
|
||||
.get_relmap_file(spcnode, dbnode, self.lsn, self.ctx)
|
||||
.await?;
|
||||
|
||||
ensure!(
|
||||
@@ -400,7 +399,7 @@ where
|
||||
if !has_relmap_file
|
||||
&& self
|
||||
.timeline
|
||||
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
|
||||
.list_rels(spcnode, dbnode, self.lsn, self.ctx)
|
||||
.await?
|
||||
.is_empty()
|
||||
{
|
||||
|
||||
@@ -31,7 +31,6 @@ use pageserver::{
|
||||
virtual_file,
|
||||
};
|
||||
use postgres_backend::AuthType;
|
||||
use utils::failpoint_support;
|
||||
use utils::logging::TracingErrorLayerEnablement;
|
||||
use utils::signals::ShutdownSignals;
|
||||
use utils::{
|
||||
@@ -127,7 +126,7 @@ fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
|
||||
// Initialize failpoints support
|
||||
let scenario = failpoint_support::init();
|
||||
let scenario = pageserver::failpoint_support::init();
|
||||
|
||||
// Basic initialization of things that don't change after startup
|
||||
virtual_file::init(conf.max_file_descriptors);
|
||||
|
||||
@@ -37,8 +37,8 @@ use crate::tenant::{
|
||||
TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
|
||||
};
|
||||
use crate::{
|
||||
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
|
||||
TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
|
||||
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_LOCATION_CONFIG_NAME,
|
||||
TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
|
||||
};
|
||||
|
||||
use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
|
||||
@@ -75,9 +75,6 @@ pub mod defaults {
|
||||
pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";
|
||||
|
||||
pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8;
|
||||
pub const DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY: usize = 1;
|
||||
|
||||
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
|
||||
|
||||
///
|
||||
/// Default built-in configuration file.
|
||||
@@ -91,7 +88,6 @@ pub mod defaults {
|
||||
#wait_lsn_timeout = '{DEFAULT_WAIT_LSN_TIMEOUT}'
|
||||
#wal_redo_timeout = '{DEFAULT_WAL_REDO_TIMEOUT}'
|
||||
|
||||
#page_cache_size = {DEFAULT_PAGE_CACHE_SIZE}
|
||||
#max_file_descriptors = {DEFAULT_MAX_FILE_DESCRIPTORS}
|
||||
|
||||
# initial superuser role name to use when creating a new tenant
|
||||
@@ -112,8 +108,6 @@ pub mod defaults {
|
||||
|
||||
#background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}'
|
||||
|
||||
#ingest_batch_size = {DEFAULT_INGEST_BATCH_SIZE}
|
||||
|
||||
[tenant_config]
|
||||
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
|
||||
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
|
||||
@@ -131,7 +125,6 @@ pub mod defaults {
|
||||
#gc_feedback = false
|
||||
|
||||
#heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY}
|
||||
#secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY}
|
||||
|
||||
[remote_storage]
|
||||
|
||||
@@ -240,13 +233,6 @@ pub struct PageServerConf {
|
||||
/// How many heatmap uploads may be done concurrently: lower values implicitly deprioritize
|
||||
/// heatmap uploads vs. other remote storage operations.
|
||||
pub heatmap_upload_concurrency: usize,
|
||||
|
||||
/// How many remote storage downloads may be done for secondary tenants concurrently. Implicitly
|
||||
/// deprioritises secondary downloads vs. remote storage operations for attached tenants.
|
||||
pub secondary_download_concurrency: usize,
|
||||
|
||||
/// Maximum number of WAL records to be ingested and committed at the same time
|
||||
pub ingest_batch_size: u64,
|
||||
}
|
||||
|
||||
/// We do not want to store this in a PageServerConf because the latter may be logged
|
||||
@@ -328,9 +314,6 @@ struct PageServerConfigBuilder {
|
||||
control_plane_emergency_mode: BuilderValue<bool>,
|
||||
|
||||
heatmap_upload_concurrency: BuilderValue<usize>,
|
||||
secondary_download_concurrency: BuilderValue<usize>,
|
||||
|
||||
ingest_batch_size: BuilderValue<u64>,
|
||||
}
|
||||
|
||||
impl Default for PageServerConfigBuilder {
|
||||
@@ -403,9 +386,6 @@ impl Default for PageServerConfigBuilder {
|
||||
control_plane_emergency_mode: Set(false),
|
||||
|
||||
heatmap_upload_concurrency: Set(DEFAULT_HEATMAP_UPLOAD_CONCURRENCY),
|
||||
secondary_download_concurrency: Set(DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY),
|
||||
|
||||
ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -554,14 +534,6 @@ impl PageServerConfigBuilder {
|
||||
self.heatmap_upload_concurrency = BuilderValue::Set(value)
|
||||
}
|
||||
|
||||
pub fn secondary_download_concurrency(&mut self, value: usize) {
|
||||
self.secondary_download_concurrency = BuilderValue::Set(value)
|
||||
}
|
||||
|
||||
pub fn ingest_batch_size(&mut self, ingest_batch_size: u64) {
|
||||
self.ingest_batch_size = BuilderValue::Set(ingest_batch_size)
|
||||
}
|
||||
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
let concurrent_tenant_warmup = self
|
||||
.concurrent_tenant_warmup
|
||||
@@ -660,15 +632,10 @@ impl PageServerConfigBuilder {
|
||||
control_plane_emergency_mode: self
|
||||
.control_plane_emergency_mode
|
||||
.ok_or(anyhow!("missing control_plane_emergency_mode"))?,
|
||||
|
||||
heatmap_upload_concurrency: self
|
||||
.heatmap_upload_concurrency
|
||||
.ok_or(anyhow!("missing heatmap_upload_concurrency"))?,
|
||||
secondary_download_concurrency: self
|
||||
.secondary_download_concurrency
|
||||
.ok_or(anyhow!("missing secondary_download_concurrency"))?,
|
||||
ingest_batch_size: self
|
||||
.ingest_batch_size
|
||||
.ok_or(anyhow!("missing ingest_batch_size"))?,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -726,11 +693,6 @@ impl PageServerConf {
|
||||
.join(TENANT_LOCATION_CONFIG_NAME)
|
||||
}
|
||||
|
||||
pub(crate) fn tenant_heatmap_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
|
||||
self.tenant_path(tenant_shard_id)
|
||||
.join(TENANT_HEATMAP_BASENAME)
|
||||
}
|
||||
|
||||
pub fn timelines_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
|
||||
self.tenant_path(tenant_shard_id)
|
||||
.join(TIMELINES_SEGMENT_NAME)
|
||||
@@ -916,10 +878,6 @@ impl PageServerConf {
|
||||
"heatmap_upload_concurrency" => {
|
||||
builder.heatmap_upload_concurrency(parse_toml_u64(key, item)? as usize)
|
||||
},
|
||||
"secondary_download_concurrency" => {
|
||||
builder.secondary_download_concurrency(parse_toml_u64(key, item)? as usize)
|
||||
},
|
||||
"ingest_batch_size" => builder.ingest_batch_size(parse_toml_u64(key, item)?),
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
}
|
||||
}
|
||||
@@ -991,8 +949,6 @@ impl PageServerConf {
|
||||
control_plane_api_token: None,
|
||||
control_plane_emergency_mode: false,
|
||||
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
|
||||
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
|
||||
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1221,9 +1177,7 @@ background_task_maximum_delay = '334 s'
|
||||
control_plane_api: None,
|
||||
control_plane_api_token: None,
|
||||
control_plane_emergency_mode: false,
|
||||
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
|
||||
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
|
||||
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
|
||||
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY
|
||||
},
|
||||
"Correct defaults should be used when no config values are provided"
|
||||
);
|
||||
@@ -1284,9 +1238,7 @@ background_task_maximum_delay = '334 s'
|
||||
control_plane_api: None,
|
||||
control_plane_api_token: None,
|
||||
control_plane_emergency_mode: false,
|
||||
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
|
||||
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
|
||||
ingest_batch_size: 100,
|
||||
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY
|
||||
},
|
||||
"Should be able to parse all basic config values correctly"
|
||||
);
|
||||
@@ -1516,7 +1468,6 @@ threshold = "20m"
|
||||
period: Duration::from_secs(10),
|
||||
#[cfg(feature = "testing")]
|
||||
mock_statvfs: None,
|
||||
eviction_order: crate::disk_usage_eviction_task::EvictionOrder::AbsoluteAccessed,
|
||||
})
|
||||
);
|
||||
match &conf.default_tenant_conf.eviction_policy {
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use futures::Future;
|
||||
use pageserver_api::{
|
||||
control_api::{
|
||||
ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
|
||||
@@ -29,14 +28,13 @@ pub enum RetryForeverError {
|
||||
ShuttingDown,
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
pub trait ControlPlaneGenerationsApi {
|
||||
fn re_attach(
|
||||
&self,
|
||||
) -> impl Future<Output = Result<HashMap<TenantShardId, Generation>, RetryForeverError>> + Send;
|
||||
fn validate(
|
||||
async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError>;
|
||||
async fn validate(
|
||||
&self,
|
||||
tenants: Vec<(TenantShardId, Generation)>,
|
||||
) -> impl Future<Output = Result<HashMap<TenantShardId, bool>, RetryForeverError>> + Send;
|
||||
) -> Result<HashMap<TenantShardId, bool>, RetryForeverError>;
|
||||
}
|
||||
|
||||
impl ControlPlaneClient {
|
||||
@@ -125,6 +123,7 @@ impl ControlPlaneClient {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ControlPlaneGenerationsApi for ControlPlaneClient {
|
||||
/// Block until we get a successful response, or error out if we are shut down
|
||||
async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
|
||||
|
||||
@@ -831,6 +831,7 @@ mod test {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ControlPlaneGenerationsApi for MockControlPlane {
|
||||
#[allow(clippy::diverging_sub_expression)] // False positive via async_trait
|
||||
async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
|
||||
|
||||
@@ -74,45 +74,6 @@ pub struct DiskUsageEvictionTaskConfig {
|
||||
pub period: Duration,
|
||||
#[cfg(feature = "testing")]
|
||||
pub mock_statvfs: Option<crate::statvfs::mock::Behavior>,
|
||||
/// Select sorting for evicted layers
|
||||
#[serde(default)]
|
||||
pub eviction_order: EvictionOrder,
|
||||
}
|
||||
|
||||
/// Selects the sort order for eviction candidates *after* per tenant `min_resident_size`
|
||||
/// partitioning.
|
||||
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[serde(tag = "type", content = "args")]
|
||||
pub enum EvictionOrder {
|
||||
/// Order the layers to be evicted by how recently they have been accessed in absolute
|
||||
/// time.
|
||||
///
|
||||
/// This strategy is unfair towards the slower-growing tenants when some tenants grow
|
||||
/// faster than others.
|
||||
#[default]
|
||||
AbsoluteAccessed,
|
||||
|
||||
/// Order the layers to be evicted by how recently they have been accessed relatively within
|
||||
/// the set of resident layers of a tenant.
|
||||
///
|
||||
/// This strategy will evict layers more fairly but is untested.
|
||||
RelativeAccessed {
|
||||
#[serde(default)]
|
||||
highest_layer_count_loses_first: bool,
|
||||
},
|
||||
}
|
||||
|
||||
impl EvictionOrder {
|
||||
/// Return true, if with [`Self::RelativeAccessed`] order the tenants with the highest layer
|
||||
/// counts should be the first ones to have their layers evicted.
|
||||
fn highest_layer_count_loses_first(&self) -> bool {
|
||||
match self {
|
||||
EvictionOrder::AbsoluteAccessed => false,
|
||||
EvictionOrder::RelativeAccessed {
|
||||
highest_layer_count_loses_first,
|
||||
} => *highest_layer_count_loses_first,
|
||||
}
|
||||
}
|
||||
}
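
// Illustrative sketch, not part of the original diff: the normalization that
// `RelativeAccessed` relies on in collect_eviction_candidates below. The layer
// count of 4 is an arbitrary example value.
#[allow(dead_code)]
fn example_relative_last_activity() {
    let resident_layers = 4usize;
    // With highest_layer_count_loses_first == false, the "fudge" of 1 keeps the
    // full 0.0..=1.0 range, so even the smallest tenant can lose a layer.
    let fudge = 1;
    let total = resident_layers.checked_sub(fudge).filter(|&x| x > 0).unwrap_or(1);
    let divider = total as f32;
    // Layers are visited from most- to least-recently accessed; the most recent
    // one gets 1.0 (evicted last), older ones get progressively smaller values.
    for i in 0..resident_layers {
        let relative_last_activity = (total - i) as f32 / divider;
        println!("layer {i}: relative_last_activity = {relative_last_activity}");
    }
}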
|
||||
|
||||
#[derive(Default)]
|
||||
@@ -231,14 +192,7 @@ async fn disk_usage_eviction_task_iteration(
|
||||
) -> anyhow::Result<()> {
|
||||
let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
|
||||
.context("get filesystem-level disk usage before evictions")?;
|
||||
let res = disk_usage_eviction_task_iteration_impl(
|
||||
state,
|
||||
storage,
|
||||
usage_pre,
|
||||
task_config.eviction_order,
|
||||
cancel,
|
||||
)
|
||||
.await;
|
||||
let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
|
||||
match res {
|
||||
Ok(outcome) => {
|
||||
debug!(?outcome, "disk_usage_eviction_iteration finished");
|
||||
@@ -324,7 +278,6 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
state: &State,
|
||||
_storage: &GenericRemoteStorage,
|
||||
usage_pre: U,
|
||||
eviction_order: EvictionOrder,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<IterationOutcome<U>> {
|
||||
// use tokio's mutex to get a Sync guard (instead of std::sync::Mutex)
|
||||
@@ -344,7 +297,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
"running disk usage based eviction due to pressure"
|
||||
);
|
||||
|
||||
let candidates = match collect_eviction_candidates(eviction_order, cancel).await? {
|
||||
let candidates = match collect_eviction_candidates(cancel).await? {
|
||||
EvictionCandidates::Cancelled => {
|
||||
return Ok(IterationOutcome::Cancelled);
|
||||
}
|
||||
@@ -354,16 +307,16 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
// Debug-log the list of candidates
|
||||
let now = SystemTime::now();
|
||||
for (i, (partition, candidate)) in candidates.iter().enumerate() {
|
||||
let nth = i + 1;
|
||||
let desc = candidate.layer.layer_desc();
|
||||
let total_candidates = candidates.len();
|
||||
let size = desc.file_size;
|
||||
let rel = candidate.relative_last_activity;
|
||||
debug!(
|
||||
"cand {nth}/{total_candidates}: size={size}, rel_last_activity={rel}, no_access_for={}us, partition={partition:?}, {}/{}/{}",
|
||||
"cand {}/{}: size={}, no_access_for={}us, partition={:?}, {}/{}/{}",
|
||||
i + 1,
|
||||
candidates.len(),
|
||||
desc.file_size,
|
||||
now.duration_since(candidate.last_activity_ts)
|
||||
.unwrap()
|
||||
.as_micros(),
|
||||
partition,
|
||||
desc.tenant_shard_id,
|
||||
desc.timeline_id,
|
||||
candidate.layer,
|
||||
@@ -506,7 +459,6 @@ struct EvictionCandidate {
|
||||
timeline: Arc<Timeline>,
|
||||
layer: Layer,
|
||||
last_activity_ts: SystemTime,
|
||||
relative_last_activity: finite_f32::FiniteF32,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
|
||||
@@ -526,24 +478,24 @@ enum EvictionCandidates {
|
||||
/// order. A caller that evicts in that order, until pressure is relieved, implements
|
||||
/// the eviction policy outlined in the module comment.
|
||||
///
|
||||
/// # Example with EvictionOrder::AbsoluteAccessed
|
||||
/// # Example
|
||||
///
|
||||
/// Imagine that there are two tenants, A and B, with five layers each, a-e.
|
||||
/// Each layer has size 100, and both tenant's min_resident_size is 150.
|
||||
/// The eviction order would be
|
||||
///
|
||||
/// ```text
|
||||
/// partition last_activity_ts tenant/layer
|
||||
/// Above 18:30 A/c
|
||||
/// Above 19:00 A/b
|
||||
/// Above 18:29 B/c
|
||||
/// Above 19:05 B/b
|
||||
/// Above 20:00 B/a
|
||||
/// Above 20:03 A/a
|
||||
/// Below 20:30 A/d
|
||||
/// Below 20:40 B/d
|
||||
/// Below 20:45 B/e
|
||||
/// Below 20:58 A/e
|
||||
/// partition last_activity_ts tenant/layer
|
||||
/// Above 18:30 A/c
|
||||
/// Above 19:00 A/b
|
||||
/// Above 18:29 B/c
|
||||
/// Above 19:05 B/b
|
||||
/// Above 20:00 B/a
|
||||
/// Above 20:03 A/a
|
||||
/// Below 20:30 A/d
|
||||
/// Below 20:40 B/d
|
||||
/// Below 20:45 B/e
|
||||
/// Below 20:58 A/e
|
||||
/// ```
|
||||
///
|
||||
/// Now, if we need to evict 300 bytes to relieve pressure, we'd evict `A/c, A/b, B/c`.
|
||||
@@ -553,77 +505,7 @@ enum EvictionCandidates {
|
||||
/// `A/c, A/b, B/c, B/b, B/a, A/a, A/d, B/d, B/e`, reaching into the `Below` partition
|
||||
/// after exhausting the `Above` partition.
|
||||
/// So, we did not respect each tenant's min_resident_size.
|
||||
///
|
||||
/// # Example with EvictionOrder::RelativeAccessed
|
||||
///
|
||||
/// ```text
|
||||
/// partition relative_age last_activity_ts tenant/layer
|
||||
/// Above 0/4 18:30 A/c
|
||||
/// Above 0/4 18:29 B/c
|
||||
/// Above 1/4 19:00 A/b
|
||||
/// Above 1/4 19:05 B/b
|
||||
/// Above 2/4 20:00 B/a
|
||||
/// Above 2/4 20:03 A/a
|
||||
/// Below 3/4 20:30 A/d
|
||||
/// Below 3/4 20:40 B/d
|
||||
/// Below 4/4 20:45 B/e
|
||||
/// Below 4/4 20:58 A/e
|
||||
/// ```
|
||||
///
|
||||
/// With tenants having the same number of layers the picture does not change much. The same with
|
||||
/// A having many more layers **resident** (not all of them listed):
|
||||
///
|
||||
/// ```text
|
||||
/// Above 0/100 18:30 A/c
|
||||
/// Above 0/4 18:29 B/c
|
||||
/// Above 1/100 19:00 A/b
|
||||
/// Above 2/100 20:03 A/a
|
||||
/// Above 3/100 20:03 A/nth_3
|
||||
/// Above 4/100 20:03 A/nth_4
|
||||
/// ...
|
||||
/// Above 1/4 19:05 B/b
|
||||
/// Above 25/100 20:04 A/nth_25
|
||||
/// ...
|
||||
/// Above 2/4 20:00 B/a
|
||||
/// Above 50/100 20:10 A/nth_50
|
||||
/// ...
|
||||
/// Below 3/4 20:40 B/d
|
||||
/// Below 99/100 20:30 A/nth_99
|
||||
/// Below 4/4 20:45 B/e
|
||||
/// Below 100/100 20:58 A/nth_100
|
||||
/// ```
|
||||
///
|
||||
/// Now it's easier to see that because A has grown fast it has more layers to get evicted. What is
|
||||
/// difficult to see is what happens on the next round, assuming that evicting 23 layers from the above list
|
||||
/// relieves the pressure (22 A layers gone, 1 B layer gone) but a new fast-growing tenant C has
|
||||
/// appeared:
|
||||
///
|
||||
/// ```text
|
||||
/// Above 0/87 20:04 A/nth_23
|
||||
/// Above 0/3 19:05 B/b
|
||||
/// Above 0/50 20:59 C/nth_0
|
||||
/// Above 1/87 20:04 A/nth_24
|
||||
/// Above 1/50 21:00 C/nth_1
|
||||
/// Above 2/87 20:04 A/nth_25
|
||||
/// ...
|
||||
/// Above 16/50 21:02 C/nth_16
|
||||
/// Above 1/3 20:00 B/a
|
||||
/// Above 27/87 20:10 A/nth_50
|
||||
/// ...
|
||||
/// Below 2/3 20:40 B/d
|
||||
/// Below 49/50 21:05 C/nth_49
|
||||
/// Below 86/87 20:30 A/nth_99
|
||||
/// Below 3/3 20:45 B/e
|
||||
/// Below 50/50 21:05 C/nth_50
|
||||
/// Below 87/87 20:58 A/nth_100
|
||||
/// ```
|
||||
///
|
||||
/// Now relieving pressure with 23 layers would cost:
|
||||
/// - tenant A 14 layers
|
||||
/// - tenant B 1 layer
|
||||
/// - tenant C 8 layers
|
||||
async fn collect_eviction_candidates(
|
||||
eviction_order: EvictionOrder,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<EvictionCandidates> {
|
||||
// get a snapshot of the list of tenants
|
||||
@@ -709,63 +591,12 @@ async fn collect_eviction_candidates(
|
||||
tenant_candidates
|
||||
.sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts));
|
||||
let mut cumsum: i128 = 0;
|
||||
|
||||
// keeping the -1 or not decides if every tenant should lose their least recently accessed
|
||||
// layer OR if this should happen in the order of having highest layer count:
|
||||
let fudge = if eviction_order.highest_layer_count_loses_first() {
|
||||
// relative_age vs. tenant layer count:
|
||||
// - 0.1..=1.0 (10 layers)
|
||||
// - 0.01..=1.0 (100 layers)
|
||||
// - 0.001..=1.0 (1000 layers)
|
||||
//
|
||||
// leading to evicting less of the smallest tenants.
|
||||
0
|
||||
} else {
|
||||
// use full 0.0..=1.0 range, which means even the smallest tenants could always lose a
|
||||
// layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could
|
||||
// be that fewer than 10k layer evictions are enough, so we would not need to evict from
|
||||
// all tenants.
|
||||
//
|
||||
// as the tenant ordering is now deterministic this could hit the same tenants
|
||||
// disproportionately on multiple invocations. alternative could be to remember how many
|
||||
// layers did we evict last time from this tenant, and inject that as an additional
|
||||
// fudge here.
|
||||
1
|
||||
};
|
||||
|
||||
let total = tenant_candidates
|
||||
.len()
|
||||
.checked_sub(fudge)
|
||||
.filter(|&x| x > 0)
|
||||
// support 0 or 1 resident layer tenants as well
|
||||
.unwrap_or(1);
|
||||
let divider = total as f32;
|
||||
|
||||
for (i, (timeline, layer_info)) in tenant_candidates.into_iter().enumerate() {
|
||||
for (timeline, layer_info) in tenant_candidates.into_iter() {
|
||||
let file_size = layer_info.file_size();
|
||||
|
||||
// as we iterate this reverse sorted list, the most recently accessed layer will always
|
||||
// be 1.0; this is for us to evict it last.
|
||||
let relative_last_activity = if matches!(
|
||||
eviction_order,
|
||||
EvictionOrder::RelativeAccessed { .. }
|
||||
) {
|
||||
// another possibility: use buckets, like (256.0 * relative_last_activity) as u8 or
|
||||
// similarly for u16. unsure how it would help.
|
||||
finite_f32::FiniteF32::try_from_normalized((total - i) as f32 / divider)
|
||||
.unwrap_or_else(|val| {
|
||||
tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={i}, total={total}: {val}");
|
||||
finite_f32::FiniteF32::ZERO
|
||||
})
|
||||
} else {
|
||||
finite_f32::FiniteF32::ZERO
|
||||
};
|
||||
|
||||
let candidate = EvictionCandidate {
|
||||
timeline,
|
||||
last_activity_ts: layer_info.last_activity_ts,
|
||||
layer: layer_info.layer,
|
||||
relative_last_activity,
|
||||
};
|
||||
let partition = if cumsum > min_resident_size as i128 {
|
||||
MinResidentSizePartition::Above
|
||||
@@ -779,19 +610,8 @@ async fn collect_eviction_candidates(
|
||||
|
||||
debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
|
||||
"as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
|
||||
|
||||
match eviction_order {
|
||||
EvictionOrder::AbsoluteAccessed => {
|
||||
candidates.sort_unstable_by_key(|(partition, candidate)| {
|
||||
(*partition, candidate.last_activity_ts)
|
||||
});
|
||||
}
|
||||
EvictionOrder::RelativeAccessed { .. } => {
|
||||
candidates.sort_unstable_by_key(|(partition, candidate)| {
|
||||
(*partition, candidate.relative_last_activity)
|
||||
});
|
||||
}
|
||||
}
|
||||
candidates
|
||||
.sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts));
|
||||
|
||||
Ok(EvictionCandidates::Finished(candidates))
|
||||
}
|
||||
@@ -820,66 +640,6 @@ impl std::ops::Deref for TimelineKey {
|
||||
}
|
||||
}
|
||||
|
||||
/// A totally ordered f32 subset we can use with sorting functions.
|
||||
mod finite_f32 {
|
||||
|
||||
/// A totally ordered f32 subset we can use with sorting functions.
|
||||
#[derive(Clone, Copy, PartialEq)]
|
||||
pub struct FiniteF32(f32);
|
||||
|
||||
impl std::fmt::Debug for FiniteF32 {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
std::fmt::Debug::fmt(&self.0, f)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for FiniteF32 {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
std::fmt::Display::fmt(&self.0, f)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::cmp::Eq for FiniteF32 {}
|
||||
|
||||
impl std::cmp::PartialOrd for FiniteF32 {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl std::cmp::Ord for FiniteF32 {
|
||||
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
|
||||
self.0.total_cmp(&other.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<f32> for FiniteF32 {
|
||||
type Error = f32;
|
||||
|
||||
fn try_from(value: f32) -> Result<Self, Self::Error> {
|
||||
if value.is_finite() {
|
||||
Ok(FiniteF32(value))
|
||||
} else {
|
||||
Err(value)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FiniteF32 {
|
||||
pub const ZERO: FiniteF32 = FiniteF32(0.0);
|
||||
|
||||
pub fn try_from_normalized(value: f32) -> Result<Self, f32> {
|
||||
if (0.0..=1.0).contains(&value) {
|
||||
// -0.0 is within the range, make sure it is assumed 0.0..=1.0
|
||||
let value = value.abs();
|
||||
Ok(FiniteF32(value))
|
||||
} else {
|
||||
Err(value)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
mod filesystem_level_usage {
|
||||
use anyhow::Context;
|
||||
use camino::Utf8Path;
|
||||
@@ -961,7 +721,6 @@ mod filesystem_level_usage {
|
||||
|
||||
#[test]
|
||||
fn max_usage_pct_pressure() {
|
||||
use super::EvictionOrder;
|
||||
use super::Usage as _;
|
||||
use std::time::Duration;
|
||||
use utils::serde_percent::Percent;
|
||||
@@ -973,7 +732,6 @@ mod filesystem_level_usage {
|
||||
period: Duration::MAX,
|
||||
#[cfg(feature = "testing")]
|
||||
mock_statvfs: None,
|
||||
eviction_order: EvictionOrder::default(),
|
||||
},
|
||||
total_bytes: 100_000,
|
||||
avail_bytes: 0,
|
||||
|
||||
pageserver/src/failpoint_support.rs (new file)
@@ -0,0 +1,86 @@
|
||||
/// use with fail::cfg("$name", "return(2000)")
|
||||
///
|
||||
/// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
|
||||
/// specified time (in milliseconds). The main difference is that we use the async
|
||||
/// tokio sleep function. Another difference is that we print lines to the log,
|
||||
/// which can be useful in tests to check that the failpoint was hit.
|
||||
#[macro_export]
|
||||
macro_rules! __failpoint_sleep_millis_async {
|
||||
($name:literal) => {{
|
||||
// If the failpoint is used with a "return" action, set should_sleep to the
|
||||
// returned value (as string). Otherwise it's set to None.
|
||||
let should_sleep = (|| {
|
||||
::fail::fail_point!($name, |x| x);
|
||||
::std::option::Option::None
|
||||
})();
|
||||
|
||||
// Sleep if the action was a returned value
|
||||
if let ::std::option::Option::Some(duration_str) = should_sleep {
|
||||
$crate::failpoint_support::failpoint_sleep_helper($name, duration_str).await
|
||||
}
|
||||
}};
|
||||
}
|
||||
pub use __failpoint_sleep_millis_async as sleep_millis_async;
|
||||
|
||||
// Helper function used by the macro. (A function has nicer scoping so we
|
||||
// don't need to decorate everything with "::")
|
||||
#[doc(hidden)]
|
||||
pub(crate) async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
|
||||
let millis = duration_str.parse::<u64>().unwrap();
|
||||
let d = std::time::Duration::from_millis(millis);
|
||||
|
||||
tracing::info!("failpoint {:?}: sleeping for {:?}", name, d);
|
||||
tokio::time::sleep(d).await;
|
||||
tracing::info!("failpoint {:?}: sleep done", name);
|
||||
}
|
||||
|
||||
pub fn init() -> fail::FailScenario<'static> {
|
||||
// The failpoints lib provides support for parsing the `FAILPOINTS` env var.
|
||||
// We want non-default behavior for `exit`, though, so we handle it separately.
|
||||
//
|
||||
// Format for FAILPOINTS is "name=actions" separated by ";".
|
||||
let actions = std::env::var("FAILPOINTS");
|
||||
if actions.is_ok() {
|
||||
std::env::remove_var("FAILPOINTS");
|
||||
} else {
|
||||
// let the library handle non-utf8, or nothing for not present
|
||||
}
|
||||
|
||||
let scenario = fail::FailScenario::setup();
|
||||
|
||||
if let Ok(val) = actions {
|
||||
val.split(';')
|
||||
.enumerate()
|
||||
.map(|(i, s)| s.split_once('=').ok_or((i, s)))
|
||||
.for_each(|res| {
|
||||
let (name, actions) = match res {
|
||||
Ok(t) => t,
|
||||
Err((i, s)) => {
|
||||
panic!(
|
||||
"startup failpoints: missing action on the {}th failpoint; try `{s}=return`",
|
||||
i + 1,
|
||||
);
|
||||
}
|
||||
};
|
||||
if let Err(e) = apply_failpoint(name, actions) {
|
||||
panic!("startup failpoints: failed to apply failpoint {name}={actions}: {e}");
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
scenario
|
||||
}
|
||||
|
||||
pub(crate) fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> {
|
||||
if actions == "exit" {
|
||||
fail::cfg_callback(name, exit_failpoint)
|
||||
} else {
|
||||
fail::cfg(name, actions)
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(never)]
|
||||
fn exit_failpoint() {
|
||||
tracing::info!("Exit requested by failpoint");
|
||||
std::process::exit(1);
|
||||
}
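
// Illustrative sketch, not part of the original diff: how this module is driven.
// The failpoint names and values below are made up for illustration.
//
// Configure failpoints through the environment before starting the binary:
//   FAILPOINTS='before-upload=return(2000);crash-here=exit' ./pageserver ...
// `init()` consumes the env var, applies each `name=action` pair via
// `apply_failpoint` (mapping the special "exit" action to a clean process exit),
// and code paths opt in with the sleep macro:
//
// async fn upload_layer() {
//     // sleeps for 2000ms only if the failpoint was set to return(2000)
//     crate::failpoint_support::sleep_millis_async!("before-upload");
//     // ... actual work ...
// }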
|
||||
@@ -159,12 +159,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ConflictError"
|
||||
"412":
|
||||
description: Deletion may not proceed, tenant is not in Active state
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/PreconditionFailedError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
|
||||
@@ -14,8 +14,6 @@ use hyper::header;
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response, Uri};
|
||||
use metrics::launch_timestamp::LaunchTimestamp;
|
||||
use pageserver_api::models::TenantDetails;
|
||||
use pageserver_api::models::TenantState;
|
||||
use pageserver_api::models::{
|
||||
DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
|
||||
TenantLoadRequest, TenantLocationConfigRequest,
|
||||
@@ -26,7 +24,6 @@ use tenant_size_model::{SizeResult, StorageModel};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::auth::JwtAuth;
|
||||
use utils::failpoint_support::failpoints_handler;
|
||||
use utils::http::endpoint::request_span;
|
||||
use utils::http::json::json_request_or_empty_body;
|
||||
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
|
||||
@@ -38,7 +35,6 @@ use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::config::{LocationConf, TenantConfOpt};
|
||||
use crate::tenant::mgr::GetActiveTenantError;
|
||||
use crate::tenant::mgr::UpsertLocationError;
|
||||
use crate::tenant::mgr::{
|
||||
GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
|
||||
TenantSlotError, TenantSlotUpsertError, TenantStateError,
|
||||
@@ -48,8 +44,7 @@ use crate::tenant::size::ModelInputs;
|
||||
use crate::tenant::storage_layer::LayerAccessStatsReset;
|
||||
use crate::tenant::timeline::CompactFlags;
|
||||
use crate::tenant::timeline::Timeline;
|
||||
use crate::tenant::SpawnMode;
|
||||
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
|
||||
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSharedResources};
|
||||
use crate::{config::PageServerConf, tenant::mgr};
|
||||
use crate::{disk_usage_eviction_task, tenant};
|
||||
use pageserver_api::models::{
|
||||
@@ -70,6 +65,9 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
// Imports only used for testing APIs
|
||||
use pageserver_api::models::ConfigureFailpointsRequest;
|
||||
|
||||
// For APIs that require an Active tenant, how long should we block waiting for that state?
|
||||
// This is not functionally necessary (clients will retry), but avoids generating a lot of
|
||||
// failed API calls while tenants are activating.
|
||||
@@ -115,6 +113,14 @@ impl State {
|
||||
secondary_controller,
|
||||
})
|
||||
}
|
||||
|
||||
fn tenant_resources(&self) -> TenantSharedResources {
|
||||
TenantSharedResources {
|
||||
broker_client: self.broker_client.clone(),
|
||||
remote_storage: self.remote_storage.clone(),
|
||||
deletion_queue_client: self.deletion_queue_client.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
@@ -147,7 +153,6 @@ impl From<PageReconstructError> for ApiError {
|
||||
PageReconstructError::AncestorStopping(_) => {
|
||||
ApiError::ResourceUnavailable(format!("{pre}").into())
|
||||
}
|
||||
PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()),
|
||||
PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
|
||||
}
|
||||
}
|
||||
@@ -170,7 +175,7 @@ impl From<TenantSlotError> for ApiError {
|
||||
NotFound(tenant_id) => {
|
||||
ApiError::NotFound(anyhow::anyhow!("NotFound: tenant {tenant_id}").into())
|
||||
}
|
||||
e @ AlreadyExists(_, _) => ApiError::Conflict(format!("{e}")),
|
||||
e @ (AlreadyExists(_, _) | Conflict(_)) => ApiError::Conflict(format!("{e}")),
|
||||
InProgress => {
|
||||
ApiError::ResourceUnavailable("Tenant is being modified concurrently".into())
|
||||
}
|
||||
@@ -189,18 +194,6 @@ impl From<TenantSlotUpsertError> for ApiError {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<UpsertLocationError> for ApiError {
|
||||
fn from(e: UpsertLocationError) -> ApiError {
|
||||
use UpsertLocationError::*;
|
||||
match e {
|
||||
BadRequest(e) => ApiError::BadRequest(e),
|
||||
Unavailable(_) => ApiError::ShuttingDown,
|
||||
e @ InProgress => ApiError::Conflict(format!("{e}")),
|
||||
Flush(e) | Other(e) => ApiError::InternalServerError(e),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<TenantMapError> for ApiError {
|
||||
fn from(e: TenantMapError) -> ApiError {
|
||||
use TenantMapError::*;
|
||||
@@ -314,7 +307,6 @@ impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
|
||||
SlotUpsertError(e) => e.into(),
|
||||
Other(o) => ApiError::InternalServerError(o),
|
||||
e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()),
|
||||
Cancelled => ApiError::ShuttingDown,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -323,21 +315,11 @@ impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
|
||||
async fn build_timeline_info(
|
||||
timeline: &Arc<Timeline>,
|
||||
include_non_incremental_logical_size: bool,
|
||||
force_await_initial_logical_size: bool,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<TimelineInfo> {
|
||||
crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
if force_await_initial_logical_size {
|
||||
timeline.clone().await_initial_logical_size().await
|
||||
}
|
||||
|
||||
let mut info = build_timeline_info_common(
|
||||
timeline,
|
||||
ctx,
|
||||
tenant::timeline::GetLogicalSizePriority::Background,
|
||||
)
|
||||
.await?;
|
||||
let mut info = build_timeline_info_common(timeline, ctx).await?;
|
||||
if include_non_incremental_logical_size {
|
||||
// XXX we should be using spawn_ondemand_logical_size_calculation here.
|
||||
// Otherwise, if someone deletes the timeline / detaches the tenant while
|
||||
@@ -354,7 +336,6 @@ async fn build_timeline_info(
|
||||
async fn build_timeline_info_common(
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
logical_size_task_priority: tenant::timeline::GetLogicalSizePriority,
|
||||
) -> anyhow::Result<TimelineInfo> {
|
||||
crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
let initdb_lsn = timeline.initdb_lsn;
|
||||
@@ -377,7 +358,8 @@ async fn build_timeline_info_common(
|
||||
Lsn(0) => None,
|
||||
lsn @ Lsn(_) => Some(lsn),
|
||||
};
|
||||
let current_logical_size = timeline.get_current_logical_size(logical_size_task_priority, ctx);
|
||||
let current_logical_size =
|
||||
timeline.get_current_logical_size(tenant::timeline::GetLogicalSizePriority::User, ctx);
|
||||
let current_physical_size = Some(timeline.layer_size_sum().await);
|
||||
let state = timeline.current_state();
|
||||
let remote_consistent_lsn_projected = timeline
|
||||
@@ -488,7 +470,7 @@ async fn timeline_create_handler(
|
||||
.await {
|
||||
Ok(new_timeline) => {
|
||||
// Created. Construct a TimelineInfo for it.
|
||||
let timeline_info = build_timeline_info_common(&new_timeline, &ctx, tenant::timeline::GetLogicalSizePriority::User)
|
||||
let timeline_info = build_timeline_info_common(&new_timeline, &ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
json_response(StatusCode::CREATED, timeline_info)
|
||||
@@ -524,8 +506,6 @@ async fn timeline_list_handler(
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
let include_non_incremental_logical_size: Option<bool> =
|
||||
parse_query_param(&request, "include-non-incremental-logical-size")?;
|
||||
let force_await_initial_logical_size: Option<bool> =
|
||||
parse_query_param(&request, "force-await-initial-logical-size")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
@@ -539,7 +519,6 @@ async fn timeline_list_handler(
|
||||
let timeline_info = build_timeline_info(
|
||||
&timeline,
|
||||
include_non_incremental_logical_size.unwrap_or(false),
|
||||
force_await_initial_logical_size.unwrap_or(false),
|
||||
&ctx,
|
||||
)
|
||||
.instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id))
|
||||
@@ -567,8 +546,6 @@ async fn timeline_detail_handler(
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let include_non_incremental_logical_size: Option<bool> =
|
||||
parse_query_param(&request, "include-non-incremental-logical-size")?;
|
||||
let force_await_initial_logical_size: Option<bool> =
|
||||
parse_query_param(&request, "force-await-initial-logical-size")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
// Logical size calculation needs downloading.
|
||||
@@ -584,7 +561,6 @@ async fn timeline_detail_handler(
|
||||
let timeline_info = build_timeline_info(
|
||||
&timeline,
|
||||
include_non_incremental_logical_size.unwrap_or(false),
|
||||
force_await_initial_logical_size.unwrap_or(false),
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
@@ -616,6 +592,8 @@ async fn get_lsn_by_timestamp_handler(
|
||||
)));
|
||||
}
|
||||
|
||||
let version: Option<u8> = parse_query_param(&request, "version")?;
|
||||
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let timestamp_raw = must_get_query_param(&request, "timestamp")?;
|
||||
let timestamp = humantime::parse_rfc3339(×tamp_raw)
|
||||
@@ -628,18 +606,31 @@ async fn get_lsn_by_timestamp_handler(
|
||||
let result = timeline
|
||||
.find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx)
|
||||
.await?;
|
||||
#[derive(serde::Serialize)]
|
||||
struct Result {
|
||||
lsn: Lsn,
|
||||
kind: &'static str,
|
||||
|
||||
if version.unwrap_or(0) > 1 {
|
||||
#[derive(serde::Serialize)]
|
||||
struct Result {
|
||||
lsn: Lsn,
|
||||
kind: &'static str,
|
||||
}
|
||||
let (lsn, kind) = match result {
|
||||
LsnForTimestamp::Present(lsn) => (lsn, "present"),
|
||||
LsnForTimestamp::Future(lsn) => (lsn, "future"),
|
||||
LsnForTimestamp::Past(lsn) => (lsn, "past"),
|
||||
LsnForTimestamp::NoData(lsn) => (lsn, "nodata"),
|
||||
};
|
||||
json_response(StatusCode::OK, Result { lsn, kind })
|
||||
} else {
|
||||
// FIXME: this is a temporary crutch not to break backwards compatibility
|
||||
// See https://github.com/neondatabase/neon/pull/5608
|
||||
let result = match result {
|
||||
LsnForTimestamp::Present(lsn) => format!("{lsn}"),
|
||||
LsnForTimestamp::Future(_lsn) => "future".into(),
|
||||
LsnForTimestamp::Past(_lsn) => "past".into(),
|
||||
LsnForTimestamp::NoData(_lsn) => "nodata".into(),
|
||||
};
|
||||
json_response(StatusCode::OK, result)
|
||||
}
|
||||
let (lsn, kind) = match result {
|
||||
LsnForTimestamp::Present(lsn) => (lsn, "present"),
|
||||
LsnForTimestamp::Future(lsn) => (lsn, "future"),
|
||||
LsnForTimestamp::Past(lsn) => (lsn, "past"),
|
||||
LsnForTimestamp::NoData(lsn) => (lsn, "nodata"),
|
||||
};
|
||||
json_response(StatusCode::OK, Result { lsn, kind })
|
||||
}
|
||||
|
||||
async fn get_timestamp_of_lsn_handler(
|
||||
@@ -703,37 +694,16 @@ async fn tenant_attach_handler(
|
||||
)));
|
||||
}
|
||||
|
||||
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
|
||||
let location_conf = LocationConf::attached_single(tenant_conf, generation);
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.upsert_location(
|
||||
tenant_shard_id,
|
||||
location_conf,
|
||||
None,
|
||||
SpawnMode::Normal,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let Some(tenant) = tenant else {
|
||||
// This should never happen: indicates a bug in upsert_location
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"Upsert succeeded but didn't return tenant!"
|
||||
)));
|
||||
};
|
||||
|
||||
// We might have successfully constructed a Tenant, but it could still
|
||||
// end up in a broken state:
|
||||
if let TenantState::Broken {
|
||||
reason,
|
||||
backtrace: _,
|
||||
} = tenant.current_state()
|
||||
{
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"Tenant state is Broken: {reason}"
|
||||
)));
|
||||
}
|
||||
mgr::attach_tenant(
|
||||
state.conf,
|
||||
tenant_id,
|
||||
generation,
|
||||
tenant_conf,
|
||||
state.tenant_resources(),
|
||||
&ctx,
|
||||
)
|
||||
.instrument(info_span!("tenant_attach", %tenant_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
}
|
||||
@@ -902,14 +872,11 @@ async fn tenant_status(
|
||||
}
|
||||
|
||||
let state = tenant.current_state();
|
||||
Result::<_, ApiError>::Ok(TenantDetails {
|
||||
tenant_info: TenantInfo {
|
||||
id: tenant_shard_id,
|
||||
state: state.clone(),
|
||||
current_physical_size: Some(current_physical_size),
|
||||
attachment_status: state.attachment_status(),
|
||||
},
|
||||
timelines: tenant.list_timeline_ids(),
|
||||
Result::<_, ApiError>::Ok(TenantInfo {
|
||||
id: tenant_shard_id,
|
||||
state: state.clone(),
|
||||
current_physical_size: Some(current_physical_size),
|
||||
attachment_status: state.attachment_status(),
|
||||
})
|
||||
}
|
||||
.instrument(info_span!("tenant_status_handler",
|
||||
@@ -930,9 +897,7 @@ async fn tenant_delete_handler(
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
state
|
||||
.tenant_manager
|
||||
.delete_tenant(tenant_shard_id, ACTIVE_TENANT_TIMEOUT)
|
||||
mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_shard_id)
|
||||
.instrument(info_span!("tenant_delete_handler",
|
||||
tenant_id = %tenant_shard_id.tenant_id,
|
||||
shard = %tenant_shard_id.shard_slug()
|
||||
@@ -1192,25 +1157,16 @@ async fn tenant_create_handler(
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||
|
||||
let location_conf = LocationConf::attached_single(tenant_conf, generation);
|
||||
|
||||
let new_tenant = state
|
||||
.tenant_manager
|
||||
.upsert_location(
|
||||
target_tenant_id,
|
||||
location_conf,
|
||||
None,
|
||||
SpawnMode::Create,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let Some(new_tenant) = new_tenant else {
|
||||
// This should never happen: indicates a bug in upsert_location
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"Upsert succeeded but didn't return tenant!"
|
||||
)));
|
||||
};
|
||||
let new_tenant = mgr::create_tenant(
|
||||
state.conf,
|
||||
tenant_conf,
|
||||
target_tenant_id,
|
||||
generation,
|
||||
state.tenant_resources(),
|
||||
&ctx,
|
||||
)
|
||||
.instrument(info_span!("tenant_create", tenant_id = %target_tenant_id))
|
||||
.await?;
|
||||
|
||||
// We created the tenant. Existing API semantics are that the tenant
|
||||
// is Active when this function returns.
|
||||
@@ -1219,7 +1175,7 @@ async fn tenant_create_handler(
|
||||
.await
|
||||
{
|
||||
// This shouldn't happen because we just created the tenant directory
|
||||
// in upsert_location, and there aren't any remote timelines
|
||||
// in tenant::mgr::create_tenant, and there aren't any remote timelines
|
||||
// to load, so, nothing can really fail during load.
|
||||
// Don't do cleanup because we don't know how we got here.
|
||||
// The tenant will likely be in `Broken` state and subsequent
|
||||
@@ -1320,31 +1276,12 @@ async fn put_tenant_location_config_handler(
|
||||
|
||||
state
|
||||
.tenant_manager
|
||||
.upsert_location(
|
||||
tenant_shard_id,
|
||||
location_conf,
|
||||
flush,
|
||||
tenant::SpawnMode::Normal,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
if let Some(_flush_ms) = flush {
|
||||
match state
|
||||
.secondary_controller
|
||||
.upload_tenant(tenant_shard_id)
|
||||
.await
|
||||
{
|
||||
Ok(()) => {
|
||||
tracing::info!("Uploaded heatmap during flush");
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!("Failed to flush heatmap: {e}");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
tracing::info!("No flush requested when configuring");
|
||||
}
|
||||
.upsert_location(tenant_shard_id, location_conf, flush, &ctx)
|
||||
.await
|
||||
// TODO: badrequest assumes the caller was asking for something unreasonable, but in
|
||||
// principle we might have hit something like concurrent API calls to the same tenant,
|
||||
// which is not a 400 but a 409.
|
||||
.map_err(ApiError::BadRequest)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
@@ -1364,6 +1301,34 @@ async fn handle_tenant_break(
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn failpoints_handler(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
if !fail::has_failpoints() {
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"Cannot manage failpoints because pageserver was compiled without failpoints support"
|
||||
)));
|
||||
}
|
||||
|
||||
let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?;
|
||||
for fp in failpoints {
|
||||
info!("cfg failpoint: {} {}", fp.name, fp.actions);
|
||||
|
||||
// We recognize one extra "action" that's not natively recognized
|
||||
// by the failpoints crate: exit, to immediately kill the process
|
||||
let cfg_result = crate::failpoint_support::apply_failpoint(&fp.name, &fp.actions);
|
||||
|
||||
if let Err(err_msg) = cfg_result {
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"Failed to configure failpoints: {err_msg}"
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
// Run GC immediately on given timeline.
|
||||
async fn timeline_gc_handler(
|
||||
mut request: Request<Body>,
|
||||
@@ -1612,22 +1577,19 @@ async fn disk_usage_eviction_run(
|
||||
struct Config {
|
||||
/// How many bytes to evict before reporting that pressure is relieved.
|
||||
evict_bytes: u64,
|
||||
|
||||
#[serde(default)]
|
||||
eviction_order: crate::disk_usage_eviction_task::EvictionOrder,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, serde::Serialize)]
|
||||
struct Usage {
|
||||
// remains unchanged after instantiation of the struct
|
||||
evict_bytes: u64,
|
||||
config: Config,
|
||||
// updated by `add_available_bytes`
|
||||
freed_bytes: u64,
|
||||
}
|
||||
|
||||
impl crate::disk_usage_eviction_task::Usage for Usage {
|
||||
fn has_pressure(&self) -> bool {
|
||||
self.evict_bytes > self.freed_bytes
|
||||
self.config.evict_bytes > self.freed_bytes
|
||||
}
|
||||
|
||||
fn add_available_bytes(&mut self, bytes: u64) {
|
||||
@@ -1638,7 +1600,7 @@ async fn disk_usage_eviction_run(
|
||||
let config = json_request::<Config>(&mut r).await?;
|
||||
|
||||
let usage = Usage {
|
||||
evict_bytes: config.evict_bytes,
|
||||
config,
|
||||
freed_bytes: 0,
|
||||
};
|
||||
|
||||
@@ -1653,11 +1615,7 @@ async fn disk_usage_eviction_run(
|
||||
let state = state.disk_usage_eviction_state.clone();
|
||||
|
||||
let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
|
||||
&state,
|
||||
storage,
|
||||
usage,
|
||||
config.eviction_order,
|
||||
&cancel,
|
||||
&state, storage, usage, &cancel,
|
||||
)
|
||||
.await;
|
||||
|
||||
@@ -1683,21 +1641,6 @@ async fn secondary_upload_handler(
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn secondary_download_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let state = get_state(&request);
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
state
|
||||
.secondary_controller
|
||||
.download_tenant(tenant_shard_id)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
json_response(
|
||||
StatusCode::NOT_FOUND,
|
||||
@@ -1966,9 +1909,6 @@ pub fn make_router(
|
||||
.put("/v1/deletion_queue/flush", |r| {
|
||||
api_handler(r, deletion_queue_flush)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_shard_id/secondary/download", |r| {
|
||||
api_handler(r, secondary_download_handler)
|
||||
})
|
||||
.put("/v1/tenant/:tenant_shard_id/break", |r| {
|
||||
testing_api_handler("set tenant state to broken", r, handle_tenant_break)
|
||||
})
|
||||
|
||||
@@ -21,7 +21,6 @@ use tracing::*;
|
||||
use walkdir::WalkDir;
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use crate::metrics::WAL_INGEST;
|
||||
use crate::pgdatadir_mapping::*;
|
||||
use crate::tenant::remote_timeline_client::INITDB_PATH;
|
||||
use crate::tenant::Timeline;
|
||||
@@ -313,16 +312,13 @@ async fn import_wal(
|
||||
waldecoder.feed_bytes(&buf);
|
||||
|
||||
let mut nrecords = 0;
|
||||
let mut modification = tline.begin_modification(last_lsn);
|
||||
let mut modification = tline.begin_modification(endpoint);
|
||||
let mut decoded = DecodedWALRecord::default();
|
||||
while last_lsn <= endpoint {
|
||||
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
walingest
|
||||
.ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
|
||||
.await?;
|
||||
WAL_INGEST.records_committed.inc();
|
||||
|
||||
modification.commit(ctx).await?;
|
||||
last_lsn = lsn;
|
||||
|
||||
nrecords += 1;
|
||||
@@ -452,14 +448,13 @@ pub async fn import_wal_from_tar(
|
||||
|
||||
waldecoder.feed_bytes(&bytes[offset..]);
|
||||
|
||||
let mut modification = tline.begin_modification(last_lsn);
|
||||
let mut modification = tline.begin_modification(end_lsn);
|
||||
let mut decoded = DecodedWALRecord::default();
|
||||
while last_lsn <= end_lsn {
|
||||
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
walingest
|
||||
.ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
|
||||
.await?;
|
||||
modification.commit(ctx).await?;
|
||||
last_lsn = lsn;
|
||||
|
||||
debug!("imported record at {} (end {})", lsn, end_lsn);
|
||||
|
||||
@@ -25,6 +25,8 @@ pub mod walingest;
|
||||
pub mod walrecord;
|
||||
pub mod walredo;
|
||||
|
||||
pub mod failpoint_support;
|
||||
|
||||
use crate::task_mgr::TaskKind;
|
||||
use camino::Utf8Path;
|
||||
use deletion_queue::DeletionQueue;
|
||||
@@ -117,10 +119,6 @@ pub const TENANT_CONFIG_NAME: &str = "config";
|
||||
/// Full path: `tenants/<tenant_id>/config`.
|
||||
pub const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1";
|
||||
|
||||
/// Per-tenant copy of their remote heatmap, downloaded into the local
|
||||
/// tenant path while in secondary mode.
|
||||
pub const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json";
|
||||
|
||||
/// A suffix used for various temporary files. Any temporary files found in the
|
||||
/// data directory at pageserver startup can be automatically removed.
|
||||
pub const TEMP_FILE_SUFFIX: &str = "___temp";
|
||||
|
||||
@@ -29,7 +29,7 @@ const CRITICAL_OP_BUCKETS: &[f64] = &[
|
||||
// Metrics collected on operations on the storage repository.
|
||||
#[derive(Debug, EnumVariantNames, IntoStaticStr)]
|
||||
#[strum(serialize_all = "kebab_case")]
|
||||
pub(crate) enum StorageTimeOperation {
|
||||
pub enum StorageTimeOperation {
|
||||
#[strum(serialize = "layer flush")]
|
||||
LayerFlush,
|
||||
|
||||
@@ -55,7 +55,7 @@ pub(crate) enum StorageTimeOperation {
|
||||
CreateTenant,
|
||||
}
|
||||
|
||||
pub(crate) static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
|
||||
pub static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
|
||||
register_counter_vec!(
|
||||
"pageserver_storage_operations_seconds_sum",
|
||||
"Total time spent on storage operations with operation, tenant and timeline dimensions",
|
||||
@@ -64,7 +64,7 @@ pub(crate) static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(||
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_storage_operations_seconds_count",
|
||||
"Count of storage operations with operation, tenant and timeline dimensions",
|
||||
@@ -150,7 +150,7 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) struct PageCacheMetricsForTaskKind {
|
||||
pub struct PageCacheMetricsForTaskKind {
|
||||
pub read_accesses_materialized_page: IntCounter,
|
||||
pub read_accesses_immutable: IntCounter,
|
||||
|
||||
@@ -159,7 +159,7 @@ pub(crate) struct PageCacheMetricsForTaskKind {
|
||||
pub read_hits_materialized_page_older_lsn: IntCounter,
|
||||
}
|
||||
|
||||
pub(crate) struct PageCacheMetrics {
|
||||
pub struct PageCacheMetrics {
|
||||
map: EnumMap<TaskKind, EnumMap<PageContentKind, PageCacheMetricsForTaskKind>>,
|
||||
}
|
||||
|
||||
@@ -181,7 +181,7 @@ static PAGE_CACHE_READ_ACCESSES: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
|
||||
pub static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
|
||||
map: EnumMap::from_array(std::array::from_fn(|task_kind| {
|
||||
let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind);
|
||||
let task_kind: &'static str = task_kind.into();
|
||||
@@ -243,9 +243,10 @@ impl PageCacheMetrics {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct PageCacheSizeMetrics {
|
||||
pub struct PageCacheSizeMetrics {
|
||||
pub max_bytes: UIntGauge,
|
||||
|
||||
pub current_bytes_ephemeral: UIntGauge,
|
||||
pub current_bytes_immutable: UIntGauge,
|
||||
pub current_bytes_materialized_page: UIntGauge,
|
||||
}
|
||||
@@ -259,26 +260,31 @@ static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> =
|
||||
Lazy::new(|| PageCacheSizeMetrics {
|
||||
max_bytes: {
|
||||
register_uint_gauge!(
|
||||
"pageserver_page_cache_size_max_bytes",
|
||||
"Maximum size of the page cache in bytes"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
},
|
||||
current_bytes_immutable: {
|
||||
PAGE_CACHE_SIZE_CURRENT_BYTES
|
||||
.get_metric_with_label_values(&["immutable"])
|
||||
.unwrap()
|
||||
},
|
||||
current_bytes_materialized_page: {
|
||||
PAGE_CACHE_SIZE_CURRENT_BYTES
|
||||
.get_metric_with_label_values(&["materialized_page"])
|
||||
.unwrap()
|
||||
},
|
||||
});
|
||||
pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheSizeMetrics {
|
||||
max_bytes: {
|
||||
register_uint_gauge!(
|
||||
"pageserver_page_cache_size_max_bytes",
|
||||
"Maximum size of the page cache in bytes"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
},
|
||||
|
||||
current_bytes_ephemeral: {
|
||||
PAGE_CACHE_SIZE_CURRENT_BYTES
|
||||
.get_metric_with_label_values(&["ephemeral"])
|
||||
.unwrap()
|
||||
},
|
||||
current_bytes_immutable: {
|
||||
PAGE_CACHE_SIZE_CURRENT_BYTES
|
||||
.get_metric_with_label_values(&["immutable"])
|
||||
.unwrap()
|
||||
},
|
||||
current_bytes_materialized_page: {
|
||||
PAGE_CACHE_SIZE_CURRENT_BYTES
|
||||
.get_metric_with_label_values(&["materialized_page"])
|
||||
.unwrap()
|
||||
},
|
||||
});
|
||||
|
||||
pub(crate) mod page_cache_eviction_metrics {
|
||||
use std::num::NonZeroUsize;
|
||||
@@ -734,13 +740,13 @@ pub(crate) static TENANT: Lazy<TenantMetrics> = Lazy::new(|| {
|
||||
|
||||
/// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct EvictionsWithLowResidenceDuration {
|
||||
pub struct EvictionsWithLowResidenceDuration {
|
||||
data_source: &'static str,
|
||||
threshold: Duration,
|
||||
counter: Option<IntCounter>,
|
||||
}
|
||||
|
||||
pub(crate) struct EvictionsWithLowResidenceDurationBuilder {
|
||||
pub struct EvictionsWithLowResidenceDurationBuilder {
|
||||
data_source: &'static str,
|
||||
threshold: Duration,
|
||||
}
|
||||
@@ -1003,7 +1009,7 @@ pub enum SmgrQueryType {
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct SmgrQueryTimePerTimeline {
|
||||
pub struct SmgrQueryTimePerTimeline {
|
||||
metrics: [GlobalAndPerTimelineHistogram; SmgrQueryType::COUNT],
|
||||
}
|
||||
|
||||
@@ -1175,8 +1181,8 @@ static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
|
||||
.map(|ms| (ms as f64) / 1000.0)
|
||||
});
|
||||
|
||||
pub(crate) struct BasebackupQueryTime(HistogramVec);
|
||||
pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
|
||||
pub struct BasebackupQueryTime(HistogramVec);
|
||||
pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
|
||||
BasebackupQueryTime({
|
||||
register_histogram_vec!(
|
||||
"pageserver_basebackup_query_seconds",
|
||||
@@ -1196,7 +1202,7 @@ impl DurationResultObserver for BasebackupQueryTime {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
pub static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
"pageserver_live_connections",
|
||||
"Number of live network connections",
|
||||
@@ -1363,8 +1369,6 @@ pub(crate) struct SecondaryModeMetrics {
|
||||
pub(crate) upload_heatmap: IntCounter,
|
||||
pub(crate) upload_heatmap_errors: IntCounter,
|
||||
pub(crate) upload_heatmap_duration: Histogram,
|
||||
pub(crate) download_heatmap: IntCounter,
|
||||
pub(crate) download_layer: IntCounter,
|
||||
}
|
||||
pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| SecondaryModeMetrics {
|
||||
upload_heatmap: register_int_counter!(
|
||||
@@ -1382,16 +1386,6 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| Seco
|
||||
"Time to build and upload a heatmap, including any waiting inside the S3 client"
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
download_heatmap: register_int_counter!(
|
||||
"pageserver_secondary_download_heatmap",
|
||||
"Number of downloads of heatmaps by secondary mode locations"
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
download_layer: register_int_counter!(
|
||||
"pageserver_secondary_download_layer",
|
||||
"Number of downloads of layers by secondary mode locations"
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
});
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
@@ -1661,7 +1655,7 @@ pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
|
||||
Lazy::new(WalRedoProcessCounters::default);
|
||||
|
||||
/// Similar to `prometheus::HistogramTimer` but does not record on drop.
|
||||
pub(crate) struct StorageTimeMetricsTimer {
|
||||
pub struct StorageTimeMetricsTimer {
|
||||
metrics: StorageTimeMetrics,
|
||||
start: Instant,
|
||||
}
|
||||
@@ -1686,7 +1680,7 @@ impl StorageTimeMetricsTimer {
|
||||
/// Timing facilities for an globally histogrammed metric, which is supported by per tenant and
|
||||
/// timeline total sum and count.
|
||||
#[derive(Clone, Debug)]
|
||||
pub(crate) struct StorageTimeMetrics {
|
||||
pub struct StorageTimeMetrics {
|
||||
/// Sum of f64 seconds, per operation, tenant_id and timeline_id
|
||||
timeline_sum: Counter,
|
||||
/// Number of oeprations, per operation, tenant_id and timeline_id
|
||||
@@ -1725,7 +1719,7 @@ impl StorageTimeMetrics {
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct TimelineMetrics {
|
||||
pub struct TimelineMetrics {
|
||||
tenant_id: String,
|
||||
shard_id: String,
|
||||
timeline_id: String,
|
||||
@@ -1933,7 +1927,7 @@ impl Drop for PerTimelineRemotePhysicalSizeGauge {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct RemoteTimelineClientMetrics {
|
||||
pub struct RemoteTimelineClientMetrics {
|
||||
tenant_id: String,
|
||||
timeline_id: String,
|
||||
remote_physical_size_gauge: Mutex<Option<PerTimelineRemotePhysicalSizeGauge>>,
|
||||
@@ -2231,7 +2225,7 @@ impl Drop for RemoteTimelineClientMetrics {
|
||||
|
||||
/// Wrapper future that measures the time spent by a remote storage operation,
|
||||
/// and records the time and success/failure as a prometheus metric.
|
||||
pub(crate) trait MeasureRemoteOp: Sized {
|
||||
pub trait MeasureRemoteOp: Sized {
|
||||
fn measure_remote_op(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
@@ -2256,7 +2250,7 @@ pub(crate) trait MeasureRemoteOp: Sized {
|
||||
impl<T: Sized> MeasureRemoteOp for T {}
|
||||
|
||||
pin_project! {
|
||||
pub(crate) struct MeasuredRemoteOp<F>
|
||||
pub struct MeasuredRemoteOp<F>
|
||||
{
|
||||
#[pin]
|
||||
inner: F,
|
||||
|
||||
@@ -25,7 +25,6 @@ use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, Qu
|
||||
use pq_proto::framed::ConnectionError;
|
||||
use pq_proto::FeStartupPacket;
|
||||
use pq_proto::{BeMessage, FeMessage, RowDescriptor};
|
||||
use std::borrow::Cow;
|
||||
use std::io;
|
||||
use std::net::TcpListener;
|
||||
use std::pin::pin;
|
||||
@@ -54,7 +53,7 @@ use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::import_datadir::import_wal_from_tar;
|
||||
use crate::metrics;
|
||||
use crate::metrics::LIVE_CONNECTIONS_COUNT;
|
||||
use crate::pgdatadir_mapping::{rel_block_to_key, Version};
|
||||
use crate::pgdatadir_mapping::rel_block_to_key;
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
@@ -62,9 +61,6 @@ use crate::tenant::mgr;
|
||||
use crate::tenant::mgr::get_active_tenant_with_timeout;
|
||||
use crate::tenant::mgr::GetActiveTenantError;
|
||||
use crate::tenant::mgr::ShardSelector;
|
||||
use crate::tenant::timeline::WaitLsnError;
|
||||
use crate::tenant::GetTimelineError;
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::trace::Tracer;
|
||||
|
||||
@@ -287,64 +283,6 @@ struct PageServerHandler {
|
||||
connection_ctx: RequestContext,
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
enum PageStreamError {
|
||||
/// We encountered an error that should prompt the client to reconnect:
|
||||
/// in practice this means we drop the connection without sending a response.
|
||||
#[error("Reconnect required: {0}")]
|
||||
Reconnect(Cow<'static, str>),
|
||||
|
||||
/// We were instructed to shutdown while processing the query
|
||||
#[error("Shutting down")]
|
||||
Shutdown,
|
||||
|
||||
/// Something went wrong reading a page: this likely indicates a pageserver bug
|
||||
#[error("Read error: {0}")]
|
||||
Read(PageReconstructError),
|
||||
|
||||
/// Ran out of time waiting for an LSN
|
||||
#[error("LSN timeout: {0}")]
|
||||
LsnTimeout(WaitLsnError),
|
||||
|
||||
/// The entity required to serve the request (tenant or timeline) is not found,
|
||||
/// or is not found in a suitable state to serve a request.
|
||||
#[error("Not found: {0}")]
|
||||
NotFound(std::borrow::Cow<'static, str>),
|
||||
|
||||
/// Request asked for something that doesn't make sense, like an invalid LSN
|
||||
#[error("Bad request: {0}")]
|
||||
BadRequest(std::borrow::Cow<'static, str>),
|
||||
}
|
||||
|
||||
impl From<PageReconstructError> for PageStreamError {
|
||||
fn from(value: PageReconstructError) -> Self {
|
||||
match value {
|
||||
PageReconstructError::Cancelled => Self::Shutdown,
|
||||
e => Self::Read(e),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<GetActiveTimelineError> for PageStreamError {
|
||||
fn from(value: GetActiveTimelineError) -> Self {
|
||||
match value {
|
||||
GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) => Self::Shutdown,
|
||||
GetActiveTimelineError::Tenant(e) => Self::NotFound(format!("{e}").into()),
|
||||
GetActiveTimelineError::Timeline(e) => Self::NotFound(format!("{e}").into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<WaitLsnError> for PageStreamError {
|
||||
fn from(value: WaitLsnError) -> Self {
|
||||
match value {
|
||||
e @ WaitLsnError::Timeout(_) => Self::LsnTimeout(e),
|
||||
WaitLsnError::Shutdown => Self::Shutdown,
|
||||
WaitLsnError::BadState => Self::Reconnect("Timeline is not active".into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PageServerHandler {
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
@@ -490,7 +428,7 @@ impl PageServerHandler {
|
||||
// Check that the timeline exists
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(|e| QueryError::NotFound(format!("{e}").into()))?;
|
||||
.map_err(|e| anyhow::anyhow!(e))?;
|
||||
|
||||
// Avoid starting new requests if the timeline has already started shutting down,
|
||||
// and block timeline shutdown until this request is complete, or drops out due
|
||||
@@ -582,44 +520,32 @@ impl PageServerHandler {
|
||||
}
|
||||
};
|
||||
|
||||
match response {
|
||||
Err(PageStreamError::Shutdown) => {
|
||||
if let Err(e) = &response {
|
||||
// Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet,
|
||||
// because wait_lsn etc will drop out
|
||||
// is_stopping(): [`Timeline::flush_and_shutdown`] has entered
|
||||
// is_canceled(): [`Timeline::shutdown`]` has entered
|
||||
if timeline.cancel.is_cancelled() || timeline.is_stopping() {
|
||||
// If we fail to fulfil a request during shutdown, which may be _because_ of
|
||||
// shutdown, then do not send the error to the client. Instead just drop the
|
||||
// connection.
|
||||
span.in_scope(|| info!("dropping connection due to shutdown"));
|
||||
span.in_scope(|| info!("dropped response during shutdown: {e:#}"));
|
||||
return Err(QueryError::Shutdown);
|
||||
}
|
||||
Err(PageStreamError::Reconnect(reason)) => {
|
||||
span.in_scope(|| info!("handler requested reconnect: {reason}"));
|
||||
return Err(QueryError::Reconnect);
|
||||
}
|
||||
Err(e) if timeline.cancel.is_cancelled() || timeline.is_stopping() => {
|
||||
// This branch accomodates code within request handlers that returns an anyhow::Error instead of a clean
|
||||
// shutdown error, this may be buried inside a PageReconstructError::Other for example.
|
||||
//
|
||||
// Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet,
|
||||
// because wait_lsn etc will drop out
|
||||
// is_stopping(): [`Timeline::flush_and_shutdown`] has entered
|
||||
// is_canceled(): [`Timeline::shutdown`]` has entered
|
||||
span.in_scope(|| info!("dropped error response during shutdown: {e:#}"));
|
||||
return Err(QueryError::Shutdown);
|
||||
}
|
||||
r => {
|
||||
let response_msg = r.unwrap_or_else(|e| {
|
||||
// print the all details to the log with {:#}, but for the client the
|
||||
// error message is enough. Do not log if shutting down, as the anyhow::Error
|
||||
// here includes cancellation which is not an error.
|
||||
span.in_scope(|| error!("error reading relation or page version: {:#}", e));
|
||||
PagestreamBeMessage::Error(PagestreamErrorResponse {
|
||||
message: e.to_string(),
|
||||
})
|
||||
});
|
||||
|
||||
pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?;
|
||||
self.flush_cancellable(pgb, &timeline.cancel).await?;
|
||||
}
|
||||
}
|
||||
|
||||
let response = response.unwrap_or_else(|e| {
|
||||
// print the all details to the log with {:#}, but for the client the
|
||||
// error message is enough. Do not log if shutting down, as the anyhow::Error
|
||||
// here includes cancellation which is not an error.
|
||||
span.in_scope(|| error!("error reading relation or page version: {:#}", e));
|
||||
PagestreamBeMessage::Error(PagestreamErrorResponse {
|
||||
message: e.to_string(),
|
||||
})
|
||||
});
|
||||
|
||||
pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?;
|
||||
self.flush_cancellable(pgb, &timeline.cancel).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -766,7 +692,7 @@ impl PageServerHandler {
|
||||
latest: bool,
|
||||
latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Lsn, PageStreamError> {
|
||||
) -> anyhow::Result<Lsn> {
|
||||
if latest {
|
||||
// Latest page version was requested. If LSN is given, it is a hint
|
||||
// to the page server that there have been no modifications to the
|
||||
@@ -797,19 +723,15 @@ impl PageServerHandler {
|
||||
}
|
||||
} else {
|
||||
if lsn == Lsn(0) {
|
||||
return Err(PageStreamError::BadRequest(
|
||||
"invalid LSN(0) in request".into(),
|
||||
));
|
||||
anyhow::bail!("invalid LSN(0) in request");
|
||||
}
|
||||
timeline.wait_lsn(lsn, ctx).await?;
|
||||
}
|
||||
|
||||
if lsn < **latest_gc_cutoff_lsn {
|
||||
return Err(PageStreamError::BadRequest(format!(
|
||||
"tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
|
||||
lsn, **latest_gc_cutoff_lsn
|
||||
).into()));
|
||||
}
|
||||
anyhow::ensure!(
|
||||
lsn >= **latest_gc_cutoff_lsn,
|
||||
"tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
|
||||
lsn, **latest_gc_cutoff_lsn
|
||||
);
|
||||
Ok(lsn)
|
||||
}
|
||||
|
||||
@@ -818,14 +740,14 @@ impl PageServerHandler {
|
||||
timeline: &Timeline,
|
||||
req: &PagestreamExistsRequest,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<PagestreamBeMessage, PageStreamError> {
|
||||
) -> anyhow::Result<PagestreamBeMessage> {
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn =
|
||||
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
|
||||
.await?;
|
||||
|
||||
let exists = timeline
|
||||
.get_rel_exists(req.rel, Version::Lsn(lsn), req.latest, ctx)
|
||||
.get_rel_exists(req.rel, lsn, req.latest, ctx)
|
||||
.await?;
|
||||
|
||||
Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
|
||||
@@ -838,15 +760,13 @@ impl PageServerHandler {
|
||||
timeline: &Timeline,
|
||||
req: &PagestreamNblocksRequest,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<PagestreamBeMessage, PageStreamError> {
|
||||
) -> anyhow::Result<PagestreamBeMessage> {
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn =
|
||||
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
|
||||
.await?;
|
||||
|
||||
let n_blocks = timeline
|
||||
.get_rel_size(req.rel, Version::Lsn(lsn), req.latest, ctx)
|
||||
.await?;
|
||||
let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest, ctx).await?;
|
||||
|
||||
Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
|
||||
n_blocks,
|
||||
@@ -858,20 +778,14 @@ impl PageServerHandler {
|
||||
timeline: &Timeline,
|
||||
req: &PagestreamDbSizeRequest,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<PagestreamBeMessage, PageStreamError> {
|
||||
) -> anyhow::Result<PagestreamBeMessage> {
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn =
|
||||
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
|
||||
.await?;
|
||||
|
||||
let total_blocks = timeline
|
||||
.get_db_size(
|
||||
DEFAULTTABLESPACE_OID,
|
||||
req.dbnode,
|
||||
Version::Lsn(lsn),
|
||||
req.latest,
|
||||
ctx,
|
||||
)
|
||||
.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest, ctx)
|
||||
.await?;
|
||||
let db_size = total_blocks as i64 * BLCKSZ as i64;
|
||||
|
||||
@@ -880,35 +794,30 @@ impl PageServerHandler {
|
||||
}))
|
||||
}
|
||||
|
||||
async fn do_handle_get_page_at_lsn_request(
|
||||
&self,
|
||||
timeline: &Timeline,
|
||||
req: &PagestreamGetPageRequest,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<PagestreamBeMessage, PageStreamError> {
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn =
|
||||
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
|
||||
.await?;
|
||||
let page = timeline
|
||||
.get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx)
|
||||
.await?;
|
||||
|
||||
Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
|
||||
page,
|
||||
}))
|
||||
}
|
||||
|
||||
async fn handle_get_page_at_lsn_request(
|
||||
&self,
|
||||
timeline: &Timeline,
|
||||
req: &PagestreamGetPageRequest,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<PagestreamBeMessage, PageStreamError> {
|
||||
) -> anyhow::Result<PagestreamBeMessage> {
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn =
|
||||
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
|
||||
.await?;
|
||||
/*
|
||||
// Add a 1s delay to some requests. The delay helps the requests to
|
||||
// hit the race condition from github issue #1047 more easily.
|
||||
use rand::Rng;
|
||||
if rand::thread_rng().gen::<u8>() < 5 {
|
||||
std::thread::sleep(std::time::Duration::from_millis(1000));
|
||||
}
|
||||
*/
|
||||
|
||||
let key = rel_block_to_key(req.rel, req.blkno);
|
||||
if timeline.get_shard_identity().is_key_local(&key) {
|
||||
self.do_handle_get_page_at_lsn_request(timeline, req, ctx)
|
||||
.await
|
||||
let page = if timeline.get_shard_identity().is_key_local(&key) {
|
||||
timeline
|
||||
.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
|
||||
.await?
|
||||
} else {
|
||||
// The Tenant shard we looked up at connection start does not hold this particular
|
||||
// key: look for other shards in this tenant. This scenario occurs if a pageserver
|
||||
@@ -927,30 +836,30 @@ impl PageServerHandler {
|
||||
Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
|
||||
// We already know this tenant exists in general, because we resolved it at
|
||||
// start of connection. Getting a NotFound here indicates that the shard containing
|
||||
// the requested page is not present on this node: the client's knowledge of shard->pageserver
|
||||
// mapping is out of date.
|
||||
tracing::info!("Page request routed to wrong shard: my identity {:?}, should go to shard {}, key {}",
|
||||
timeline.get_shard_identity(), timeline.get_shard_identity().get_shard_number(&key).0, key);
|
||||
// Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via
|
||||
// client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
|
||||
// and talk to a different pageserver.
|
||||
return Err(PageStreamError::Reconnect(
|
||||
"getpage@lsn request routed to wrong shard".into(),
|
||||
));
|
||||
// the requested page is not present on this node.
|
||||
|
||||
// TODO: this should be some kind of structured error that the client will understand,
|
||||
// so that it can block until its config is updated: this error is expected in the case
|
||||
// that the Tenant's shards' placements are being updated and the client hasn't been
|
||||
// informed yet.
|
||||
//
|
||||
// https://github.com/neondatabase/neon/issues/6038
|
||||
return Err(anyhow::anyhow!("Request routed to wrong shard"));
|
||||
}
|
||||
Err(e) => return Err(e.into()),
|
||||
};
|
||||
|
||||
// Take a GateGuard for the duration of this request. If we were using our main Timeline object,
|
||||
// the GateGuard was already held over the whole connection.
|
||||
let _timeline_guard = timeline
|
||||
.gate
|
||||
.enter()
|
||||
.map_err(|_| PageStreamError::Shutdown)?;
|
||||
let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
|
||||
timeline
|
||||
.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
|
||||
.await?
|
||||
};
|
||||
|
||||
self.do_handle_get_page_at_lsn_request(&timeline, req, ctx)
|
||||
.await
|
||||
}
|
||||
Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
|
||||
page,
|
||||
}))
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
@@ -1091,7 +1000,9 @@ impl PageServerHandler {
|
||||
)
|
||||
.await
|
||||
.map_err(GetActiveTimelineError::Tenant)?;
|
||||
let timeline = tenant.get_timeline(timeline_id, true)?;
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
|
||||
Ok(timeline)
|
||||
}
|
||||
}
|
||||
@@ -1500,8 +1411,7 @@ impl From<GetActiveTenantError> for QueryError {
|
||||
GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected(
|
||||
ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
|
||||
),
|
||||
GetActiveTenantError::Cancelled
|
||||
| GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => {
|
||||
GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => {
|
||||
QueryError::Shutdown
|
||||
}
|
||||
e => QueryError::Other(anyhow::anyhow!(e)),
|
||||
@@ -1514,15 +1424,14 @@ enum GetActiveTimelineError {
|
||||
#[error(transparent)]
|
||||
Tenant(GetActiveTenantError),
|
||||
#[error(transparent)]
|
||||
Timeline(#[from] GetTimelineError),
|
||||
Timeline(anyhow::Error),
|
||||
}
|
||||
|
||||
impl From<GetActiveTimelineError> for QueryError {
|
||||
fn from(e: GetActiveTimelineError) -> Self {
|
||||
match e {
|
||||
GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) => QueryError::Shutdown,
|
||||
GetActiveTimelineError::Tenant(e) => e.into(),
|
||||
GetActiveTimelineError::Timeline(e) => QueryError::NotFound(format!("{e}").into()),
|
||||
GetActiveTimelineError::Timeline(e) => QueryError::Other(e),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11,7 +11,7 @@ use crate::context::RequestContext;
|
||||
use crate::keyspace::{KeySpace, KeySpaceAccum};
|
||||
use crate::repository::*;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::{ensure, Context};
|
||||
use anyhow::Context;
|
||||
use bytes::{Buf, Bytes};
|
||||
use pageserver_api::key::is_rel_block_key;
|
||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||
@@ -147,7 +147,6 @@ impl Timeline {
|
||||
{
|
||||
DatadirModification {
|
||||
tline: self,
|
||||
pending_lsns: Vec::new(),
|
||||
pending_updates: HashMap::new(),
|
||||
pending_deletions: Vec::new(),
|
||||
pending_nblocks: 0,
|
||||
@@ -160,11 +159,11 @@ impl Timeline {
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
/// Look up given page version.
|
||||
pub(crate) async fn get_rel_page_at_lsn(
|
||||
pub async fn get_rel_page_at_lsn(
|
||||
&self,
|
||||
tag: RelTag,
|
||||
blknum: BlockNumber,
|
||||
version: Version<'_>,
|
||||
lsn: Lsn,
|
||||
latest: bool,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
@@ -174,47 +173,44 @@ impl Timeline {
|
||||
));
|
||||
}
|
||||
|
||||
let nblocks = self.get_rel_size(tag, version, latest, ctx).await?;
|
||||
let nblocks = self.get_rel_size(tag, lsn, latest, ctx).await?;
|
||||
if blknum >= nblocks {
|
||||
debug!(
|
||||
"read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
|
||||
tag,
|
||||
blknum,
|
||||
version.get_lsn(),
|
||||
nblocks
|
||||
tag, blknum, lsn, nblocks
|
||||
);
|
||||
return Ok(ZERO_PAGE.clone());
|
||||
}
|
||||
|
||||
let key = rel_block_to_key(tag, blknum);
|
||||
version.get(self, key, ctx).await
|
||||
self.get(key, lsn, ctx).await
|
||||
}
|
||||
|
||||
// Get size of a database in blocks
|
||||
pub(crate) async fn get_db_size(
|
||||
pub async fn get_db_size(
|
||||
&self,
|
||||
spcnode: Oid,
|
||||
dbnode: Oid,
|
||||
version: Version<'_>,
|
||||
lsn: Lsn,
|
||||
latest: bool,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<usize, PageReconstructError> {
|
||||
let mut total_blocks = 0;
|
||||
|
||||
let rels = self.list_rels(spcnode, dbnode, version, ctx).await?;
|
||||
let rels = self.list_rels(spcnode, dbnode, lsn, ctx).await?;
|
||||
|
||||
for rel in rels {
|
||||
let n_blocks = self.get_rel_size(rel, version, latest, ctx).await?;
|
||||
let n_blocks = self.get_rel_size(rel, lsn, latest, ctx).await?;
|
||||
total_blocks += n_blocks as usize;
|
||||
}
|
||||
Ok(total_blocks)
|
||||
}
|
||||
|
||||
/// Get size of a relation file
|
||||
pub(crate) async fn get_rel_size(
|
||||
pub async fn get_rel_size(
|
||||
&self,
|
||||
tag: RelTag,
|
||||
version: Version<'_>,
|
||||
lsn: Lsn,
|
||||
latest: bool,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BlockNumber, PageReconstructError> {
|
||||
@@ -224,12 +220,12 @@ impl Timeline {
|
||||
));
|
||||
}
|
||||
|
||||
if let Some(nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
|
||||
if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) {
|
||||
return Ok(nblocks);
|
||||
}
|
||||
|
||||
if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM)
|
||||
&& !self.get_rel_exists(tag, version, latest, ctx).await?
|
||||
&& !self.get_rel_exists(tag, lsn, latest, ctx).await?
|
||||
{
|
||||
// FIXME: Postgres sometimes calls smgrcreate() to create
|
||||
// FSM, and smgrnblocks() on it immediately afterwards,
|
||||
@@ -239,7 +235,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
let key = rel_size_to_key(tag);
|
||||
let mut buf = version.get(self, key, ctx).await?;
|
||||
let mut buf = self.get(key, lsn, ctx).await?;
|
||||
let nblocks = buf.get_u32_le();
|
||||
|
||||
if latest {
|
||||
@@ -250,16 +246,16 @@ impl Timeline {
|
||||
// latest=true, then it can not cause cache corruption, because with latest=true
|
||||
// pageserver choose max(request_lsn, last_written_lsn) and so cached value will be
|
||||
// associated with most recent value of LSN.
|
||||
self.update_cached_rel_size(tag, version.get_lsn(), nblocks);
|
||||
self.update_cached_rel_size(tag, lsn, nblocks);
|
||||
}
|
||||
Ok(nblocks)
|
||||
}
|
||||
|
||||
/// Does relation exist?
|
||||
pub(crate) async fn get_rel_exists(
|
||||
pub async fn get_rel_exists(
|
||||
&self,
|
||||
tag: RelTag,
|
||||
version: Version<'_>,
|
||||
lsn: Lsn,
|
||||
_latest: bool,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<bool, PageReconstructError> {
|
||||
@@ -270,12 +266,12 @@ impl Timeline {
|
||||
}
|
||||
|
||||
// first try to lookup relation in cache
|
||||
if let Some(_nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
|
||||
if let Some(_nblocks) = self.get_cached_rel_size(&tag, lsn) {
|
||||
return Ok(true);
|
||||
}
|
||||
// fetch directory listing
|
||||
let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
|
||||
let buf = version.get(self, key, ctx).await?;
|
||||
let buf = self.get(key, lsn, ctx).await?;
|
||||
|
||||
match RelDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => {
|
||||
@@ -291,16 +287,16 @@ impl Timeline {
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// This method is cancellation-safe.
|
||||
pub(crate) async fn list_rels(
|
||||
pub async fn list_rels(
|
||||
&self,
|
||||
spcnode: Oid,
|
||||
dbnode: Oid,
|
||||
version: Version<'_>,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<HashSet<RelTag>, PageReconstructError> {
|
||||
// fetch directory listing
|
||||
let key = rel_dir_to_key(spcnode, dbnode);
|
||||
let buf = version.get(self, key, ctx).await?;
|
||||
let buf = self.get(key, lsn, ctx).await?;
|
||||
|
||||
match RelDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => {
|
||||
@@ -319,7 +315,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// Look up given SLRU page version.
|
||||
pub(crate) async fn get_slru_page_at_lsn(
|
||||
pub async fn get_slru_page_at_lsn(
|
||||
&self,
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
@@ -332,29 +328,29 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// Get size of an SLRU segment
|
||||
pub(crate) async fn get_slru_segment_size(
|
||||
pub async fn get_slru_segment_size(
|
||||
&self,
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
version: Version<'_>,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BlockNumber, PageReconstructError> {
|
||||
let key = slru_segment_size_to_key(kind, segno);
|
||||
let mut buf = version.get(self, key, ctx).await?;
|
||||
let mut buf = self.get(key, lsn, ctx).await?;
|
||||
Ok(buf.get_u32_le())
|
||||
}
|
||||
|
||||
/// Get size of an SLRU segment
|
||||
pub(crate) async fn get_slru_segment_exists(
|
||||
pub async fn get_slru_segment_exists(
|
||||
&self,
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
version: Version<'_>,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<bool, PageReconstructError> {
|
||||
// fetch directory listing
|
||||
let key = slru_dir_to_key(kind);
|
||||
let buf = version.get(self, key, ctx).await?;
|
||||
let buf = self.get(key, lsn, ctx).await?;
|
||||
|
||||
match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => {
|
||||
@@ -372,7 +368,7 @@ impl Timeline {
|
||||
/// so it's not well defined which LSN you get if there were multiple commits
|
||||
/// "in flight" at that point in time.
|
||||
///
|
||||
pub(crate) async fn find_lsn_for_timestamp(
|
||||
pub async fn find_lsn_for_timestamp(
|
||||
&self,
|
||||
search_timestamp: TimestampTz,
|
||||
cancel: &CancellationToken,
|
||||
@@ -452,7 +448,7 @@ impl Timeline {
|
||||
/// Additionally, sets 'found_smaller'/'found_Larger, if encounters any commits
|
||||
/// with a smaller/larger timestamp.
|
||||
///
|
||||
pub(crate) async fn is_latest_commit_timestamp_ge_than(
|
||||
pub async fn is_latest_commit_timestamp_ge_than(
|
||||
&self,
|
||||
search_timestamp: TimestampTz,
|
||||
probe_lsn: Lsn,
|
||||
@@ -475,7 +471,7 @@ impl Timeline {
|
||||
/// Obtain the possible timestamp range for the given lsn.
|
||||
///
|
||||
/// If the lsn has no timestamps, returns None. returns `(min, max, median)` if it has timestamps.
|
||||
pub(crate) async fn get_timestamp_for_lsn(
|
||||
pub async fn get_timestamp_for_lsn(
|
||||
&self,
|
||||
probe_lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
@@ -505,11 +501,11 @@ impl Timeline {
|
||||
mut f: impl FnMut(TimestampTz) -> ControlFlow<T>,
|
||||
) -> Result<T, PageReconstructError> {
|
||||
for segno in self
|
||||
.list_slru_segments(SlruKind::Clog, Version::Lsn(probe_lsn), ctx)
|
||||
.list_slru_segments(SlruKind::Clog, probe_lsn, ctx)
|
||||
.await?
|
||||
{
|
||||
let nblocks = self
|
||||
.get_slru_segment_size(SlruKind::Clog, segno, Version::Lsn(probe_lsn), ctx)
|
||||
.get_slru_segment_size(SlruKind::Clog, segno, probe_lsn, ctx)
|
||||
.await?;
|
||||
for blknum in (0..nblocks).rev() {
|
||||
let clog_page = self
|
||||
@@ -532,36 +528,36 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// Get a list of SLRU segments
|
||||
pub(crate) async fn list_slru_segments(
|
||||
pub async fn list_slru_segments(
|
||||
&self,
|
||||
kind: SlruKind,
|
||||
version: Version<'_>,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<HashSet<u32>, PageReconstructError> {
|
||||
// fetch directory entry
|
||||
let key = slru_dir_to_key(kind);
|
||||
|
||||
let buf = version.get(self, key, ctx).await?;
|
||||
let buf = self.get(key, lsn, ctx).await?;
|
||||
match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => Ok(dir.segments),
|
||||
Err(e) => Err(PageReconstructError::from(e)),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn get_relmap_file(
|
||||
pub async fn get_relmap_file(
|
||||
&self,
|
||||
spcnode: Oid,
|
||||
dbnode: Oid,
|
||||
version: Version<'_>,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
let key = relmap_file_key(spcnode, dbnode);
|
||||
|
||||
let buf = version.get(self, key, ctx).await?;
|
||||
let buf = self.get(key, lsn, ctx).await?;
|
||||
Ok(buf)
|
||||
}
|
||||
|
||||
pub(crate) async fn list_dbdirs(
|
||||
pub async fn list_dbdirs(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
@@ -575,7 +571,7 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn get_twophase_file(
|
||||
pub async fn get_twophase_file(
|
||||
&self,
|
||||
xid: TransactionId,
|
||||
lsn: Lsn,
|
||||
@@ -586,7 +582,7 @@ impl Timeline {
|
||||
Ok(buf)
|
||||
}
|
||||
|
||||
pub(crate) async fn list_twophase_files(
|
||||
pub async fn list_twophase_files(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
@@ -600,7 +596,7 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn get_control_file(
|
||||
pub async fn get_control_file(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
@@ -608,7 +604,7 @@ impl Timeline {
|
||||
self.get(CONTROLFILE_KEY, lsn, ctx).await
|
||||
}
|
||||
|
||||
pub(crate) async fn get_checkpoint(
|
||||
pub async fn get_checkpoint(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
@@ -616,7 +612,7 @@ impl Timeline {
|
||||
self.get(CHECKPOINT_KEY, lsn, ctx).await
|
||||
}
|
||||
|
||||
pub(crate) async fn list_aux_files(
|
||||
pub async fn list_aux_files(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
@@ -656,10 +652,7 @@ impl Timeline {
|
||||
|
||||
let mut total_size: u64 = 0;
|
||||
for (spcnode, dbnode) in dbdir.dbdirs.keys() {
|
||||
for rel in self
|
||||
.list_rels(*spcnode, *dbnode, Version::Lsn(lsn), ctx)
|
||||
.await?
|
||||
{
|
||||
for rel in self.list_rels(*spcnode, *dbnode, lsn, ctx).await? {
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(CalculateLogicalSizeError::Cancelled);
|
||||
}
|
||||
@@ -699,7 +692,7 @@ impl Timeline {
|
||||
result.add_key(rel_dir_to_key(spcnode, dbnode));
|
||||
|
||||
let mut rels: Vec<RelTag> = self
|
||||
.list_rels(spcnode, dbnode, Version::Lsn(lsn), ctx)
|
||||
.list_rels(spcnode, dbnode, lsn, ctx)
|
||||
.await?
|
||||
.into_iter()
|
||||
.collect();
|
||||
@@ -806,39 +799,18 @@ pub struct DatadirModification<'a> {
/// in the state in 'tline' yet.
pub tline: &'a Timeline,

/// Current LSN of the modification
lsn: Lsn,
/// Lsn assigned by begin_modification
pub lsn: Lsn,

// The modifications are not applied directly to the underlying key-value store.
// The put-functions add the modifications here, and they are flushed to the
// underlying key-value store by the 'finish' function.
pending_lsns: Vec<Lsn>,
pending_updates: HashMap<Key, Vec<(Lsn, Value)>>,
pending_deletions: Vec<(Range<Key>, Lsn)>,
pending_updates: HashMap<Key, Value>,
pending_deletions: Vec<Range<Key>>,
pending_nblocks: i64,
}

impl<'a> DatadirModification<'a> {
/// Get the current lsn
pub(crate) fn get_lsn(&self) -> Lsn {
self.lsn
}

/// Set the current lsn
pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
ensure!(
lsn >= self.lsn,
"setting an older lsn {} than {} is not allowed",
lsn,
self.lsn
);
if lsn > self.lsn {
self.pending_lsns.push(self.lsn);
self.lsn = lsn;
}
Ok(())
}

/// Initialize a completely new repository.
///
/// This inserts the directory metadata entries that are assumed to
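Aside (not part of the diff): a minimal Rust sketch of the two buffering layouts in the hunk above. The names Lsn, Key, Value, BatchedPending and SimplePending are stand-ins, not the pageserver's real types. The left-hand side stages a list of values per key, each tagged with its LSN, so updates from several LSNs can be batched before flushing; the right-hand side keeps a single staged value per key at the modification's current LSN.

// Illustrative only: stand-in aliases for the pageserver's Lsn, Key and Value types.
use std::collections::HashMap;

type Lsn = u64;
type Key = u64;
type Value = Vec<u8>;

// Left side of the hunk: pending values are batched per key, each tagged with its LSN.
struct BatchedPending {
    lsn: Lsn,
    pending_updates: HashMap<Key, Vec<(Lsn, Value)>>,
}

impl BatchedPending {
    fn put(&mut self, key: Key, val: Value) {
        // Replace a value already staged at the same LSN, otherwise append.
        let values = self.pending_updates.entry(key).or_default();
        if let Some((last_lsn, last_value)) = values.last_mut() {
            if *last_lsn == self.lsn {
                *last_value = val;
                return;
            }
        }
        values.push((self.lsn, val));
    }
}

// Right side of the hunk: a single staged value per key at the current LSN.
struct SimplePending {
    lsn: Lsn,
    pending_updates: HashMap<Key, Value>,
}

impl SimplePending {
    fn put(&mut self, key: Key, val: Value) {
        self.pending_updates.insert(key, val);
    }
}

fn main() {
    let mut batched = BatchedPending { lsn: 10, pending_updates: HashMap::new() };
    batched.put(1, b"a".to_vec());
    batched.lsn = 20;
    batched.put(1, b"b".to_vec());
    assert_eq!(batched.pending_updates[&1].len(), 2); // two LSNs buffered for one key

    let mut simple = SimplePending { lsn: 20, pending_updates: HashMap::new() };
    simple.put(1, b"a".to_vec());
    simple.put(1, b"b".to_vec());
    assert_eq!(simple.pending_updates.len(), 1); // only the latest value survives
}

The batched layout is what lets a single modification carry WAL records from more than one LSN before it is committed, as the Version doc comment further down describes.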
@@ -1012,9 +984,11 @@ impl<'a> DatadirModification<'a> {
|
||||
dbnode: Oid,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let req_lsn = self.tline.get_last_record_lsn();
|
||||
|
||||
let total_blocks = self
|
||||
.tline
|
||||
.get_db_size(spcnode, dbnode, Version::Modified(self), true, ctx)
|
||||
.get_db_size(spcnode, dbnode, req_lsn, true, ctx)
|
||||
.await?;
|
||||
|
||||
// Remove entry from dbdir
|
||||
@@ -1103,11 +1077,8 @@ impl<'a> DatadirModification<'a> {
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
if self
|
||||
.tline
|
||||
.get_rel_exists(rel, Version::Modified(self), true, ctx)
|
||||
.await?
|
||||
{
|
||||
let last_lsn = self.tline.get_last_record_lsn();
|
||||
if self.tline.get_rel_exists(rel, last_lsn, true, ctx).await? {
|
||||
let size_key = rel_size_to_key(rel);
|
||||
// Fetch the old size first
|
||||
let old_size = self.get(size_key, ctx).await?.get_u32_le();
|
||||
@@ -1352,23 +1323,17 @@ impl<'a> DatadirModification<'a> {
|
||||
let writer = self.tline.writer().await;
|
||||
|
||||
// Flush relation and SLRU data blocks, keep metadata.
|
||||
let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
|
||||
for (key, values) in self.pending_updates.drain() {
|
||||
for (lsn, value) in values {
|
||||
if is_rel_block_key(&key) || is_slru_block_key(key) {
|
||||
// This bails out on first error without modifying pending_updates.
|
||||
// That's Ok, cf this function's doc comment.
|
||||
writer.put(key, lsn, &value, ctx).await?;
|
||||
} else {
|
||||
retained_pending_updates
|
||||
.entry(key)
|
||||
.or_default()
|
||||
.push((lsn, value));
|
||||
}
|
||||
let mut retained_pending_updates = HashMap::new();
|
||||
for (key, value) in self.pending_updates.drain() {
|
||||
if is_rel_block_key(&key) || is_slru_block_key(key) {
|
||||
// This bails out on first error without modifying pending_updates.
|
||||
// That's Ok, cf this function's doc comment.
|
||||
writer.put(key, self.lsn, &value, ctx).await?;
|
||||
} else {
|
||||
retained_pending_updates.insert(key, value);
|
||||
}
|
||||
}
|
||||
|
||||
self.pending_updates = retained_pending_updates;
|
||||
self.pending_updates.extend(retained_pending_updates);
|
||||
|
||||
if pending_nblocks != 0 {
|
||||
writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
|
||||
@@ -1385,28 +1350,18 @@ impl<'a> DatadirModification<'a> {
|
||||
///
|
||||
pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
let writer = self.tline.writer().await;
|
||||
|
||||
let lsn = self.lsn;
|
||||
let pending_nblocks = self.pending_nblocks;
|
||||
self.pending_nblocks = 0;
|
||||
|
||||
if !self.pending_updates.is_empty() {
|
||||
writer.put_batch(&self.pending_updates, ctx).await?;
|
||||
self.pending_updates.clear();
|
||||
for (key, value) in self.pending_updates.drain() {
|
||||
writer.put(key, lsn, &value, ctx).await?;
|
||||
}
|
||||
for key_range in self.pending_deletions.drain(..) {
|
||||
writer.delete(key_range, lsn).await?;
|
||||
}
|
||||
|
||||
if !self.pending_deletions.is_empty() {
|
||||
writer.delete_batch(&self.pending_deletions).await?;
|
||||
self.pending_deletions.clear();
|
||||
}
|
||||
|
||||
self.pending_lsns.push(self.lsn);
|
||||
for pending_lsn in self.pending_lsns.drain(..) {
|
||||
// Ideally, we should be able to call writer.finish_write() only once
|
||||
// with the highest LSN. However, the last_record_lsn variable in the
|
||||
// timeline keeps track of the latest LSN and the immediate previous LSN
|
||||
// so we need to record every LSN to not leave a gap between them.
|
||||
writer.finish_write(pending_lsn);
|
||||
}
|
||||
writer.finish_write(lsn);
|
||||
|
||||
if pending_nblocks != 0 {
|
||||
writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
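Aside (not part of the diff): a stand-in tracker showing why the left-hand commit() above reports every buffered LSN. Like the last_record_lsn described in the comment above, it remembers only the latest and the immediately previous LSN, so skipping an intermediate LSN would leave a gap between them. RecordLsn here is hypothetical.

// Illustrative only: a stand-in for the (last, prev) record-LSN pair the
// comment above refers to.
type Lsn = u64;

#[derive(Default)]
struct RecordLsn {
    last: Lsn,
    prev: Lsn,
}

impl RecordLsn {
    fn finish_write(&mut self, lsn: Lsn) {
        self.prev = self.last;
        self.last = lsn;
    }
}

fn main() {
    let mut tracker = RecordLsn::default();
    // Reporting every buffered LSN in order keeps (prev, last) adjacent;
    // jumping straight to 30 would leave prev stuck at 0.
    for pending_lsn in [10, 20, 30] {
        tracker.finish_write(pending_lsn);
    }
    assert_eq!((tracker.prev, tracker.last), (20, 30));
}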
@@ -1415,86 +1370,44 @@ impl<'a> DatadirModification<'a> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn len(&self) -> usize {
|
||||
self.pending_updates.len() + self.pending_deletions.len()
|
||||
pub(crate) fn is_empty(&self) -> bool {
|
||||
self.pending_updates.is_empty() && self.pending_deletions.is_empty()
|
||||
}
|
||||
|
||||
// Internal helper functions to batch the modifications
|
||||
|
||||
async fn get(&self, key: Key, ctx: &RequestContext) -> Result<Bytes, PageReconstructError> {
|
||||
// Have we already updated the same key? Read the latest pending updated
|
||||
// Have we already updated the same key? Read the pending updated
|
||||
// version in that case.
|
||||
//
|
||||
// Note: we don't check pending_deletions. It is an error to request a
|
||||
// value that has been removed; deletion only avoids leaking storage.
|
||||
if let Some(values) = self.pending_updates.get(&key) {
|
||||
if let Some((_, value)) = values.last() {
|
||||
return if let Value::Image(img) = value {
|
||||
Ok(img.clone())
|
||||
} else {
|
||||
// Currently, we never need to read back a WAL record that we
|
||||
// inserted in the same "transaction". All the metadata updates
|
||||
// work directly with Images, and we never need to read actual
|
||||
// data pages. We could handle this if we had to, by calling
|
||||
// the walredo manager, but let's keep it simple for now.
|
||||
Err(PageReconstructError::from(anyhow::anyhow!(
|
||||
"unexpected pending WAL record"
|
||||
)))
|
||||
};
|
||||
if let Some(value) = self.pending_updates.get(&key) {
|
||||
if let Value::Image(img) = value {
|
||||
Ok(img.clone())
|
||||
} else {
|
||||
// Currently, we never need to read back a WAL record that we
|
||||
// inserted in the same "transaction". All the metadata updates
|
||||
// work directly with Images, and we never need to read actual
|
||||
// data pages. We could handle this if we had to, by calling
|
||||
// the walredo manager, but let's keep it simple for now.
|
||||
Err(PageReconstructError::from(anyhow::anyhow!(
|
||||
"unexpected pending WAL record"
|
||||
)))
|
||||
}
|
||||
} else {
|
||||
let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
|
||||
self.tline.get(key, lsn, ctx).await
|
||||
}
|
||||
let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
|
||||
self.tline.get(key, lsn, ctx).await
|
||||
}
|
||||
|
||||
fn put(&mut self, key: Key, val: Value) {
|
||||
let values = self.pending_updates.entry(key).or_default();
|
||||
// Replace the previous value if it exists at the same lsn
|
||||
if let Some((last_lsn, last_value)) = values.last_mut() {
|
||||
if *last_lsn == self.lsn {
|
||||
*last_value = val;
|
||||
return;
|
||||
}
|
||||
}
|
||||
values.push((self.lsn, val));
|
||||
self.pending_updates.insert(key, val);
|
||||
}
|
||||
|
||||
fn delete(&mut self, key_range: Range<Key>) {
|
||||
trace!("DELETE {}-{}", key_range.start, key_range.end);
|
||||
self.pending_deletions.push((key_range, self.lsn));
|
||||
}
|
||||
}
|
||||
|
||||
/// This struct facilitates accessing either a committed key from the timeline at a
/// specific LSN, or the latest uncommitted key from a pending modification.
/// During WAL ingestion, the records from multiple LSNs may be batched in the same
/// modification before being flushed to the timeline. Hence, the routines in WalIngest
/// need to look up the keys in the modification first before looking them up in the
/// timeline to not miss the latest updates.
#[derive(Clone, Copy)]
pub enum Version<'a> {
Lsn(Lsn),
Modified(&'a DatadirModification<'a>),
}

impl<'a> Version<'a> {
async fn get(
&self,
timeline: &Timeline,
key: Key,
ctx: &RequestContext,
) -> Result<Bytes, PageReconstructError> {
match self {
Version::Lsn(lsn) => timeline.get(key, *lsn, ctx).await,
Version::Modified(modification) => modification.get(key, ctx).await,
}
}

fn get_lsn(&self) -> Lsn {
match self {
Version::Lsn(lsn) => *lsn,
Version::Modified(modification) => modification.lsn,
}
self.pending_deletions.push(key_range);
}
}

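Aside (not part of the diff): a simplified, synchronous sketch of the lookup order the Version doc comment above describes. Modification, Store and the type aliases are hypothetical stand-ins; the point is that a pending, uncommitted value wins over the committed store.

// Illustrative only: Modification, Store and the aliases below are hypothetical
// stand-ins, and the lookup is synchronous for brevity.
use std::collections::HashMap;

type Lsn = u64;
type Key = u64;
type Bytes = Vec<u8>;

struct Modification {
    lsn: Lsn,
    pending: HashMap<Key, Bytes>,
}

struct Store {
    committed: HashMap<(Key, Lsn), Bytes>,
}

enum Version<'a> {
    Lsn(Lsn),
    Modified(&'a Modification),
}

impl<'a> Version<'a> {
    fn get(&self, store: &Store, key: Key) -> Option<Bytes> {
        match self {
            // Committed read: look the key up at the requested LSN.
            Version::Lsn(lsn) => store.committed.get(&(key, *lsn)).cloned(),
            // Pending read: prefer the staged value, then fall back to the store.
            Version::Modified(m) => m
                .pending
                .get(&key)
                .cloned()
                .or_else(|| store.committed.get(&(key, m.lsn)).cloned()),
        }
    }
}

fn main() {
    let store = Store {
        committed: HashMap::from([((7, 100), b"committed".to_vec())]),
    };
    let m = Modification {
        lsn: 100,
        pending: HashMap::from([(7, b"pending".to_vec())]),
    };
    assert_eq!(Version::Modified(&m).get(&store, 7), Some(b"pending".to_vec()));
    assert_eq!(Version::Lsn(100).get(&store, 7), Some(b"committed".to_vec()));
}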
@@ -1863,7 +1776,6 @@ pub fn is_inherited_key(key: Key) -> bool {
|
||||
key != AUX_FILES_KEY
|
||||
}
|
||||
|
||||
/// Guaranteed to return `Ok()` if [[is_rel_block_key]] returns `true` for `key`.
|
||||
pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
|
||||
Ok(match key.field1 {
|
||||
0x00 => (
|
||||
@@ -1878,6 +1790,7 @@ pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
|
||||
_ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn is_rel_fsm_block_key(key: Key) -> bool {
|
||||
key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
|
||||
}
|
||||
|
||||
@@ -23,7 +23,7 @@ impl Statvfs {
|
||||
}
|
||||
|
||||
// NB: allow() because the block count type is u32 on macOS.
|
||||
#[allow(clippy::useless_conversion, clippy::unnecessary_fallible_conversions)]
|
||||
#[allow(clippy::useless_conversion)]
|
||||
pub fn blocks(&self) -> u64 {
|
||||
match self {
|
||||
Statvfs::Real(stat) => u64::try_from(stat.blocks()).unwrap(),
|
||||
@@ -32,7 +32,7 @@ impl Statvfs {
|
||||
}
|
||||
|
||||
// NB: allow() because the block count type is u32 on macOS.
|
||||
#[allow(clippy::useless_conversion, clippy::unnecessary_fallible_conversions)]
|
||||
#[allow(clippy::useless_conversion)]
|
||||
pub fn blocks_available(&self) -> u64 {
|
||||
match self {
|
||||
Statvfs::Real(stat) => u64::try_from(stat.blocks_available()).unwrap(),
|
||||
|
||||
@@ -147,7 +147,7 @@ pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy<usize> = Lazy::new(||
|
||||
// else, but that has not been needed in a long time.
|
||||
std::env::var("TOKIO_WORKER_THREADS")
|
||||
.map(|s| s.parse::<usize>().unwrap())
|
||||
.unwrap_or_else(|_e| usize::max(2, num_cpus::get()))
|
||||
.unwrap_or_else(|_e| usize::max(1, num_cpus::get()))
|
||||
});
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
@@ -258,9 +258,6 @@ pub enum TaskKind {
|
||||
/// See [`crate::disk_usage_eviction_task`].
|
||||
DiskUsageEviction,
|
||||
|
||||
/// See [`crate::tenant::secondary`].
|
||||
SecondaryDownloads,
|
||||
|
||||
/// See [`crate::tenant::secondary`].
|
||||
SecondaryUploads,
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
//!
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use camino::Utf8Path;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use enumset::EnumSet;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use futures::FutureExt;
|
||||
@@ -33,7 +33,6 @@ use tracing::*;
|
||||
use utils::backoff;
|
||||
use utils::completion;
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::failpoint_support;
|
||||
use utils::fs_ext;
|
||||
use utils::sync::gate::Gate;
|
||||
use utils::sync::gate::GateGuard;
|
||||
@@ -56,7 +55,6 @@ use self::timeline::uninit::TimelineUninitMark;
|
||||
use self::timeline::uninit::UninitializedTimeline;
|
||||
use self::timeline::EvictionTaskTenantState;
|
||||
use self::timeline::TimelineResources;
|
||||
use self::timeline::WaitLsnError;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
@@ -130,13 +128,6 @@ macro_rules! pausable_failpoint {
|
||||
.expect("spawn_blocking");
|
||||
}
|
||||
};
|
||||
($name:literal, $cond:expr) => {
|
||||
if cfg!(feature = "testing") {
|
||||
if $cond {
|
||||
pausable_failpoint!($name)
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
pub mod blob_io;
|
||||
@@ -603,9 +594,10 @@ impl Tenant {
|
||||
mode: SpawnMode,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Tenant>> {
|
||||
// TODO(sharding): make WalRedoManager shard-aware
|
||||
let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
tenant_shard_id.tenant_id,
|
||||
)));
|
||||
|
||||
let TenantSharedResources {
|
||||
@@ -898,7 +890,7 @@ impl Tenant {
|
||||
) -> anyhow::Result<()> {
|
||||
span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
failpoint_support::sleep_millis_async!("before-attaching-tenant");
|
||||
crate::failpoint_support::sleep_millis_async!("before-attaching-tenant");
|
||||
|
||||
let preload = match preload {
|
||||
Some(p) => p,
|
||||
@@ -1010,7 +1002,7 @@ impl Tenant {
|
||||
// IndexPart is the source of truth.
|
||||
self.clean_up_timelines(&existent_timelines)?;
|
||||
|
||||
failpoint_support::sleep_millis_async!("attach-before-activate", &self.cancel);
|
||||
crate::failpoint_support::sleep_millis_async!("attach-before-activate");
|
||||
|
||||
info!("Done");
|
||||
|
||||
@@ -1152,9 +1144,10 @@ impl Tenant {
|
||||
tenant_shard_id: TenantShardId,
|
||||
reason: String,
|
||||
) -> Arc<Tenant> {
|
||||
// TODO(sharding): make WalRedoManager shard-aware
|
||||
let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
tenant_shard_id.tenant_id,
|
||||
)));
|
||||
Arc::new(Tenant::new(
|
||||
TenantState::Broken {
|
||||
@@ -1559,10 +1552,6 @@ impl Tenant {
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn list_timeline_ids(&self) -> Vec<TimelineId> {
|
||||
self.timelines.lock().unwrap().keys().cloned().collect()
|
||||
}
|
||||
|
||||
/// This is used to create the initial 'main' timeline during bootstrapping,
|
||||
/// or when importing a new base backup. The caller is expected to load an
|
||||
/// initial image of the datadir to the new timeline after this.
|
||||
@@ -1766,15 +1755,7 @@ impl Tenant {
|
||||
// decoding the new WAL might need to look up previous pages, relation
|
||||
// sizes etc. and that would get confused if the previous page versions
|
||||
// are not in the repository yet.
|
||||
ancestor_timeline
|
||||
.wait_lsn(*lsn, ctx)
|
||||
.await
|
||||
.map_err(|e| match e {
|
||||
e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => {
|
||||
CreateTimelineError::AncestorLsn(anyhow::anyhow!(e))
|
||||
}
|
||||
WaitLsnError::Shutdown => CreateTimelineError::ShuttingDown,
|
||||
})?;
|
||||
ancestor_timeline.wait_lsn(*lsn, ctx).await?;
|
||||
}
|
||||
|
||||
self.branch_timeline(
|
||||
@@ -2043,13 +2024,6 @@ impl Tenant {
|
||||
// It's messed up.
|
||||
// we just ignore the failure to stop
|
||||
|
||||
// If we're still attaching, fire the cancellation token early to drop out: this
|
||||
// will prevent us flushing, but ensures timely shutdown if some I/O during attach
|
||||
// is very slow.
|
||||
if matches!(self.current_state(), TenantState::Attaching) {
|
||||
self.cancel.cancel();
|
||||
}
|
||||
|
||||
match self.set_stopping(shutdown_progress, false, false).await {
|
||||
Ok(()) => {}
|
||||
Err(SetStoppingError::Broken) => {
|
||||
@@ -2748,10 +2722,6 @@ impl Tenant {
|
||||
"#
|
||||
.to_string();
|
||||
|
||||
fail::fail_point!("tenant-config-before-write", |_| {
|
||||
anyhow::bail!("tenant-config-before-write");
|
||||
});
|
||||
|
||||
// Convert the config to a toml file.
|
||||
conf_content += &toml_edit::ser::to_string_pretty(&location_conf)?;
|
||||
|
||||
@@ -2865,7 +2835,9 @@ impl Tenant {
|
||||
}
|
||||
};
|
||||
|
||||
failpoint_support::sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");
|
||||
crate::failpoint_support::sleep_millis_async!(
|
||||
"gc_iteration_internal_after_getting_gc_timelines"
|
||||
);
|
||||
|
||||
// If there is nothing to GC, we don't want any messages in the INFO log.
|
||||
if !gc_timelines.is_empty() {
|
||||
@@ -3158,7 +3130,6 @@ impl Tenant {
|
||||
|
||||
/// For unit tests, make this visible so that other modules can directly create timelines
|
||||
#[cfg(test)]
|
||||
#[tracing::instrument(fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))]
|
||||
pub(crate) async fn bootstrap_timeline_test(
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
@@ -3668,6 +3639,140 @@ fn remove_timeline_and_uninit_mark(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn create_tenant_files(
|
||||
conf: &'static PageServerConf,
|
||||
location_conf: &LocationConf,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
) -> anyhow::Result<Utf8PathBuf> {
|
||||
let target_tenant_directory = conf.tenant_path(tenant_shard_id);
|
||||
anyhow::ensure!(
|
||||
!target_tenant_directory
|
||||
.try_exists()
|
||||
.context("check existence of tenant directory")?,
|
||||
"tenant directory already exists",
|
||||
);
|
||||
|
||||
let temporary_tenant_dir =
|
||||
path_with_suffix_extension(&target_tenant_directory, TEMP_FILE_SUFFIX);
|
||||
debug!("Creating temporary directory structure in {temporary_tenant_dir}");
|
||||
|
||||
// top-level dir may exist if we are creating it through CLI
|
||||
crashsafe::create_dir_all(&temporary_tenant_dir).with_context(|| {
|
||||
format!("could not create temporary tenant directory {temporary_tenant_dir}")
|
||||
})?;
|
||||
|
||||
let creation_result = try_create_target_tenant_dir(
|
||||
conf,
|
||||
location_conf,
|
||||
tenant_shard_id,
|
||||
&temporary_tenant_dir,
|
||||
&target_tenant_directory,
|
||||
)
|
||||
.await;
|
||||
|
||||
if creation_result.is_err() {
|
||||
error!(
|
||||
"Failed to create directory structure for tenant {tenant_shard_id}, cleaning tmp data"
|
||||
);
|
||||
if let Err(e) = fs::remove_dir_all(&temporary_tenant_dir) {
|
||||
error!("Failed to remove temporary tenant directory {temporary_tenant_dir:?}: {e}")
|
||||
} else if let Err(e) = crashsafe::fsync(&temporary_tenant_dir) {
|
||||
error!(
|
||||
"Failed to fsync removed temporary tenant directory {temporary_tenant_dir:?}: {e}"
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
creation_result?;
|
||||
|
||||
Ok(target_tenant_directory)
|
||||
}
|
||||
|
||||
async fn try_create_target_tenant_dir(
|
||||
conf: &'static PageServerConf,
|
||||
location_conf: &LocationConf,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
temporary_tenant_dir: &Utf8Path,
|
||||
target_tenant_directory: &Utf8Path,
|
||||
) -> Result<(), anyhow::Error> {
|
||||
let temporary_tenant_timelines_dir = rebase_directory(
|
||||
&conf.timelines_path(tenant_shard_id),
|
||||
target_tenant_directory,
|
||||
temporary_tenant_dir,
|
||||
)
|
||||
.with_context(|| format!("resolve tenant {tenant_shard_id} temporary timelines dir"))?;
|
||||
let temporary_legacy_tenant_config_path = rebase_directory(
|
||||
&conf.tenant_config_path(tenant_shard_id),
|
||||
target_tenant_directory,
|
||||
temporary_tenant_dir,
|
||||
)
|
||||
.with_context(|| format!("resolve tenant {tenant_shard_id} temporary config path"))?;
|
||||
let temporary_tenant_config_path = rebase_directory(
|
||||
&conf.tenant_location_config_path(tenant_shard_id),
|
||||
target_tenant_directory,
|
||||
temporary_tenant_dir,
|
||||
)
|
||||
.with_context(|| format!("resolve tenant {tenant_shard_id} temporary config path"))?;
|
||||
|
||||
Tenant::persist_tenant_config_at(
|
||||
tenant_shard_id,
|
||||
&temporary_tenant_config_path,
|
||||
&temporary_legacy_tenant_config_path,
|
||||
location_conf,
|
||||
)
|
||||
.await?;
|
||||
|
||||
crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| {
|
||||
format!(
|
||||
"create tenant {} temporary timelines directory {}",
|
||||
tenant_shard_id, temporary_tenant_timelines_dir,
|
||||
)
|
||||
})?;
|
||||
fail::fail_point!("tenant-creation-before-tmp-rename", |_| {
|
||||
anyhow::bail!("failpoint tenant-creation-before-tmp-rename");
|
||||
});
|
||||
|
||||
// Make sure the current tenant directory entries are durable before renaming.
|
||||
// Without this, a crash may reorder any of the directory entry creations above.
|
||||
crashsafe::fsync(temporary_tenant_dir)
|
||||
.with_context(|| format!("sync temporary tenant directory {temporary_tenant_dir:?}"))?;
|
||||
|
||||
fs::rename(temporary_tenant_dir, target_tenant_directory).with_context(|| {
|
||||
format!(
|
||||
"move tenant {} temporary directory {} into the permanent one {}",
|
||||
tenant_shard_id, temporary_tenant_dir, target_tenant_directory
|
||||
)
|
||||
})?;
|
||||
let target_dir_parent = target_tenant_directory.parent().with_context(|| {
|
||||
format!(
|
||||
"get tenant {} dir parent for {}",
|
||||
tenant_shard_id, target_tenant_directory,
|
||||
)
|
||||
})?;
|
||||
crashsafe::fsync(target_dir_parent).with_context(|| {
|
||||
format!(
|
||||
"fsync renamed directory's parent {} for tenant {}",
|
||||
target_dir_parent, tenant_shard_id,
|
||||
)
|
||||
})?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn rebase_directory(
original_path: &Utf8Path,
base: &Utf8Path,
new_base: &Utf8Path,
) -> anyhow::Result<Utf8PathBuf> {
let relative_path = original_path.strip_prefix(base).with_context(|| {
format!(
"Failed to strip base prefix '{}' off path '{}'",
base, original_path
)
})?;
Ok(new_base.join(relative_path))
}
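
Aside (not part of the diff): a usage sketch of the rebase idea above, using std::path stand-ins for the Utf8Path types; the concrete paths are made up for illustration.

// Usage sketch with std::path stand-ins for the Utf8Path types above.
use std::path::{Path, PathBuf};

fn rebase_directory(original: &Path, base: &Path, new_base: &Path) -> Option<PathBuf> {
    // Same idea as above: keep the path's suffix, swap the prefix.
    original.strip_prefix(base).ok().map(|rel| new_base.join(rel))
}

fn main() {
    let rebased = rebase_directory(
        Path::new("/data/tenants/1234/timelines"),
        Path::new("/data/tenants/1234"),
        Path::new("/data/tenants/1234.___temp"),
    );
    assert_eq!(
        rebased,
        Some(PathBuf::from("/data/tenants/1234.___temp/timelines"))
    );
}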
|
||||
|
||||
/// Create the cluster temporarily in 'initdbpath' directory inside the repository
|
||||
/// to get bootstrap data for timeline initialization.
|
||||
async fn run_initdb(
|
||||
@@ -3762,7 +3867,6 @@ pub async fn dump_layerfile_from_path(
|
||||
#[cfg(test)]
|
||||
pub(crate) mod harness {
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use camino::Utf8PathBuf;
|
||||
use once_cell::sync::OnceCell;
|
||||
use pageserver_api::shard::ShardIndex;
|
||||
use std::fs;
|
||||
@@ -3830,6 +3934,8 @@ pub(crate) mod harness {
|
||||
pub struct TenantHarness {
|
||||
pub conf: &'static PageServerConf,
|
||||
pub tenant_conf: TenantConf,
|
||||
// TODO(sharding): remove duplicative `tenant_id` in favor of access to tenant_shard_id
|
||||
pub(crate) tenant_id: TenantId,
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
pub generation: Generation,
|
||||
pub shard: ShardIndex,
|
||||
@@ -3891,6 +3997,7 @@ pub(crate) mod harness {
|
||||
Ok(Self {
|
||||
conf,
|
||||
tenant_conf,
|
||||
tenant_id,
|
||||
tenant_shard_id,
|
||||
generation: Generation::new(0xdeadbeef),
|
||||
shard: ShardIndex::unsharded(),
|
||||
|
||||
@@ -46,8 +46,6 @@ pub mod defaults {
|
||||
pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
|
||||
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
|
||||
pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
|
||||
|
||||
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
|
||||
@@ -48,9 +48,6 @@ pub(crate) enum DeleteTenantError {
|
||||
#[error("Timeline {0}")]
|
||||
Timeline(#[from] DeleteTimelineError),
|
||||
|
||||
#[error("Cancelled")]
|
||||
Cancelled,
|
||||
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
@@ -588,7 +585,7 @@ impl DeleteTenantFlow {
|
||||
}
|
||||
break;
|
||||
}
|
||||
TenantsMapRemoveResult::Occupied(TenantSlot::Secondary(_)) => {
|
||||
TenantsMapRemoveResult::Occupied(TenantSlot::Secondary) => {
|
||||
// This is unexpected: this secondary tenant should not have been created, and we
|
||||
// are not in a position to shut it down from here.
|
||||
tracing::warn!("Tenant transitioned to secondary mode while deleting!");
|
||||
|
||||
@@ -35,7 +35,7 @@ use crate::tenant::config::{
|
||||
};
|
||||
use crate::tenant::delete::DeleteTenantFlow;
|
||||
use crate::tenant::span::debug_assert_current_span_has_tenant_id;
|
||||
use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
|
||||
use crate::tenant::{create_tenant_files, AttachedTenantConf, SpawnMode, Tenant, TenantState};
|
||||
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};
|
||||
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
@@ -44,7 +44,6 @@ use utils::generation::Generation;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use super::delete::DeleteTenantError;
|
||||
use super::secondary::SecondaryTenant;
|
||||
use super::TenantSharedResources;
|
||||
|
||||
/// For a tenant that appears in TenantsMap, it may either be
@@ -58,7 +57,7 @@ use super::TenantSharedResources;
/// having a properly acquired generation (Secondary doesn't need a generation)
pub(crate) enum TenantSlot {
Attached(Arc<Tenant>),
Secondary(Arc<SecondaryTenant>),
Secondary,
/// In this state, other administrative operations acting on the TenantId should
/// block, or return a retry indicator equivalent to HTTP 503.
InProgress(utils::completion::Barrier),
|
||||
@@ -68,7 +67,7 @@ impl std::fmt::Debug for TenantSlot {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
Self::Attached(tenant) => write!(f, "Attached({})", tenant.current_state()),
|
||||
Self::Secondary(_) => write!(f, "Secondary"),
|
||||
Self::Secondary => write!(f, "Secondary"),
|
||||
Self::InProgress(_) => write!(f, "InProgress"),
|
||||
}
|
||||
}
|
||||
@@ -79,7 +78,7 @@ impl TenantSlot {
|
||||
fn get_attached(&self) -> Option<&Arc<Tenant>> {
|
||||
match self {
|
||||
Self::Attached(t) => Some(t),
|
||||
Self::Secondary(_) => None,
|
||||
Self::Secondary => None,
|
||||
Self::InProgress(_) => None,
|
||||
}
|
||||
}
|
||||
@@ -131,7 +130,7 @@ impl TenantsMap {
|
||||
|
||||
/// A page service client sends a TenantId, and to look up the correct Tenant we must
|
||||
/// resolve this to a fully qualified TenantShardId.
|
||||
fn resolve_attached_shard(
|
||||
fn resolve_shard(
|
||||
&self,
|
||||
tenant_id: &TenantId,
|
||||
selector: ShardSelector,
|
||||
@@ -141,27 +140,25 @@ impl TenantsMap {
|
||||
TenantsMap::Initializing => None,
|
||||
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
|
||||
for slot in m.range(TenantShardId::tenant_range(*tenant_id)) {
|
||||
// Ignore all slots that don't contain an attached tenant
|
||||
let tenant = match &slot.1 {
|
||||
TenantSlot::Attached(t) => t,
|
||||
_ => continue,
|
||||
};
|
||||
|
||||
match selector {
|
||||
ShardSelector::First => return Some(*slot.0),
|
||||
ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
|
||||
return Some(*slot.0)
|
||||
}
|
||||
ShardSelector::Page(key) => {
|
||||
// First slot we see for this tenant, calculate the expected shard number
|
||||
// for the key: we will use this for checking if this and subsequent
|
||||
// slots contain the key, rather than recalculating the hash each time.
|
||||
if want_shard.is_none() {
|
||||
want_shard = Some(tenant.shard_identity.get_shard_number(&key));
|
||||
}
|
||||
if let Some(tenant) = slot.1.get_attached() {
|
||||
// First slot we see for this tenant, calculate the expected shard number
|
||||
// for the key: we will use this for checking if this and subsequent
|
||||
// slots contain the key, rather than recalculating the hash each time.
|
||||
if want_shard.is_none() {
|
||||
want_shard = Some(tenant.shard_identity.get_shard_number(&key));
|
||||
}
|
||||
|
||||
if Some(tenant.shard_identity.number) == want_shard {
|
||||
return Some(*slot.0);
|
||||
if Some(tenant.shard_identity.number) == want_shard {
|
||||
return Some(*slot.0);
|
||||
}
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
_ => continue,
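Aside (not part of the diff): the shard-resolution loop above in miniature, with stand-in types; the modulo routing here is an assumption standing in for the real key-hash mapping. The expected shard number is computed once from the key, then each attached slot is checked against it.

// Illustrative only: stand-in types, and modulo routing standing in for the
// real key-hash mapping.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
struct ShardNumber(u8);

struct ShardIdentity {
    number: ShardNumber,
    count: u8,
}

impl ShardIdentity {
    fn get_shard_number(&self, key: u64) -> ShardNumber {
        ShardNumber((key % u64::from(self.count)) as u8)
    }
}

// Return the id of the first attached shard whose shard number owns the key.
fn resolve_shard_for_key(shards: &[(u32, ShardIdentity)], key: u64) -> Option<u32> {
    let mut want_shard = None;
    for (shard_id, identity) in shards {
        // Compute the expected shard number once, on the first slot we see.
        if want_shard.is_none() {
            want_shard = Some(identity.get_shard_number(key));
        }
        if Some(identity.number) == want_shard {
            return Some(*shard_id);
        }
    }
    None
}

fn main() {
    let shards = [
        (0, ShardIdentity { number: ShardNumber(0), count: 4 }),
        (1, ShardIdentity { number: ShardNumber(1), count: 4 }),
    ];
    assert_eq!(resolve_shard_for_key(&shards, 5), Some(1)); // 5 % 4 == 1
}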
@@ -467,18 +464,12 @@ pub async fn init_tenant_mgr(
|
||||
*gen
|
||||
} else {
|
||||
match &location_conf.mode {
|
||||
LocationMode::Secondary(secondary_config) => {
|
||||
LocationMode::Secondary(_) => {
|
||||
// We do not require the control plane's permission for secondary mode
|
||||
// tenants, because they do no remote writes and hence require no
|
||||
// generation number
|
||||
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Loaded tenant in secondary mode");
|
||||
tenants.insert(
|
||||
tenant_shard_id,
|
||||
TenantSlot::Secondary(SecondaryTenant::new(
|
||||
tenant_shard_id,
|
||||
secondary_config,
|
||||
)),
|
||||
);
|
||||
tenants.insert(tenant_shard_id, TenantSlot::Secondary);
|
||||
}
|
||||
LocationMode::Attached(_) => {
|
||||
// TODO: augment re-attach API to enable the control plane to
|
||||
@@ -523,7 +514,10 @@ pub async fn init_tenant_mgr(
|
||||
&ctx,
|
||||
) {
|
||||
Ok(tenant) => {
|
||||
tenants.insert(tenant_shard_id, TenantSlot::Attached(tenant));
|
||||
tenants.insert(
|
||||
TenantShardId::unsharded(tenant.tenant_id()),
|
||||
TenantSlot::Attached(tenant),
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}");
|
||||
@@ -670,14 +664,8 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
|
||||
|
||||
total_attached += 1;
|
||||
}
|
||||
TenantSlot::Secondary(state) => {
|
||||
// We don't need to wait for this individually per-tenant: the
|
||||
// downloader task will be waited on eventually, this cancel
|
||||
// is just to encourage it to drop out if it is doing work
|
||||
// for this tenant right now.
|
||||
state.cancel.cancel();
|
||||
|
||||
shutdown_state.insert(tenant_shard_id, TenantSlot::Secondary(state));
|
||||
TenantSlot::Secondary => {
|
||||
shutdown_state.insert(tenant_shard_id, TenantSlot::Secondary);
|
||||
}
|
||||
TenantSlot::InProgress(notify) => {
|
||||
// InProgress tenants are not visible in TenantsMap::ShuttingDown: we will
|
||||
@@ -754,6 +742,45 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
|
||||
// caller will log how long we took
|
||||
}
|
||||
|
||||
pub(crate) async fn create_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: TenantConfOpt,
|
||||
tenant_shard_id: TenantShardId,
|
||||
generation: Generation,
|
||||
resources: TenantSharedResources,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Arc<Tenant>, TenantMapInsertError> {
|
||||
let location_conf = LocationConf::attached_single(tenant_conf, generation);
|
||||
info!("Creating tenant at location {location_conf:?}");
|
||||
|
||||
let slot_guard =
|
||||
tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
|
||||
let tenant_path = super::create_tenant_files(conf, &location_conf, &tenant_shard_id).await?;
|
||||
|
||||
let shard_identity = location_conf.shard;
|
||||
let created_tenant = tenant_spawn(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
&tenant_path,
|
||||
resources,
|
||||
AttachedTenantConf::try_from(location_conf)?,
|
||||
shard_identity,
|
||||
None,
|
||||
&TENANTS,
|
||||
SpawnMode::Create,
|
||||
ctx,
|
||||
)?;
|
||||
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
|
||||
// See https://github.com/neondatabase/neon/issues/4233
|
||||
|
||||
let created_tenant_id = created_tenant.tenant_id();
|
||||
debug_assert_eq!(created_tenant_id, tenant_shard_id.tenant_id);
|
||||
|
||||
slot_guard.upsert(TenantSlot::Attached(created_tenant.clone()))?;
|
||||
|
||||
Ok(created_tenant)
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub(crate) enum SetNewTenantConfigError {
|
||||
#[error(transparent)]
|
||||
@@ -785,24 +812,6 @@ pub(crate) async fn set_new_tenant_config(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum UpsertLocationError {
|
||||
#[error("Bad config request: {0}")]
|
||||
BadRequest(anyhow::Error),
|
||||
|
||||
#[error("Cannot change config in this state: {0}")]
|
||||
Unavailable(#[from] TenantMapError),
|
||||
|
||||
#[error("Tenant is already being modified")]
|
||||
InProgress,
|
||||
|
||||
#[error("Failed to flush: {0}")]
|
||||
Flush(anyhow::Error),
|
||||
|
||||
#[error("Internal error: {0}")]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
impl TenantManager {
|
||||
/// Convenience function so that anyone with a TenantManager can get at the global configuration, without
|
||||
/// having to pass it around everywhere as a separate object.
|
||||
@@ -839,49 +848,27 @@ impl TenantManager {
|
||||
Some(TenantSlot::InProgress(_)) => {
|
||||
Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
|
||||
}
|
||||
None | Some(TenantSlot::Secondary(_)) => {
|
||||
None | Some(TenantSlot::Secondary) => {
|
||||
Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_secondary_tenant_shard(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
) -> Option<Arc<SecondaryTenant>> {
|
||||
let locked = self.tenants.read().unwrap();
|
||||
|
||||
let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
|
||||
.ok()
|
||||
.flatten();
|
||||
|
||||
match peek_slot {
|
||||
Some(TenantSlot::Secondary(s)) => Some(s.clone()),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
|
||||
pub(crate) async fn upsert_location(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
new_location_config: LocationConf,
|
||||
flush: Option<Duration>,
|
||||
spawn_mode: SpawnMode,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Option<Arc<Tenant>>, UpsertLocationError> {
|
||||
) -> Result<(), anyhow::Error> {
|
||||
debug_assert_current_span_has_tenant_id();
|
||||
info!("configuring tenant location to state {new_location_config:?}");
|
||||
|
||||
enum FastPathModified {
|
||||
Attached(Arc<Tenant>),
|
||||
Secondary(Arc<SecondaryTenant>),
|
||||
}
|
||||
|
||||
// Special case fast-path for updates to existing slots: if our upsert is only updating configuration,
// Special case fast-path for updates to Tenant: if our upsert is only updating configuration,
// then we do not need to set the slot to InProgress, we can just call into the
// existing tenant.
let fast_path_taken = {
|
||||
let modify_tenant = {
|
||||
let locked = self.tenants.read().unwrap();
|
||||
let peek_slot =
|
||||
tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Write)?;
|
||||
@@ -891,24 +878,16 @@ impl TenantManager {
|
||||
// A transition from Attached to Attached in the same generation, we may
|
||||
// take our fast path and just provide the updated configuration
|
||||
// to the tenant.
|
||||
tenant.set_new_location_config(
|
||||
AttachedTenantConf::try_from(new_location_config.clone())
|
||||
.map_err(UpsertLocationError::BadRequest)?,
|
||||
);
|
||||
tenant.set_new_location_config(AttachedTenantConf::try_from(
|
||||
new_location_config.clone(),
|
||||
)?);
|
||||
|
||||
Some(FastPathModified::Attached(tenant.clone()))
|
||||
Some(tenant.clone())
|
||||
} else {
|
||||
// Different generations, fall through to general case
|
||||
None
|
||||
}
|
||||
}
|
||||
(
|
||||
LocationMode::Secondary(secondary_conf),
|
||||
Some(TenantSlot::Secondary(secondary_tenant)),
|
||||
) => {
|
||||
secondary_tenant.set_config(secondary_conf);
|
||||
Some(FastPathModified::Secondary(secondary_tenant.clone()))
|
||||
}
|
||||
_ => {
|
||||
// Not an Attached->Attached transition, fall through to general case
|
||||
None
|
||||
@@ -917,131 +896,101 @@ impl TenantManager {
|
||||
};
|
||||
|
||||
// Fast-path continued: having dropped out of the self.tenants lock, do the async
|
||||
// phase of writing config and/or waiting for flush, before returning.
|
||||
match fast_path_taken {
|
||||
Some(FastPathModified::Attached(tenant)) => {
|
||||
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
|
||||
.await?;
|
||||
|
||||
// Transition to AttachedStale means we may well hold a valid generation
|
||||
// still, and have been requested to go stale as part of a migration. If
|
||||
// the caller set `flush`, then flush to remote storage.
|
||||
if let LocationMode::Attached(AttachedLocationConfig {
|
||||
generation: _,
|
||||
attach_mode: AttachmentMode::Stale,
|
||||
}) = &new_location_config.mode
|
||||
{
|
||||
if let Some(flush_timeout) = flush {
|
||||
match tokio::time::timeout(flush_timeout, tenant.flush_remote()).await {
|
||||
Ok(Err(e)) => {
|
||||
return Err(UpsertLocationError::Flush(e));
|
||||
}
|
||||
Ok(Ok(_)) => return Ok(Some(tenant)),
|
||||
Err(_) => {
|
||||
tracing::warn!(
|
||||
// phase of waiting for flush, before returning.
|
||||
if let Some(tenant) = modify_tenant {
|
||||
// Transition to AttachedStale means we may well hold a valid generation
|
||||
// still, and have been requested to go stale as part of a migration. If
|
||||
// the caller set `flush`, then flush to remote storage.
|
||||
if let LocationMode::Attached(AttachedLocationConfig {
|
||||
generation: _,
|
||||
attach_mode: AttachmentMode::Stale,
|
||||
}) = &new_location_config.mode
|
||||
{
|
||||
if let Some(flush_timeout) = flush {
|
||||
match tokio::time::timeout(flush_timeout, tenant.flush_remote()).await {
|
||||
Ok(Err(e)) => {
|
||||
return Err(e);
|
||||
}
|
||||
Ok(Ok(_)) => return Ok(()),
|
||||
Err(_) => {
|
||||
tracing::warn!(
|
||||
timeout_ms = flush_timeout.as_millis(),
|
||||
"Timed out waiting for flush to remote storage, proceeding anyway."
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return Ok(Some(tenant));
|
||||
}
|
||||
Some(FastPathModified::Secondary(_secondary_tenant)) => {
|
||||
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
|
||||
.await?;
|
||||
|
||||
return Ok(None);
|
||||
}
|
||||
None => {
|
||||
// Proceed with the general case procedure, where we will shutdown & remove any existing
|
||||
// slot contents and replace with a fresh one
|
||||
}
|
||||
};
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// General case for upserts to TenantsMap, excluding the case above: we will substitute an
|
||||
// InProgress value to the slot while we make whatever changes are required. The state for
|
||||
// the tenant is inaccessible to the outside world while we are doing this, but that is sensible:
|
||||
// the state is ill-defined while we're in transition. Transitions are async, but fast: we do
|
||||
// not do significant I/O, and shutdowns should be prompt via cancellation tokens.
|
||||
let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)
|
||||
.map_err(|e| match e {
|
||||
TenantSlotError::AlreadyExists(_, _) | TenantSlotError::NotFound(_) => {
|
||||
unreachable!("Called with mode Any")
|
||||
let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
|
||||
|
||||
if let Some(TenantSlot::Attached(tenant)) = slot_guard.get_old_value() {
|
||||
// The case where we keep a Tenant alive was covered above in the special case
|
||||
// for Attached->Attached transitions in the same generation. By this point,
|
||||
// if we see an attached tenant we know it will be discarded and should be
|
||||
// shut down.
|
||||
let (_guard, progress) = utils::completion::channel();
|
||||
|
||||
match tenant.get_attach_mode() {
|
||||
AttachmentMode::Single | AttachmentMode::Multi => {
|
||||
// Before we leave our state as the presumed holder of the latest generation,
|
||||
// flush any outstanding deletions to reduce the risk of leaking objects.
|
||||
self.resources.deletion_queue_client.flush_advisory()
|
||||
}
|
||||
TenantSlotError::InProgress => UpsertLocationError::InProgress,
|
||||
TenantSlotError::MapState(s) => UpsertLocationError::Unavailable(s),
|
||||
})?;
|
||||
|
||||
match slot_guard.get_old_value() {
|
||||
Some(TenantSlot::Attached(tenant)) => {
|
||||
// The case where we keep a Tenant alive was covered above in the special case
|
||||
// for Attached->Attached transitions in the same generation. By this point,
|
||||
// if we see an attached tenant we know it will be discarded and should be
|
||||
// shut down.
|
||||
let (_guard, progress) = utils::completion::channel();
|
||||
|
||||
match tenant.get_attach_mode() {
|
||||
AttachmentMode::Single | AttachmentMode::Multi => {
|
||||
// Before we leave our state as the presumed holder of the latest generation,
|
||||
// flush any outstanding deletions to reduce the risk of leaking objects.
|
||||
self.resources.deletion_queue_client.flush_advisory()
|
||||
}
|
||||
AttachmentMode::Stale => {
|
||||
// If we're stale there's no point trying to flush deletions
|
||||
}
|
||||
};
|
||||
|
||||
info!("Shutting down attached tenant");
|
||||
match tenant.shutdown(progress, false).await {
|
||||
Ok(()) => {}
|
||||
Err(barrier) => {
|
||||
info!("Shutdown already in progress, waiting for it to complete");
|
||||
barrier.wait().await;
|
||||
}
|
||||
AttachmentMode::Stale => {
|
||||
// If we're stale there's no point trying to flush deletions
|
||||
}
|
||||
};
|
||||
|
||||
info!("Shutting down attached tenant");
|
||||
match tenant.shutdown(progress, false).await {
|
||||
Ok(()) => {}
|
||||
Err(barrier) => {
|
||||
info!("Shutdown already in progress, waiting for it to complete");
|
||||
barrier.wait().await;
|
||||
}
|
||||
slot_guard.drop_old_value().expect("We just shut it down");
|
||||
}
|
||||
Some(TenantSlot::Secondary(state)) => {
|
||||
info!("Shutting down secondary tenant");
|
||||
state.shutdown().await;
|
||||
}
|
||||
Some(TenantSlot::InProgress(_)) => {
|
||||
// This should never happen: acquire_slot should error out
|
||||
// if the contents of a slot were InProgress.
|
||||
return Err(UpsertLocationError::Other(anyhow::anyhow!(
|
||||
"Acquired an InProgress slot, this is a bug."
|
||||
)));
|
||||
}
|
||||
None => {
|
||||
// Slot was vacant, nothing needs shutting down.
|
||||
}
|
||||
slot_guard.drop_old_value().expect("We just shut it down");
|
||||
}
|
||||
|
||||
let tenant_path = self.conf.tenant_path(&tenant_shard_id);
|
||||
let timelines_path = self.conf.timelines_path(&tenant_shard_id);
|
||||
|
||||
// Directory structure is the same for attached and secondary modes:
|
||||
// create it if it doesn't exist. Timeline load/creation expects the
|
||||
// timelines/ subdir to already exist.
|
||||
//
|
||||
// Does not need to be fsync'd because local storage is just a cache.
|
||||
tokio::fs::create_dir_all(&timelines_path)
|
||||
.await
|
||||
.with_context(|| format!("Creating {timelines_path}"))?;
|
||||
|
||||
// Before activating either secondary or attached mode, persist the
|
||||
// configuration, so that on restart we will re-attach (or re-start
|
||||
// secondary) on the tenant.
|
||||
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config).await?;
|
||||
|
||||
let new_slot = match &new_location_config.mode {
|
||||
LocationMode::Secondary(secondary_config) => {
|
||||
TenantSlot::Secondary(SecondaryTenant::new(tenant_shard_id, secondary_config))
|
||||
LocationMode::Secondary(_) => {
|
||||
// Directory doesn't need to be fsync'd because if we crash it can
|
||||
// safely be recreated next time this tenant location is configured.
|
||||
tokio::fs::create_dir_all(&tenant_path)
|
||||
.await
|
||||
.with_context(|| format!("Creating {tenant_path}"))?;
|
||||
|
||||
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
|
||||
.await
|
||||
.map_err(SetNewTenantConfigError::Persist)?;
|
||||
|
||||
TenantSlot::Secondary
|
||||
}
|
||||
LocationMode::Attached(_attach_config) => {
|
||||
let timelines_path = self.conf.timelines_path(&tenant_shard_id);
|
||||
|
||||
// Directory doesn't need to be fsync'd because we do not depend on
|
||||
// it to exist after crashes: it may be recreated when tenant is
|
||||
// re-attached, see https://github.com/neondatabase/neon/issues/5550
|
||||
tokio::fs::create_dir_all(&tenant_path)
|
||||
.await
|
||||
.with_context(|| format!("Creating {timelines_path}"))?;
|
||||
|
||||
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
|
||||
.await
|
||||
.map_err(SetNewTenantConfigError::Persist)?;
|
||||
|
||||
let shard_identity = new_location_config.shard;
|
||||
let tenant = tenant_spawn(
|
||||
self.conf,
|
||||
@@ -1052,7 +1001,7 @@ impl TenantManager {
|
||||
shard_identity,
|
||||
None,
|
||||
self.tenants,
|
||||
spawn_mode,
|
||||
SpawnMode::Normal,
|
||||
ctx,
|
||||
)?;
|
||||
|
||||
@@ -1060,20 +1009,9 @@ impl TenantManager {
|
||||
}
|
||||
};
|
||||
|
||||
let attached_tenant = if let TenantSlot::Attached(tenant) = &new_slot {
|
||||
Some(tenant.clone())
|
||||
} else {
|
||||
None
|
||||
};
|
||||
slot_guard.upsert(new_slot)?;
|
||||
|
||||
slot_guard.upsert(new_slot).map_err(|e| match e {
|
||||
TenantSlotUpsertError::InternalError(e) => {
|
||||
UpsertLocationError::Other(anyhow::anyhow!(e))
|
||||
}
|
||||
TenantSlotUpsertError::MapState(e) => UpsertLocationError::Unavailable(e),
|
||||
})?;
|
||||
|
||||
Ok(attached_tenant)
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Resetting a tenant is equivalent to detaching it, then attaching it again with the same
|
||||
@@ -1164,95 +1102,6 @@ impl TenantManager {
|
||||
.collect(),
|
||||
}
|
||||
}
|
||||
// Do some synchronous work for all tenant slots in Secondary state. The provided
|
||||
// callback should be small and fast, as it will be called inside the global
|
||||
// TenantsMap lock.
|
||||
pub(crate) fn foreach_secondary_tenants<F>(&self, mut func: F)
|
||||
where
|
||||
// TODO: let the callback return a hint to drop out of the loop early
|
||||
F: FnMut(&TenantShardId, &Arc<SecondaryTenant>),
|
||||
{
|
||||
let locked = self.tenants.read().unwrap();
|
||||
|
||||
let map = match &*locked {
|
||||
TenantsMap::Initializing | TenantsMap::ShuttingDown(_) => return,
|
||||
TenantsMap::Open(m) => m,
|
||||
};
|
||||
|
||||
for (tenant_id, slot) in map {
|
||||
if let TenantSlot::Secondary(state) = slot {
|
||||
// Only expose secondary tenants that are not currently shutting down
|
||||
if !state.cancel.is_cancelled() {
|
||||
func(tenant_id, state)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn delete_tenant(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
activation_timeout: Duration,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
// We acquire a SlotGuard during this function to protect against concurrent
|
||||
// changes while the ::prepare phase of DeleteTenantFlow executes, but then
|
||||
// have to return the Tenant to the map while the background deletion runs.
|
||||
//
|
||||
// TODO: refactor deletion to happen outside the lifetime of a Tenant.
|
||||
// Currently, deletion requires a reference to the tenants map in order to
|
||||
// keep the Tenant in the map until deletion is complete, and then remove
|
||||
// it at the end.
|
||||
//
|
||||
// See https://github.com/neondatabase/neon/issues/5080
|
||||
|
||||
let slot_guard =
|
||||
tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;
|
||||
|
||||
// unwrap is safe because we used MustExist mode when acquiring
|
||||
let tenant = match slot_guard.get_old_value().as_ref().unwrap() {
|
||||
TenantSlot::Attached(tenant) => tenant.clone(),
|
||||
_ => {
|
||||
// Express "not attached" as equivalent to "not found"
|
||||
return Err(DeleteTenantError::NotAttached);
|
||||
}
|
||||
};
|
||||
|
||||
match tenant.current_state() {
|
||||
TenantState::Broken { .. } | TenantState::Stopping { .. } => {
|
||||
// If a tenant is broken or stopping, DeleteTenantFlow can
|
||||
// handle it: broken tenants proceed to delete, stopping tenants
|
||||
// are checked for deletion already in progress.
|
||||
}
|
||||
_ => {
|
||||
tenant
|
||||
.wait_to_become_active(activation_timeout)
|
||||
.await
|
||||
.map_err(|e| match e {
|
||||
GetActiveTenantError::WillNotBecomeActive(_) => {
|
||||
DeleteTenantError::InvalidState(tenant.current_state())
|
||||
}
|
||||
GetActiveTenantError::Cancelled => DeleteTenantError::Cancelled,
|
||||
GetActiveTenantError::NotFound(_) => DeleteTenantError::NotAttached,
|
||||
GetActiveTenantError::WaitForActiveTimeout {
|
||||
latest_state: _latest_state,
|
||||
wait_time: _wait_time,
|
||||
} => DeleteTenantError::InvalidState(tenant.current_state()),
|
||||
})?;
|
||||
}
|
||||
}
|
||||
|
||||
let result = DeleteTenantFlow::run(
|
||||
self.conf,
|
||||
self.resources.remote_storage.clone(),
|
||||
&TENANTS,
|
||||
tenant,
|
||||
)
|
||||
.await;
|
||||
|
||||
// The Tenant goes back into the map in Stopping state; it will eventually be removed by DeleteTenantFlow
|
||||
slot_guard.revert();
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
@@ -1302,7 +1151,7 @@ pub(crate) fn get_tenant(
|
||||
Some(TenantSlot::InProgress(_)) => {
|
||||
Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
|
||||
}
|
||||
None | Some(TenantSlot::Secondary(_)) => {
|
||||
None | Some(TenantSlot::Secondary) => {
|
||||
Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
|
||||
}
|
||||
}
|
||||
@@ -1354,11 +1203,9 @@ pub(crate) async fn get_active_tenant_with_timeout(
|
||||
let locked = TENANTS.read().unwrap();
|
||||
|
||||
// Resolve TenantId to TenantShardId
|
||||
let tenant_shard_id = locked
|
||||
.resolve_attached_shard(&tenant_id, shard_selector)
|
||||
.ok_or(GetActiveTenantError::NotFound(GetTenantError::NotFound(
|
||||
tenant_id,
|
||||
)))?;
|
||||
let tenant_shard_id = locked.resolve_shard(&tenant_id, shard_selector).ok_or(
|
||||
GetActiveTenantError::NotFound(GetTenantError::NotFound(tenant_id)),
|
||||
)?;
|
||||
|
||||
let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
|
||||
.map_err(GetTenantError::MapState)?;
|
||||
@@ -1375,7 +1222,7 @@ pub(crate) async fn get_active_tenant_with_timeout(
|
||||
}
|
||||
}
|
||||
}
|
||||
Some(TenantSlot::Secondary(_)) => {
|
||||
Some(TenantSlot::Secondary) => {
|
||||
return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive(
|
||||
tenant_id,
|
||||
)))
|
||||
@@ -1432,6 +1279,41 @@ pub(crate) async fn get_active_tenant_with_timeout(
|
||||
Ok(tenant)
|
||||
}
|
||||
|
||||
pub(crate) async fn delete_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
tenant_shard_id: TenantShardId,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
// We acquire a SlotGuard during this function to protect against concurrent
|
||||
// changes while the ::prepare phase of DeleteTenantFlow executes, but then
|
||||
// have to return the Tenant to the map while the background deletion runs.
|
||||
//
|
||||
// TODO: refactor deletion to happen outside the lifetime of a Tenant.
|
||||
// Currently, deletion requires a reference to the tenants map in order to
|
||||
// keep the Tenant in the map until deletion is complete, and then remove
|
||||
// it at the end.
|
||||
//
|
||||
// See https://github.com/neondatabase/neon/issues/5080
|
||||
|
||||
// TODO(sharding): make delete API sharding-aware
|
||||
let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;
|
||||
|
||||
// unwrap is safe because we used MustExist mode when acquiring
|
||||
let tenant = match slot_guard.get_old_value().as_ref().unwrap() {
|
||||
TenantSlot::Attached(tenant) => tenant.clone(),
|
||||
_ => {
|
||||
// Express "not attached" as equivalent to "not found"
|
||||
return Err(DeleteTenantError::NotAttached);
|
||||
}
|
||||
};
|
||||
|
||||
let result = DeleteTenantFlow::run(conf, remote_storage, &TENANTS, tenant).await;
|
||||
|
||||
// The Tenant goes back into the map in Stopping state; it will eventually be removed by DeleteTenantFlow
|
||||
slot_guard.revert();
|
||||
result
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub(crate) enum DeleteTimelineError {
|
||||
#[error("Tenant {0}")]
|
||||
@@ -1639,12 +1521,61 @@ pub(crate) async fn list_tenants() -> Result<Vec<(TenantShardId, TenantState)>,
|
||||
Ok(m.iter()
|
||||
.filter_map(|(id, tenant)| match tenant {
|
||||
TenantSlot::Attached(tenant) => Some((*id, tenant.current_state())),
|
||||
TenantSlot::Secondary(_) => None,
|
||||
TenantSlot::Secondary => None,
|
||||
TenantSlot::InProgress(_) => None,
|
||||
})
|
||||
.collect())
|
||||
}
|
||||
|
||||
/// Execute Attach mgmt API command.
|
||||
///
|
||||
/// Downloading all the tenant data is performed in the background, this merely
|
||||
/// spawns the background task and returns quickly.
|
||||
pub(crate) async fn attach_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
generation: Generation,
|
||||
tenant_conf: TenantConfOpt,
|
||||
resources: TenantSharedResources,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), TenantMapInsertError> {
|
||||
// This is a legacy API (replaced by `/location_conf`). It does not support sharding
|
||||
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
|
||||
|
||||
let slot_guard =
|
||||
tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
|
||||
let location_conf = LocationConf::attached_single(tenant_conf, generation);
|
||||
let tenant_dir = create_tenant_files(conf, &location_conf, &tenant_shard_id).await?;
|
||||
// TODO: tenant directory remains on disk if we bail out from here on.
|
||||
// See https://github.com/neondatabase/neon/issues/4233
|
||||
|
||||
let shard_identity = location_conf.shard;
|
||||
let attached_tenant = tenant_spawn(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
&tenant_dir,
|
||||
resources,
|
||||
AttachedTenantConf::try_from(location_conf)?,
|
||||
shard_identity,
|
||||
None,
|
||||
&TENANTS,
|
||||
SpawnMode::Normal,
|
||||
ctx,
|
||||
)?;
|
||||
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
|
||||
// See https://github.com/neondatabase/neon/issues/4233
|
||||
|
||||
let attached_tenant_id = attached_tenant.tenant_id();
|
||||
if tenant_id != attached_tenant_id {
|
||||
return Err(TenantMapInsertError::Other(anyhow::anyhow!(
|
||||
"loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {attached_tenant_id})",
|
||||
)));
|
||||
}
|
||||
|
||||
slot_guard.upsert(TenantSlot::Attached(attached_tenant))?;
|
||||
Ok(())
|
||||
}
|

#[derive(Debug, thiserror::Error)]
pub(crate) enum TenantMapInsertError {
#[error(transparent)]
@@ -1658,7 +1589,7 @@ pub(crate) enum TenantMapInsertError {
/// Superset of TenantMapError: issues that can occur when acquiring a slot
/// for a particular tenant ID.
#[derive(Debug, thiserror::Error)]
pub(crate) enum TenantSlotError {
pub enum TenantSlotError {
/// When acquiring a slot with the expectation that the tenant already exists.
#[error("Tenant {0} not found")]
NotFound(TenantShardId),
@@ -1667,6 +1598,9 @@ pub(crate) enum TenantSlotError {
#[error("tenant {0} already exists, state: {1:?}")]
AlreadyExists(TenantShardId, TenantState),

#[error("tenant {0} already exists but is not attached")]
Conflict(TenantShardId),

// Tried to read a slot that is currently being mutated by another administrative
// operation.
#[error("tenant has a state change in progress, try again later")]
@@ -1844,7 +1778,11 @@ impl SlotGuard {
fn old_value_is_shutdown(&self) -> bool {
match self.old_value.as_ref() {
Some(TenantSlot::Attached(tenant)) => tenant.gate.close_complete(),
Some(TenantSlot::Secondary(secondary_tenant)) => secondary_tenant.gate.close_complete(),
Some(TenantSlot::Secondary) => {
// TODO: when adding secondary mode tenants, this will check for shutdown
// in the same way that we do for `Tenant` above
true
}
Some(TenantSlot::InProgress(_)) => {
// A SlotGuard cannot be constructed for a slot that was already InProgress
unreachable!()
@@ -2054,19 +1992,26 @@ where
|
||||
let mut slot_guard =
|
||||
tenant_map_acquire_slot_impl(&tenant_shard_id, tenants, TenantSlotAcquireMode::MustExist)?;
|
||||
|
||||
// allow pageserver shutdown to await for our completion
|
||||
let (_guard, progress) = completion::channel();
|
||||
|
||||
// The SlotGuard allows us to manipulate the Tenant object without fear of some
|
||||
// concurrent API request doing something else for the same tenant ID.
|
||||
let attached_tenant = match slot_guard.get_old_value() {
|
||||
Some(TenantSlot::Attached(tenant)) => {
|
||||
Some(TenantSlot::Attached(t)) => Some(t),
|
||||
_ => None,
|
||||
};
|
||||
|
||||
// allow pageserver shutdown to await for our completion
|
||||
let (_guard, progress) = completion::channel();
|
||||
|
||||
// If the tenant was attached, shut it down gracefully. For secondary
|
||||
// locations this part is not necessary
|
||||
match &attached_tenant {
|
||||
Some(attached_tenant) => {
|
||||
// whenever we remove a tenant from memory, we don't want to flush and wait for upload
|
||||
let freeze_and_flush = false;
|
||||
|
||||
// shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
|
||||
// that we can continue safely to cleanup.
|
||||
match tenant.shutdown(progress, freeze_and_flush).await {
|
||||
match attached_tenant.shutdown(progress, freeze_and_flush).await {
|
||||
Ok(()) => {}
|
||||
Err(_other) => {
|
||||
// if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
|
||||
@@ -2075,19 +2020,11 @@ where
|
||||
return Err(TenantStateError::IsStopping(tenant_shard_id.tenant_id));
|
||||
}
|
||||
}
|
||||
Some(tenant)
|
||||
}
|
||||
Some(TenantSlot::Secondary(secondary_state)) => {
|
||||
tracing::info!("Shutting down in secondary mode");
|
||||
secondary_state.shutdown().await;
|
||||
None
|
||||
None => {
|
||||
// Nothing to wait on when not attached, proceed.
|
||||
}
|
||||
Some(TenantSlot::InProgress(_)) => {
|
||||
// Acquiring a slot guarantees its old value was not InProgress
|
||||
unreachable!();
|
||||
}
|
||||
None => None,
|
||||
};
|
||||
}
|
||||
|
||||
match tenant_cleanup
|
||||
.await
|
||||
|
||||
@@ -229,7 +229,6 @@ use crate::{
tenant::upload_queue::{
UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask,
},
TENANT_HEATMAP_BASENAME,
};

use utils::id::{TenantId, TimelineId};
@@ -819,25 +818,8 @@ impl RemoteTimelineClient {
fn schedule_deletion_of_unlinked0(
self: &Arc<Self>,
upload_queue: &mut UploadQueueInitialized,
mut with_metadata: Vec<(LayerFileName, LayerFileMetadata)>,
with_metadata: Vec<(LayerFileName, LayerFileMetadata)>,
) {
// Filter out any layers which were not created by this tenant shard. These are
// layers that originate from some ancestor shard after a split, and may still
// be referenced by other shards. We are free to delete them locally and remove
// them from our index (and would have already done so when we reach this point
// in the code), but we may not delete them remotely.
with_metadata.retain(|(name, meta)| {
let retain = meta.shard.shard_number == self.tenant_shard_id.shard_number
&& meta.shard.shard_count == self.tenant_shard_id.shard_count;
if !retain {
tracing::debug!(
"Skipping deletion of ancestor-shard layer {name}, from shard {}",
meta.shard
);
}
retain
});

for (name, meta) in &with_metadata {
info!(
"scheduling deletion of layer {}{} (shard {})",
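// Illustrative sketch (standalone, with stand-in types): the ownership filter used above.
// Only layers whose recorded shard identity matches *this* shard may be deleted remotely;
// layers inherited from an ancestor shard after a split are skipped. `ShardIdent` and
// `Layer` are hypothetical simplifications of the real shard/layer metadata.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct ShardIdent {
    shard_number: u8,
    shard_count: u8,
}

#[derive(Debug)]
struct Layer {
    name: &'static str,
    shard: ShardIdent,
}

fn retain_own_layers(my_shard: ShardIdent, mut layers: Vec<Layer>) -> Vec<Layer> {
    layers.retain(|layer| {
        let retain = layer.shard == my_shard;
        if !retain {
            // These may still be referenced by sibling shards: leave them in remote storage.
            println!("skipping ancestor-shard layer {} (shard {:?})", layer.name, layer.shard);
        }
        retain
    });
    layers
}

fn main() {
    let me = ShardIdent { shard_number: 1, shard_count: 4 };
    let layers = vec![
        Layer { name: "own-layer", shard: me },
        Layer { name: "ancestor-layer", shard: ShardIdent { shard_number: 0, shard_count: 2 } },
    ];
    let deletable = retain_own_layers(me, layers);
    assert_eq!(deletable.len(), 1);
}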
@@ -1742,11 +1724,11 @@ pub fn remote_index_path(
|
||||
.expect("Failed to construct path")
|
||||
}
|
||||
|
||||
pub const HEATMAP_BASENAME: &str = "heatmap-v1.json";
|
||||
|
||||
pub(crate) fn remote_heatmap_path(tenant_shard_id: &TenantShardId) -> RemotePath {
|
||||
RemotePath::from_string(&format!(
|
||||
"tenants/{tenant_shard_id}/{TENANT_HEATMAP_BASENAME}"
|
||||
))
|
||||
.expect("Failed to construct path")
|
||||
RemotePath::from_string(&format!("tenants/{tenant_shard_id}/{HEATMAP_BASENAME}"))
|
||||
.expect("Failed to construct path")
|
||||
}
|
||||
|
||||
/// Given the key of an index, parse out the generation part of the name
|
||||
@@ -1903,7 +1885,7 @@ mod tests {
|
||||
fn span(&self) -> tracing::Span {
|
||||
tracing::info_span!(
|
||||
"test",
|
||||
tenant_id = %self.harness.tenant_shard_id.tenant_id,
|
||||
tenant_id = %self.harness.tenant_id,
|
||||
timeline_id = %TIMELINE_ID
|
||||
)
|
||||
}
|
||||
@@ -2210,6 +2192,15 @@ mod tests {
|
||||
|
||||
let index_part_bytes = serde_json::to_vec(&example_index_part).unwrap();
|
||||
|
||||
let timeline_path = test_state.harness.timeline_path(&TIMELINE_ID);
|
||||
let remote_timeline_dir = test_state.harness.remote_fs_dir.join(
|
||||
timeline_path
|
||||
.strip_prefix(&test_state.harness.conf.workdir)
|
||||
.unwrap(),
|
||||
);
|
||||
|
||||
std::fs::create_dir_all(remote_timeline_dir).expect("creating test dir should work");
|
||||
|
||||
let index_path = test_state.harness.remote_fs_dir.join(
|
||||
remote_index_path(
|
||||
&test_state.harness.tenant_shard_id,
|
||||
@@ -2218,10 +2209,6 @@ mod tests {
|
||||
)
|
||||
.get_path(),
|
||||
);
|
||||
|
||||
std::fs::create_dir_all(index_path.parent().unwrap())
|
||||
.expect("creating test dir should work");
|
||||
|
||||
eprintln!("Writing {index_path}");
|
||||
std::fs::write(&index_path, index_part_bytes).unwrap();
|
||||
example_index_part
|
||||
|
||||
@@ -1,48 +1,24 @@
|
||||
mod downloader;
|
||||
pub mod heatmap;
|
||||
mod heatmap_uploader;
|
||||
mod scheduler;
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
|
||||
|
||||
use self::{
|
||||
downloader::{downloader_task, SecondaryDetail},
|
||||
heatmap_uploader::heatmap_uploader_task,
|
||||
};
|
||||
use self::heatmap_uploader::heatmap_uploader_task;
|
||||
|
||||
use super::{config::SecondaryLocationConfig, mgr::TenantManager};
|
||||
use super::mgr::TenantManager;
|
||||
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::{completion::Barrier, sync::gate::Gate};
|
||||
use utils::completion::Barrier;
|
||||
|
||||
enum DownloadCommand {
|
||||
Download(TenantShardId),
|
||||
}
|
||||
enum UploadCommand {
|
||||
Upload(TenantShardId),
|
||||
}
|
||||
|
||||
impl UploadCommand {
|
||||
fn get_tenant_shard_id(&self) -> &TenantShardId {
|
||||
match self {
|
||||
Self::Upload(id) => id,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl DownloadCommand {
|
||||
fn get_tenant_shard_id(&self) -> &TenantShardId {
|
||||
match self {
|
||||
Self::Download(id) => id,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct CommandRequest<T> {
|
||||
payload: T,
|
||||
response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
|
||||
@@ -52,73 +28,12 @@ struct CommandResponse {
|
||||
result: anyhow::Result<()>,
|
||||
}
|
||||
|
||||
// Whereas [`Tenant`] represents an attached tenant, this type represents the work
|
||||
// we do for secondary tenant locations: where we are not serving clients or
|
||||
// ingesting WAL, but we are maintaining a warm cache of layer files.
|
||||
//
|
||||
// This type is all about the _download_ path for secondary mode. The upload path
|
||||
// runs separately (see [`heatmap_uploader`]) while a regular attached `Tenant` exists.
|
||||
//
|
||||
// This structure coordinates TenantManager and SecondaryDownloader,
|
||||
// so that the downloader can indicate which tenants it is currently
|
||||
// operating on, and the manager can indicate when a particular
|
||||
// secondary tenant should cancel any work in flight.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct SecondaryTenant {
|
||||
/// Carrying a tenant shard ID simplifies callers such as the downloader
|
||||
/// which need to organize many of these objects by ID.
|
||||
tenant_shard_id: TenantShardId,
|
||||
|
||||
/// Cancellation token indicates to SecondaryDownloader that it should stop doing
|
||||
/// any work for this tenant at the next opportunity.
|
||||
pub(crate) cancel: CancellationToken,
|
||||
|
||||
pub(crate) gate: Gate,
|
||||
|
||||
detail: std::sync::Mutex<SecondaryDetail>,
|
||||
}
|
||||
|
||||
impl SecondaryTenant {
|
||||
pub(crate) fn new(
|
||||
tenant_shard_id: TenantShardId,
|
||||
config: &SecondaryLocationConfig,
|
||||
) -> Arc<Self> {
|
||||
Arc::new(Self {
|
||||
tenant_shard_id,
|
||||
// todo: shall we make this a descendant of the
|
||||
// main cancellation token, or is it sufficient that
|
||||
// on shutdown we walk the tenants and fire their
|
||||
// individual cancellations?
|
||||
cancel: CancellationToken::new(),
|
||||
gate: Gate::new(format!("SecondaryTenant {tenant_shard_id}")),
|
||||
|
||||
detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())),
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) async fn shutdown(&self) {
|
||||
self.cancel.cancel();
|
||||
|
||||
// Wait for any secondary downloader work to complete
|
||||
self.gate.close().await;
|
||||
}
|
||||
|
||||
pub(crate) fn set_config(&self, config: &SecondaryLocationConfig) {
|
||||
self.detail.lock().unwrap().config = config.clone();
|
||||
}
|
||||
|
||||
fn get_tenant_shard_id(&self) -> &TenantShardId {
|
||||
&self.tenant_shard_id
|
||||
}
|
||||
}
|
||||
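// Illustrative sketch (standalone, not from this diff): the cancel-then-drain shutdown shape
// used by SecondaryTenant::shutdown above. A tokio RwLock stands in for utils::sync::gate::Gate:
// workers hold read guards while touching local storage, and shutdown acquires the write half,
// which only succeeds once every in-flight guard has been dropped. All names are stand-ins.
use std::sync::Arc;
use std::time::Duration;
use tokio::sync::RwLock;
use tokio_util::sync::CancellationToken;

#[tokio::main]
async fn main() {
    let cancel = CancellationToken::new();
    let gate = Arc::new(RwLock::new(()));

    // A worker checks for cancellation between units of work, holding the "gate" while busy.
    let worker = {
        let cancel = cancel.clone();
        let gate = gate.clone();
        tokio::spawn(async move {
            while !cancel.is_cancelled() {
                let _guard = gate.read().await; // like SecondaryTenant::gate.enter()
                tokio::time::sleep(Duration::from_millis(10)).await; // stand-in for download work
            }
        })
    };

    // Shutdown: signal cancellation first, then wait for in-flight work to drain.
    tokio::time::sleep(Duration::from_millis(50)).await;
    cancel.cancel();
    {
        // like SecondaryTenant::gate.close().await
        let _closed = gate.write().await;
    }
    worker.await.unwrap();
    println!("all secondary work drained");
}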
|
||||
/// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads,
|
||||
/// and heatmap uploads. This is not a hot data path: it's primarily a hook for tests,
|
||||
/// where we want to immediately upload/download for a particular tenant. In normal operation
|
||||
/// uploads & downloads are autonomous and not driven by this interface.
|
||||
pub struct SecondaryController {
|
||||
upload_req_tx: tokio::sync::mpsc::Sender<CommandRequest<UploadCommand>>,
|
||||
download_req_tx: tokio::sync::mpsc::Sender<CommandRequest<DownloadCommand>>,
|
||||
}
|
||||
|
||||
impl SecondaryController {
|
||||
@@ -148,13 +63,6 @@ impl SecondaryController {
|
||||
self.dispatch(&self.upload_req_tx, UploadCommand::Upload(tenant_shard_id))
|
||||
.await
|
||||
}
|
||||
pub async fn download_tenant(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
|
||||
self.dispatch(
|
||||
&self.download_req_tx,
|
||||
DownloadCommand::Download(tenant_shard_id),
|
||||
)
|
||||
.await
|
||||
}
|
||||
}
|
||||
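// Illustrative sketch (standalone): the pseudo-rpc shape behind SecondaryController above.
// A command is sent over an mpsc channel together with a oneshot sender, and the background
// task reports the outcome back on that oneshot. All names here are simplified stand-ins.
use tokio::sync::{mpsc, oneshot};

#[derive(Debug)]
enum Command {
    Upload(String), // tenant shard id as a plain string in this toy
}

struct Request {
    payload: Command,
    response_tx: oneshot::Sender<anyhow::Result<()>>,
}

async fn controller_dispatch(tx: &mpsc::Sender<Request>, payload: Command) -> anyhow::Result<()> {
    let (response_tx, response_rx) = oneshot::channel();
    tx.send(Request { payload, response_tx })
        .await
        .map_err(|_| anyhow::anyhow!("background task shut down"))?;
    response_rx
        .await
        .map_err(|_| anyhow::anyhow!("background task dropped the request"))?
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let (tx, mut rx) = mpsc::channel::<Request>(16);

    // Stand-in for the uploader/downloader task that owns the command queue.
    tokio::spawn(async move {
        while let Some(req) = rx.recv().await {
            println!("handling {:?}", req.payload);
            let _ = req.response_tx.send(Ok(()));
        }
    });

    controller_dispatch(&tx, Command::Upload("tenant-a".to_string())).await
}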
|
||||
pub fn spawn_tasks(
|
||||
@@ -163,37 +71,9 @@ pub fn spawn_tasks(
|
||||
background_jobs_can_start: Barrier,
|
||||
cancel: CancellationToken,
|
||||
) -> SecondaryController {
|
||||
let mgr_clone = tenant_manager.clone();
|
||||
let storage_clone = remote_storage.clone();
|
||||
let cancel_clone = cancel.clone();
|
||||
let bg_jobs_clone = background_jobs_can_start.clone();
|
||||
|
||||
let (download_req_tx, download_req_rx) =
|
||||
tokio::sync::mpsc::channel::<CommandRequest<DownloadCommand>>(16);
|
||||
let (upload_req_tx, upload_req_rx) =
|
||||
tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
|
||||
|
||||
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::SecondaryDownloads,
|
||||
None,
|
||||
None,
|
||||
"secondary tenant downloads",
|
||||
false,
|
||||
async move {
|
||||
downloader_task(
|
||||
mgr_clone,
|
||||
storage_clone,
|
||||
download_req_rx,
|
||||
bg_jobs_clone,
|
||||
cancel_clone,
|
||||
)
|
||||
.await;
|
||||
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
|
||||
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::SecondaryUploads,
|
||||
@@ -209,26 +89,16 @@ pub fn spawn_tasks(
|
||||
background_jobs_can_start,
|
||||
cancel,
|
||||
)
|
||||
.await;
|
||||
|
||||
Ok(())
|
||||
.await
|
||||
},
|
||||
);
|
||||
|
||||
SecondaryController {
|
||||
download_req_tx,
|
||||
upload_req_tx,
|
||||
}
|
||||
SecondaryController { upload_req_tx }
|
||||
}
|
||||
|
||||
/// For running with remote storage disabled: a SecondaryController that is connected to nothing.
|
||||
pub fn null_controller() -> SecondaryController {
|
||||
let (download_req_tx, _download_req_rx) =
|
||||
tokio::sync::mpsc::channel::<CommandRequest<DownloadCommand>>(16);
|
||||
let (upload_req_tx, _upload_req_rx) =
|
||||
tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
|
||||
SecondaryController {
|
||||
upload_req_tx,
|
||||
download_req_tx,
|
||||
}
|
||||
SecondaryController { upload_req_tx }
|
||||
}
|
||||
|
||||
@@ -1,800 +0,0 @@
|
||||
use std::{
|
||||
collections::{HashMap, HashSet},
|
||||
pin::Pin,
|
||||
str::FromStr,
|
||||
sync::Arc,
|
||||
time::{Duration, Instant, SystemTime},
|
||||
};
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
metrics::SECONDARY_MODE,
|
||||
tenant::{
|
||||
config::SecondaryLocationConfig,
|
||||
debug_assert_current_span_has_tenant_and_timeline_id,
|
||||
remote_timeline_client::{
|
||||
index::LayerFileMetadata, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES,
|
||||
},
|
||||
span::debug_assert_current_span_has_tenant_id,
|
||||
storage_layer::LayerFileName,
|
||||
tasks::{warn_when_period_overrun, BackgroundLoopKind},
|
||||
},
|
||||
virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile},
|
||||
METADATA_FILE_NAME, TEMP_FILE_SUFFIX,
|
||||
};
|
||||
|
||||
use super::{
|
||||
heatmap::HeatMapLayer,
|
||||
scheduler::{self, Completion, JobGenerator, SchedulingResult, TenantBackgroundJobs},
|
||||
SecondaryTenant,
|
||||
};
|
||||
|
||||
use crate::tenant::{
|
||||
mgr::TenantManager,
|
||||
remote_timeline_client::{download::download_layer_file, remote_heatmap_path},
|
||||
};
|
||||
|
||||
use chrono::format::{DelayedFormat, StrftimeItems};
|
||||
use futures::Future;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use rand::Rng;
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage};
|
||||
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{info_span, instrument, Instrument};
|
||||
use utils::{
|
||||
backoff, completion::Barrier, crashsafe::path_with_suffix_extension, fs_ext, id::TimelineId,
|
||||
};
|
||||
|
||||
use super::{
|
||||
heatmap::{HeatMapTenant, HeatMapTimeline},
|
||||
CommandRequest, DownloadCommand,
|
||||
};
|
||||
|
||||
/// For each tenant, how long must have passed since the last download_tenant call before
|
||||
/// calling it again. This is approximately the time by which local data is allowed
|
||||
/// to fall behind remote data.
|
||||
///
|
||||
/// TODO: this should just be a default, and the actual period should be controlled
/// via the heatmap itself
/// `<https://github.com/neondatabase/neon/issues/6200>`
const DOWNLOAD_FRESHEN_INTERVAL: Duration = Duration::from_millis(60000);
|
||||
|
||||
pub(super) async fn downloader_task(
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
command_queue: tokio::sync::mpsc::Receiver<CommandRequest<DownloadCommand>>,
|
||||
background_jobs_can_start: Barrier,
|
||||
cancel: CancellationToken,
|
||||
) {
|
||||
let concurrency = tenant_manager.get_conf().secondary_download_concurrency;
|
||||
|
||||
let generator = SecondaryDownloader {
|
||||
tenant_manager,
|
||||
remote_storage,
|
||||
};
|
||||
let mut scheduler = Scheduler::new(generator, concurrency);
|
||||
|
||||
scheduler
|
||||
.run(command_queue, background_jobs_can_start, cancel)
|
||||
.instrument(info_span!("secondary_downloads"))
|
||||
.await
|
||||
}
|
||||
|
||||
struct SecondaryDownloader {
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub(super) struct OnDiskState {
|
||||
metadata: LayerFileMetadata,
|
||||
access_time: SystemTime,
|
||||
}
|
||||
|
||||
impl OnDiskState {
|
||||
fn new(
_conf: &'static PageServerConf,
_tenant_shard_id: &TenantShardId,
_timeline_id: &TimelineId,
_name: LayerFileName,
metadata: LayerFileMetadata,
access_time: SystemTime,
) -> Self {
Self {
metadata,
access_time,
}
}
}
|
||||
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub(super) struct SecondaryDetailTimeline {
|
||||
pub(super) on_disk_layers: HashMap<LayerFileName, OnDiskState>,
|
||||
|
||||
/// We remember when layers were evicted, to prevent re-downloading them.
|
||||
pub(super) evicted_at: HashMap<LayerFileName, SystemTime>,
|
||||
}
|
||||
|
||||
/// This state is written by the secondary downloader, it is opaque
|
||||
/// to TenantManager
|
||||
#[derive(Debug)]
|
||||
pub(super) struct SecondaryDetail {
|
||||
pub(super) config: SecondaryLocationConfig,
|
||||
|
||||
last_download: Option<Instant>,
|
||||
next_download: Option<Instant>,
|
||||
pub(super) timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
|
||||
}
|
||||
|
||||
/// Helper for logging SystemTime
|
||||
fn strftime(t: &'_ SystemTime) -> DelayedFormat<StrftimeItems<'_>> {
|
||||
let datetime: chrono::DateTime<chrono::Utc> = (*t).into();
|
||||
datetime.format("%d/%m/%Y %T")
|
||||
}
|
||||
|
||||
impl SecondaryDetail {
|
||||
pub(super) fn new(config: SecondaryLocationConfig) -> Self {
|
||||
Self {
|
||||
config,
|
||||
last_download: None,
|
||||
next_download: None,
|
||||
timelines: HashMap::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct PendingDownload {
|
||||
secondary_state: Arc<SecondaryTenant>,
|
||||
last_download: Option<Instant>,
|
||||
target_time: Option<Instant>,
|
||||
period: Option<Duration>,
|
||||
}
|
||||
|
||||
impl scheduler::PendingJob for PendingDownload {
|
||||
fn get_tenant_shard_id(&self) -> &TenantShardId {
|
||||
self.secondary_state.get_tenant_shard_id()
|
||||
}
|
||||
}
|
||||
|
||||
struct RunningDownload {
|
||||
barrier: Barrier,
|
||||
}
|
||||
|
||||
impl scheduler::RunningJob for RunningDownload {
|
||||
fn get_barrier(&self) -> Barrier {
|
||||
self.barrier.clone()
|
||||
}
|
||||
}
|
||||
|
||||
struct CompleteDownload {
|
||||
secondary_state: Arc<SecondaryTenant>,
|
||||
completed_at: Instant,
|
||||
}
|
||||
|
||||
impl scheduler::Completion for CompleteDownload {
|
||||
fn get_tenant_shard_id(&self) -> &TenantShardId {
|
||||
self.secondary_state.get_tenant_shard_id()
|
||||
}
|
||||
}
|
||||
|
||||
type Scheduler = TenantBackgroundJobs<
|
||||
SecondaryDownloader,
|
||||
PendingDownload,
|
||||
RunningDownload,
|
||||
CompleteDownload,
|
||||
DownloadCommand,
|
||||
>;
|
||||
|
||||
impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCommand>
|
||||
for SecondaryDownloader
|
||||
{
|
||||
#[instrument(skip_all, fields(tenant_id=%completion.get_tenant_shard_id().tenant_id, shard_id=%completion.get_tenant_shard_id().shard_slug()))]
|
||||
fn on_completion(&mut self, completion: CompleteDownload) {
|
||||
let CompleteDownload {
|
||||
secondary_state,
|
||||
completed_at: _completed_at,
|
||||
} = completion;
|
||||
|
||||
tracing::debug!("Secondary tenant download completed");
|
||||
|
||||
// Update freshened_at even if there was an error: we don't want errored tenants to implicitly
|
||||
// take priority to run again.
|
||||
let mut detail = secondary_state.detail.lock().unwrap();
|
||||
detail.next_download = Some(Instant::now() + DOWNLOAD_FRESHEN_INTERVAL);
|
||||
}
|
||||
|
||||
async fn schedule(&mut self) -> SchedulingResult<PendingDownload> {
|
||||
let mut result = SchedulingResult {
|
||||
jobs: Vec::new(),
|
||||
want_interval: None,
|
||||
};
|
||||
|
||||
// Step 1: identify some tenants that we may work on
|
||||
let mut tenants: Vec<Arc<SecondaryTenant>> = Vec::new();
|
||||
self.tenant_manager
|
||||
.foreach_secondary_tenants(|_id, secondary_state| {
|
||||
tenants.push(secondary_state.clone());
|
||||
});
|
||||
|
||||
// Step 2: filter out tenants which are not yet eligible to run
|
||||
let now = Instant::now();
|
||||
result.jobs = tenants
|
||||
.into_iter()
|
||||
.filter_map(|secondary_tenant| {
|
||||
let (last_download, next_download) = {
|
||||
let mut detail = secondary_tenant.detail.lock().unwrap();
|
||||
|
||||
if !detail.config.warm {
|
||||
// Downloads are disabled for this tenant
|
||||
detail.next_download = None;
|
||||
return None;
|
||||
}
|
||||
|
||||
if detail.next_download.is_none() {
|
||||
// Initialize with a jitter: this spreads initial downloads on startup
|
||||
// or mass-attach across our freshen interval.
|
||||
let jittered_period =
|
||||
rand::thread_rng().gen_range(Duration::ZERO..DOWNLOAD_FRESHEN_INTERVAL);
|
||||
detail.next_download = Some(now.checked_add(jittered_period).expect(
|
||||
"Using our constant, which is known to be small compared with clock range",
|
||||
));
|
||||
}
|
||||
(detail.last_download, detail.next_download.unwrap())
|
||||
};
|
||||
|
||||
if now > next_download {
|
||||
Some(PendingDownload {
|
||||
secondary_state: secondary_tenant,
|
||||
last_download,
|
||||
target_time: Some(next_download),
|
||||
period: Some(DOWNLOAD_FRESHEN_INTERVAL),
|
||||
})
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Step 3: sort by target execution time to run most urgent first.
|
||||
result.jobs.sort_by_key(|j| j.target_time);
|
||||
|
||||
result
|
||||
}
|
||||
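// Illustrative sketch (standalone): the jittered scheduling used in `schedule` above. New
// tenants get a random offset within the freshen interval so that a mass-attach or restart
// does not make every tenant download at the same instant. The constant and function names
// here are stand-ins for the real SecondaryDetail fields.
use rand::Rng;
use std::time::{Duration, Instant};

const FRESHEN_INTERVAL: Duration = Duration::from_secs(60);

fn next_download_time(now: Instant, previous: Option<Instant>) -> Instant {
    match previous {
        // Already scheduled: keep the existing target.
        Some(t) => t,
        // First time we see this tenant: spread it randomly across the interval.
        None => {
            let jitter_ms = rand::thread_rng().gen_range(0..FRESHEN_INTERVAL.as_millis() as u64);
            now + Duration::from_millis(jitter_ms)
        }
    }
}

fn main() {
    let now = Instant::now();
    let first = next_download_time(now, None);
    assert!(first >= now && first < now + FRESHEN_INTERVAL);
    // A due tenant is one whose target time has passed; overdue tenants sort first.
    println!("due now: {}", now >= first);
}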
|
||||
fn on_command(&mut self, command: DownloadCommand) -> anyhow::Result<PendingDownload> {
|
||||
let tenant_shard_id = command.get_tenant_shard_id();
|
||||
|
||||
let tenant = self
|
||||
.tenant_manager
|
||||
.get_secondary_tenant_shard(*tenant_shard_id);
|
||||
let Some(tenant) = tenant else {
return Err(anyhow::anyhow!("Not found or not in Secondary mode"));
};
|
||||
|
||||
Ok(PendingDownload {
|
||||
target_time: None,
|
||||
period: None,
|
||||
last_download: None,
|
||||
secondary_state: tenant,
|
||||
})
|
||||
}
|
||||
|
||||
fn spawn(
|
||||
&mut self,
|
||||
job: PendingDownload,
|
||||
) -> (
|
||||
RunningDownload,
|
||||
Pin<Box<dyn Future<Output = CompleteDownload> + Send>>,
|
||||
) {
|
||||
let PendingDownload {
|
||||
secondary_state,
|
||||
last_download,
|
||||
target_time,
|
||||
period,
|
||||
} = job;
|
||||
|
||||
let (completion, barrier) = utils::completion::channel();
|
||||
let remote_storage = self.remote_storage.clone();
|
||||
let conf = self.tenant_manager.get_conf();
|
||||
let tenant_shard_id = *secondary_state.get_tenant_shard_id();
|
||||
(RunningDownload { barrier }, Box::pin(async move {
|
||||
let _completion = completion;
|
||||
|
||||
match TenantDownloader::new(conf, &remote_storage, &secondary_state)
|
||||
.download()
|
||||
.await
|
||||
{
|
||||
Err(UpdateError::NoData) => {
|
||||
tracing::info!("No heatmap found for tenant. This is fine if it is new.");
|
||||
},
|
||||
Err(UpdateError::NoSpace) => {
|
||||
tracing::warn!("Insufficient space while downloading. Will retry later.");
|
||||
}
|
||||
Err(UpdateError::Cancelled) => {
|
||||
tracing::debug!("Shut down while downloading");
|
||||
},
|
||||
Err(UpdateError::Deserialize(e)) => {
|
||||
tracing::error!("Corrupt content while downloading tenant: {e}");
|
||||
},
|
||||
Err(e @ (UpdateError::DownloadError(_) | UpdateError::Other(_))) => {
|
||||
tracing::error!("Error while downloading tenant: {e}");
|
||||
},
|
||||
Ok(()) => {}
|
||||
};
|
||||
|
||||
// Irrespective of the result, we will reschedule ourselves to run after our usual period.
|
||||
|
||||
// If the job had a target execution time, we may check our final execution
|
||||
// time against that for observability purposes.
|
||||
if let (Some(target_time), Some(period)) = (target_time, period) {
|
||||
// Only track execution lag if this isn't our first download: otherwise, it is expected
// that execution will have taken longer than our configured interval, for example
// when starting up a pageserver and downloading a tenant's layers for the first time.
|
||||
if last_download.is_some() {
|
||||
// Elapsed time includes any scheduling lag as well as the execution of the job
|
||||
let elapsed = Instant::now().duration_since(target_time);
|
||||
|
||||
warn_when_period_overrun(
|
||||
elapsed,
|
||||
period,
|
||||
BackgroundLoopKind::SecondaryDownload,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
CompleteDownload {
|
||||
secondary_state,
|
||||
completed_at: Instant::now(),
|
||||
}
|
||||
}.instrument(info_span!(parent: None, "secondary_download", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))))
|
||||
}
|
||||
}
|
||||
|
||||
/// This type is a convenience to group together the various functions involved in
|
||||
/// freshening a secondary tenant.
|
||||
struct TenantDownloader<'a> {
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: &'a GenericRemoteStorage,
|
||||
secondary_state: &'a SecondaryTenant,
|
||||
}
|
||||
|
||||
/// Errors that may be encountered while updating a tenant
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
enum UpdateError {
|
||||
#[error("No remote data found")]
|
||||
NoData,
|
||||
#[error("Insufficient local storage space")]
|
||||
NoSpace,
|
||||
#[error("Failed to download")]
|
||||
DownloadError(DownloadError),
|
||||
#[error(transparent)]
|
||||
Deserialize(#[from] serde_json::Error),
|
||||
#[error("Cancelled")]
|
||||
Cancelled,
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
impl From<DownloadError> for UpdateError {
|
||||
fn from(value: DownloadError) -> Self {
|
||||
match &value {
|
||||
DownloadError::Cancelled => Self::Cancelled,
|
||||
DownloadError::NotFound => Self::NoData,
|
||||
_ => Self::DownloadError(value),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<std::io::Error> for UpdateError {
|
||||
fn from(value: std::io::Error) -> Self {
|
||||
if let Some(nix::errno::Errno::ENOSPC) = value.raw_os_error().map(nix::errno::from_i32) {
|
||||
UpdateError::NoSpace
|
||||
} else {
|
||||
// An I/O error from e.g. tokio::io::copy is most likely a remote storage issue
|
||||
UpdateError::Other(anyhow::anyhow!(value))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> TenantDownloader<'a> {
|
||||
fn new(
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: &'a GenericRemoteStorage,
|
||||
secondary_state: &'a SecondaryTenant,
|
||||
) -> Self {
|
||||
Self {
|
||||
conf,
|
||||
remote_storage,
|
||||
secondary_state,
|
||||
}
|
||||
}
|
||||
|
||||
async fn download(&self) -> Result<(), UpdateError> {
|
||||
debug_assert_current_span_has_tenant_id();
|
||||
|
||||
// For the duration of a download, we must hold the SecondaryTenant::gate, to
// cover our access to local storage.
|
||||
let Ok(_guard) = self.secondary_state.gate.enter() else {
|
||||
// Shutting down
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
|
||||
// Download the tenant's heatmap
|
||||
let heatmap_bytes = tokio::select!(
|
||||
bytes = self.download_heatmap() => {bytes?},
|
||||
_ = self.secondary_state.cancel.cancelled() => return Ok(())
|
||||
);
|
||||
|
||||
let heatmap = serde_json::from_slice::<HeatMapTenant>(&heatmap_bytes)?;
|
||||
|
||||
// Save the heatmap: this will be useful on restart, allowing us to reconstruct
|
||||
// layer metadata without having to re-download it.
|
||||
let heatmap_path = self.conf.tenant_heatmap_path(tenant_shard_id);
|
||||
|
||||
let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX);
|
||||
let context_msg = format!("write tenant {tenant_shard_id} heatmap to {heatmap_path}");
|
||||
let heatmap_path_bg = heatmap_path.clone();
|
||||
tokio::task::spawn_blocking(move || {
|
||||
tokio::runtime::Handle::current().block_on(async move {
|
||||
VirtualFile::crashsafe_overwrite(&heatmap_path_bg, &temp_path, &heatmap_bytes).await
|
||||
})
|
||||
})
|
||||
.await
|
||||
.expect("Blocking task is never aborted")
|
||||
.maybe_fatal_err(&context_msg)?;
|
||||
|
||||
tracing::debug!("Wrote local heatmap to {}", heatmap_path);
|
||||
|
||||
// Download the layers in the heatmap
|
||||
for timeline in heatmap.timelines {
|
||||
if self.secondary_state.cancel.is_cancelled() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let timeline_id = timeline.timeline_id;
|
||||
self.download_timeline(timeline)
|
||||
.instrument(tracing::info_span!(
|
||||
"secondary_download_timeline",
|
||||
tenant_id=%tenant_shard_id.tenant_id,
|
||||
shard_id=%tenant_shard_id.shard_slug(),
|
||||
%timeline_id
|
||||
))
|
||||
.await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
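// Illustrative sketch (standalone): the crash-safe "write temp file, sync, rename" shape
// that VirtualFile::crashsafe_overwrite provides for the heatmap above. The suffix and
// file names are stand-ins; the point is that readers only ever see the old or the new
// complete file, never a half-written one. (The real implementation may also sync the
// containing directory.)
use std::fs::{self, File};
use std::io::Write;
use std::path::Path;

fn crashsafe_overwrite(path: &Path, bytes: &[u8]) -> std::io::Result<()> {
    let temp_path = path.with_extension("___temp");
    {
        let mut f = File::create(&temp_path)?;
        f.write_all(bytes)?;
        // Make sure the data is durable before we expose it under the final name.
        f.sync_all()?;
    }
    // Atomic on POSIX filesystems: the final path flips from old content to new content.
    fs::rename(&temp_path, path)?;
    Ok(())
}

fn main() -> std::io::Result<()> {
    let dir = std::env::temp_dir();
    let target = dir.join("heatmap-sketch.json");
    crashsafe_overwrite(&target, br#"{"timelines": []}"#)?;
    println!("wrote {}", target.display());
    fs::remove_file(&target)
}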
async fn download_heatmap(&self) -> Result<Vec<u8>, UpdateError> {
|
||||
debug_assert_current_span_has_tenant_id();
|
||||
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
|
||||
// TODO: make download conditional on ETag having changed since last download
|
||||
// (https://github.com/neondatabase/neon/issues/6199)
|
||||
tracing::debug!("Downloading heatmap for secondary tenant",);
|
||||
|
||||
let heatmap_path = remote_heatmap_path(tenant_shard_id);
|
||||
|
||||
let heatmap_bytes = backoff::retry(
|
||||
|| async {
|
||||
let download = self
|
||||
.remote_storage
|
||||
.download(&heatmap_path)
|
||||
.await
|
||||
.map_err(UpdateError::from)?;
|
||||
let mut heatmap_bytes = Vec::new();
|
||||
let mut body = tokio_util::io::StreamReader::new(download.download_stream);
|
||||
let _size = tokio::io::copy(&mut body, &mut heatmap_bytes).await?;
|
||||
Ok(heatmap_bytes)
|
||||
},
|
||||
|e| matches!(e, UpdateError::NoData | UpdateError::Cancelled),
|
||||
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"download heatmap",
|
||||
backoff::Cancel::new(self.secondary_state.cancel.clone(), || {
|
||||
UpdateError::Cancelled
|
||||
}),
|
||||
)
|
||||
.await?;
|
||||
|
||||
SECONDARY_MODE.download_heatmap.inc();
|
||||
|
||||
Ok(heatmap_bytes)
|
||||
}
|
||||
|
||||
async fn download_timeline(&self, timeline: HeatMapTimeline) -> Result<(), UpdateError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
|
||||
let timeline_path = self
|
||||
.conf
|
||||
.timeline_path(tenant_shard_id, &timeline.timeline_id);
|
||||
|
||||
// Accumulate updates to the state
|
||||
let mut touched = Vec::new();
|
||||
|
||||
// Clone a view of what layers already exist on disk
|
||||
let timeline_state = self
|
||||
.secondary_state
|
||||
.detail
|
||||
.lock()
|
||||
.unwrap()
|
||||
.timelines
|
||||
.get(&timeline.timeline_id)
|
||||
.cloned();
|
||||
|
||||
let timeline_state = match timeline_state {
|
||||
Some(t) => t,
|
||||
None => {
|
||||
// We have no existing state: need to scan local disk for layers first.
|
||||
let timeline_state =
|
||||
init_timeline_state(self.conf, tenant_shard_id, &timeline).await;
|
||||
|
||||
// Re-acquire detail lock now that we're done with async load from local FS
|
||||
self.secondary_state
|
||||
.detail
|
||||
.lock()
|
||||
.unwrap()
|
||||
.timelines
|
||||
.insert(timeline.timeline_id, timeline_state.clone());
|
||||
timeline_state
|
||||
}
|
||||
};
|
||||
|
||||
let layers_in_heatmap = timeline
|
||||
.layers
|
||||
.iter()
|
||||
.map(|l| &l.name)
|
||||
.collect::<HashSet<_>>();
|
||||
let layers_on_disk = timeline_state
|
||||
.on_disk_layers
|
||||
.iter()
|
||||
.map(|l| l.0)
|
||||
.collect::<HashSet<_>>();
|
||||
|
||||
// Remove on-disk layers that are no longer present in heatmap
|
||||
for layer in layers_on_disk.difference(&layers_in_heatmap) {
|
||||
let local_path = timeline_path.join(layer.to_string());
|
||||
tracing::info!("Removing secondary local layer {layer} because it's absent in heatmap",);
|
||||
tokio::fs::remove_file(&local_path)
|
||||
.await
|
||||
.or_else(fs_ext::ignore_not_found)
|
||||
.maybe_fatal_err("Removing secondary layer")?;
|
||||
}
|
||||
|
||||
// Download heatmap layers that are not present on local disk, or update their
|
||||
// access time if they are already present.
|
||||
for layer in timeline.layers {
|
||||
if self.secondary_state.cancel.is_cancelled() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Existing on-disk layers: just update their access time.
|
||||
if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) {
|
||||
tracing::debug!("Layer {} is already on disk", layer.name);
|
||||
if on_disk.metadata != LayerFileMetadata::from(&layer.metadata)
|
||||
|| on_disk.access_time != layer.access_time
|
||||
{
|
||||
// We already have this layer on disk. Update its access time.
|
||||
tracing::debug!(
|
||||
"Access time updated for layer {}: {} -> {}",
|
||||
layer.name,
|
||||
strftime(&on_disk.access_time),
|
||||
strftime(&layer.access_time)
|
||||
);
|
||||
touched.push(layer);
|
||||
}
|
||||
continue;
|
||||
} else {
|
||||
tracing::debug!("Layer {} not present on disk yet", layer.name);
|
||||
}
|
||||
|
||||
// Eviction: if we evicted a layer, then do not re-download it unless it was accessed more
|
||||
// recently than it was evicted.
|
||||
if let Some(evicted_at) = timeline_state.evicted_at.get(&layer.name) {
|
||||
if &layer.access_time > evicted_at {
|
||||
tracing::info!(
|
||||
"Re-downloading evicted layer {}, accessed at {}, evicted at {}",
|
||||
layer.name,
|
||||
strftime(&layer.access_time),
|
||||
strftime(evicted_at)
|
||||
);
|
||||
} else {
|
||||
tracing::trace!(
|
||||
"Not re-downloading evicted layer {}, accessed at {}, evicted at {}",
|
||||
layer.name,
|
||||
strftime(&layer.access_time),
|
||||
strftime(evicted_at)
|
||||
);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
|
||||
let downloaded_bytes = match download_layer_file(
|
||||
self.conf,
|
||||
self.remote_storage,
|
||||
*tenant_shard_id,
|
||||
timeline.timeline_id,
|
||||
&layer.name,
|
||||
&LayerFileMetadata::from(&layer.metadata),
|
||||
&self.secondary_state.cancel,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(bytes) => bytes,
|
||||
Err(e) => {
|
||||
if let DownloadError::NotFound = e {
|
||||
// A heatmap might be out of date and refer to a layer that doesn't exist any more.
|
||||
// This is harmless: continue to download the next layer. It is expected during compaction
// or GC.
|
||||
tracing::debug!(
|
||||
"Skipped downloading missing layer {}, raced with compaction/gc?",
|
||||
layer.name
|
||||
);
|
||||
continue;
|
||||
} else {
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
if downloaded_bytes != layer.metadata.file_size {
|
||||
let local_path = timeline_path.join(layer.name.to_string());
|
||||
|
||||
tracing::warn!(
|
||||
"Downloaded layer {} with unexpected size {} != {}. Removing download.",
|
||||
layer.name,
|
||||
downloaded_bytes,
|
||||
layer.metadata.file_size
|
||||
);
|
||||
|
||||
tokio::fs::remove_file(&local_path)
|
||||
.await
|
||||
.or_else(fs_ext::ignore_not_found)?;
|
||||
}
|
||||
|
||||
SECONDARY_MODE.download_layer.inc();
|
||||
touched.push(layer)
|
||||
}
|
||||
|
||||
// Write updates to state to record layers we just downloaded or touched.
|
||||
{
|
||||
let mut detail = self.secondary_state.detail.lock().unwrap();
|
||||
let timeline_detail = detail.timelines.entry(timeline.timeline_id).or_default();
|
||||
|
||||
tracing::info!("Wrote timeline_detail for {} touched layers", touched.len());
|
||||
|
||||
for t in touched {
|
||||
use std::collections::hash_map::Entry;
|
||||
match timeline_detail.on_disk_layers.entry(t.name.clone()) {
|
||||
Entry::Occupied(mut v) => {
|
||||
v.get_mut().access_time = t.access_time;
|
||||
}
|
||||
Entry::Vacant(e) => {
|
||||
e.insert(OnDiskState::new(
|
||||
self.conf,
|
||||
tenant_shard_id,
|
||||
&timeline.timeline_id,
|
||||
t.name,
|
||||
LayerFileMetadata::from(&t.metadata),
|
||||
t.access_time,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
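// Illustrative sketch (standalone): the eviction check used in download_timeline above.
// A layer that we deliberately evicted is only re-downloaded if the heatmap says it was
// accessed *after* the eviction; otherwise the eviction decision stands. Types are stand-ins.
use std::time::{Duration, SystemTime};

fn should_redownload(accessed_at: SystemTime, evicted_at: Option<SystemTime>) -> bool {
    match evicted_at {
        // Never evicted locally: a heatmap layer that is missing on disk gets downloaded.
        None => true,
        // Evicted: only fetch it again if it has been accessed since we evicted it.
        Some(evicted_at) => accessed_at > evicted_at,
    }
}

fn main() {
    let evicted = SystemTime::now();
    let stale_access = evicted - Duration::from_secs(3600);
    let fresh_access = evicted + Duration::from_secs(60);
    assert!(!should_redownload(stale_access, Some(evicted)));
    assert!(should_redownload(fresh_access, Some(evicted)));
    assert!(should_redownload(stale_access, None));
}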
|
||||
/// Scan local storage and build up Layer objects based on the metadata in a HeatMapTimeline
|
||||
async fn init_timeline_state(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
heatmap: &HeatMapTimeline,
|
||||
) -> SecondaryDetailTimeline {
|
||||
let timeline_path = conf.timeline_path(tenant_shard_id, &heatmap.timeline_id);
|
||||
let mut detail = SecondaryDetailTimeline::default();
|
||||
|
||||
let mut dir = match tokio::fs::read_dir(&timeline_path).await {
|
||||
Ok(d) => d,
|
||||
Err(e) => {
|
||||
if e.kind() == std::io::ErrorKind::NotFound {
|
||||
let context = format!("Creating timeline directory {timeline_path}");
|
||||
tracing::info!("{}", context);
|
||||
tokio::fs::create_dir_all(&timeline_path)
|
||||
.await
|
||||
.fatal_err(&context);
|
||||
|
||||
// No entries to report: drop out.
|
||||
return detail;
|
||||
} else {
|
||||
on_fatal_io_error(&e, &format!("Reading timeline dir {timeline_path}"));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// As we iterate through layers found on disk, we will look up their metadata from this map.
|
||||
// Layers not present in metadata will be discarded.
|
||||
let heatmap_metadata: HashMap<&LayerFileName, &HeatMapLayer> =
|
||||
heatmap.layers.iter().map(|l| (&l.name, l)).collect();
|
||||
|
||||
while let Some(dentry) = dir
|
||||
.next_entry()
|
||||
.await
|
||||
.fatal_err(&format!("Listing {timeline_path}"))
|
||||
{
|
||||
let dentry_file_name = dentry.file_name();
|
||||
let file_name = dentry_file_name.to_string_lossy();
|
||||
let local_meta = dentry.metadata().await.fatal_err(&format!(
|
||||
"Read metadata on {}",
|
||||
dentry.path().to_string_lossy()
|
||||
));
|
||||
|
||||
// Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
|
||||
if file_name == METADATA_FILE_NAME {
|
||||
continue;
|
||||
}
|
||||
|
||||
match LayerFileName::from_str(&file_name) {
|
||||
Ok(name) => {
|
||||
let remote_meta = heatmap_metadata.get(&name);
|
||||
match remote_meta {
|
||||
Some(remote_meta) => {
|
||||
// TODO: checksums for layers (https://github.com/neondatabase/neon/issues/2784)
|
||||
if local_meta.len() != remote_meta.metadata.file_size {
|
||||
// This should not happen, because we do crashsafe write-then-rename when downloading
|
||||
// layers, and layers in remote storage are immutable. Remove the local file because
|
||||
// we cannot trust it.
|
||||
tracing::warn!(
|
||||
"Removing local layer {name} with unexpected local size {} != {}",
|
||||
local_meta.len(),
|
||||
remote_meta.metadata.file_size
|
||||
);
|
||||
} else {
|
||||
// We expect the access time to be initialized immediately afterwards, when
|
||||
// the latest heatmap is applied to the state.
|
||||
detail.on_disk_layers.insert(
|
||||
name.clone(),
|
||||
OnDiskState::new(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
&heatmap.timeline_id,
|
||||
name,
|
||||
LayerFileMetadata::from(&remote_meta.metadata),
|
||||
remote_meta.access_time,
|
||||
),
|
||||
);
|
||||
}
|
||||
}
|
||||
None => {
|
||||
// FIXME: consider some optimization when transitioning from attached to secondary: maybe
|
||||
// wait until we have seen a heatmap that is more recent than the most recent on-disk state? Otherwise
|
||||
// we will end up deleting any layers which were created+uploaded more recently than the heatmap.
|
||||
tracing::info!(
|
||||
"Removing secondary local layer {} because it's absent in heatmap",
|
||||
name
|
||||
);
|
||||
tokio::fs::remove_file(&dentry.path())
|
||||
.await
|
||||
.or_else(fs_ext::ignore_not_found)
|
||||
.fatal_err(&format!(
|
||||
"Removing layer {}",
|
||||
dentry.path().to_string_lossy()
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(_) => {
|
||||
// Ignore it.
|
||||
tracing::warn!("Unexpected file in timeline directory: {file_name}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
detail
|
||||
}
|
||||
@@ -1,6 +1,5 @@
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
pin::Pin,
|
||||
sync::{Arc, Weak},
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
@@ -8,86 +7,35 @@ use std::{
|
||||
use crate::{
|
||||
metrics::SECONDARY_MODE,
|
||||
tenant::{
|
||||
config::AttachmentMode,
|
||||
mgr::TenantManager,
|
||||
remote_timeline_client::remote_heatmap_path,
|
||||
span::debug_assert_current_span_has_tenant_id,
|
||||
tasks::{warn_when_period_overrun, BackgroundLoopKind},
|
||||
Tenant,
|
||||
config::AttachmentMode, mgr::TenantManager, remote_timeline_client::remote_heatmap_path,
|
||||
secondary::CommandResponse, span::debug_assert_current_span_has_tenant_id, Tenant,
|
||||
},
|
||||
};
|
||||
|
||||
use futures::Future;
|
||||
use md5;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use rand::Rng;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
|
||||
use super::{
|
||||
scheduler::{self, JobGenerator, RunningJob, SchedulingResult, TenantBackgroundJobs},
|
||||
CommandRequest,
|
||||
};
|
||||
use tokio::task::JoinSet;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{info_span, instrument, Instrument};
|
||||
use utils::{backoff, completion::Barrier, yielding_loop::yielding_loop};
|
||||
use tracing::instrument;
|
||||
use utils::{backoff, completion::Barrier};
|
||||
|
||||
use super::{heatmap::HeatMapTenant, UploadCommand};
|
||||
use super::{heatmap::HeatMapTenant, CommandRequest, UploadCommand};
|
||||
|
||||
pub(super) async fn heatmap_uploader_task(
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
command_queue: tokio::sync::mpsc::Receiver<CommandRequest<UploadCommand>>,
|
||||
background_jobs_can_start: Barrier,
|
||||
cancel: CancellationToken,
|
||||
) {
|
||||
let concurrency = tenant_manager.get_conf().heatmap_upload_concurrency;
|
||||
|
||||
let generator = HeatmapUploader {
|
||||
tenant_manager,
|
||||
remote_storage,
|
||||
cancel: cancel.clone(),
|
||||
tenants: HashMap::new(),
|
||||
};
|
||||
let mut scheduler = Scheduler::new(generator, concurrency);
|
||||
|
||||
scheduler
|
||||
.run(command_queue, background_jobs_can_start, cancel)
|
||||
.instrument(info_span!("heatmap_uploader"))
|
||||
.await
|
||||
}
|
||||
|
||||
/// This type is owned by a single task ([`heatmap_uploader_task`]) which runs an event
|
||||
/// handling loop and mutates it as needed: there are no locks here, because that event loop
|
||||
/// can hold &mut references to this type throughout.
|
||||
struct HeatmapUploader {
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
cancel: CancellationToken,
|
||||
|
||||
tenants: HashMap<TenantShardId, UploaderTenantState>,
|
||||
}
|
||||
/// Period between heatmap uploader walking Tenants to look for work to do.
|
||||
/// If any tenants have a heatmap upload period lower than this, it will be adjusted
|
||||
/// downward to match.
|
||||
const DEFAULT_SCHEDULING_INTERVAL: Duration = Duration::from_millis(60000);
|
||||
const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_millis(1000);
|
||||
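// Illustrative sketch (standalone): how the uploader's wake-up interval is adapted to the
// tenants it sees, per the comment above. If any tenant asks for more frequent uploads than
// the current interval, the scheduler tightens it, but never below a floor. Names are
// simplified stand-ins for the uploader state.
use std::time::Duration;

const DEFAULT_SCHEDULING_INTERVAL: Duration = Duration::from_secs(60);
const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_secs(1);

fn adjusted_interval(current: Duration, tenant_period: Option<Duration>) -> Duration {
    match tenant_period {
        // No tenant wants faster uploads: keep what we have.
        None => current,
        Some(period) if period >= current => current,
        // A tenant wants faster uploads: follow it, but respect the floor.
        Some(period) => std::cmp::max(period, MIN_SCHEDULING_INTERVAL),
    }
}

fn main() {
    let mut interval = DEFAULT_SCHEDULING_INTERVAL;
    for period in [Some(Duration::from_secs(10)), Some(Duration::from_millis(200)), None] {
        interval = adjusted_interval(interval, period);
    }
    assert_eq!(interval, MIN_SCHEDULING_INTERVAL);
    println!("scheduling interval: {interval:?}");
}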
|
||||
struct WriteInProgress {
|
||||
barrier: Barrier,
|
||||
}
|
||||
|
||||
impl RunningJob for WriteInProgress {
|
||||
fn get_barrier(&self) -> Barrier {
|
||||
self.barrier.clone()
|
||||
}
|
||||
}
|
||||
|
||||
struct UploadPending {
|
||||
tenant: Arc<Tenant>,
|
||||
last_digest: Option<md5::Digest>,
|
||||
target_time: Option<Instant>,
|
||||
period: Option<Duration>,
|
||||
}
|
||||
|
||||
impl scheduler::PendingJob for UploadPending {
|
||||
fn get_tenant_shard_id(&self) -> &TenantShardId {
|
||||
self.tenant.get_tenant_shard_id()
|
||||
}
|
||||
}
|
||||
|
||||
struct WriteComplete {
|
||||
@@ -97,12 +45,6 @@ struct WriteComplete {
|
||||
next_upload: Option<Instant>,
|
||||
}
|
||||
|
||||
impl scheduler::Completion for WriteComplete {
|
||||
fn get_tenant_shard_id(&self) -> &TenantShardId {
|
||||
&self.tenant_shard_id
|
||||
}
|
||||
}
|
||||
|
||||
/// The heatmap uploader keeps a little bit of per-tenant state, mainly to remember
|
||||
/// when we last did a write. We only populate this after doing at least one
|
||||
/// write for a tenant -- this avoids holding state for tenants that have
|
||||
@@ -126,110 +68,267 @@ struct UploaderTenantState {
|
||||
next_upload: Option<Instant>,
|
||||
}
|
||||
|
||||
type Scheduler = TenantBackgroundJobs<
|
||||
HeatmapUploader,
|
||||
UploadPending,
|
||||
WriteInProgress,
|
||||
WriteComplete,
|
||||
UploadCommand,
|
||||
>;
|
||||
/// This type is owned by a single task ([`heatmap_uploader_task`]) which runs an event
|
||||
/// handling loop and mutates it as needed: there are no locks here, because that event loop
|
||||
/// can hold &mut references to this type throughout.
|
||||
struct HeatmapUploader {
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
cancel: CancellationToken,
|
||||
|
||||
impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
|
||||
for HeatmapUploader
|
||||
{
|
||||
async fn schedule(&mut self) -> SchedulingResult<UploadPending> {
|
||||
tenants: HashMap<TenantShardId, UploaderTenantState>,
|
||||
|
||||
/// Tenants with work to do, for which tasks should be spawned as soon as concurrency
|
||||
/// limits permit it.
|
||||
tenants_pending: std::collections::VecDeque<UploadPending>,
|
||||
|
||||
/// Tenants for which a task in `tasks` has been spawned.
|
||||
tenants_uploading: HashMap<TenantShardId, WriteInProgress>,
|
||||
|
||||
tasks: JoinSet<()>,
|
||||
|
||||
/// Channel for our child tasks to send results to: we use a channel for results rather than
|
||||
/// just getting task results via JoinSet because we need the channel's recv() "sleep until something
|
||||
/// is available" semantic, rather than JoinSet::join_next()'s "sleep until next thing is available _or_ I'm empty"
|
||||
/// behavior.
|
||||
task_result_tx: tokio::sync::mpsc::UnboundedSender<WriteComplete>,
|
||||
task_result_rx: tokio::sync::mpsc::UnboundedReceiver<WriteComplete>,
|
||||
|
||||
concurrent_uploads: usize,
|
||||
|
||||
scheduling_interval: Duration,
|
||||
}
|
||||
|
||||
/// The uploader task runs a loop that periodically wakes up and schedules tasks for
|
||||
/// tenants that require an upload, or handles any commands that have been sent into
|
||||
/// `command_queue`. No I/O is done in this loop: that all happens in the tasks we
|
||||
/// spawn.
|
||||
///
|
||||
/// Scheduling iterations are somewhat infrequent. However, each one will enqueue
|
||||
/// all tenants that require an upload, and in between scheduling iterations we will
|
||||
/// continue to spawn new tasks for pending tenants, as our concurrency limit permits.
|
||||
///
|
||||
/// While we take a CancellationToken here, it is subordinate to the CancellationTokens
|
||||
/// of tenants: i.e. we expect all Tenants to have been shut down before we are shut down, otherwise
|
||||
/// we might block waiting on a Tenant.
|
||||
pub(super) async fn heatmap_uploader_task(
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
mut command_queue: tokio::sync::mpsc::Receiver<CommandRequest<UploadCommand>>,
|
||||
background_jobs_can_start: Barrier,
|
||||
cancel: CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let concurrent_uploads = tenant_manager.get_conf().heatmap_upload_concurrency;
|
||||
|
||||
let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel();
|
||||
|
||||
let mut uploader = HeatmapUploader {
|
||||
tenant_manager,
|
||||
remote_storage,
|
||||
cancel: cancel.clone(),
|
||||
tasks: JoinSet::new(),
|
||||
tenants: HashMap::new(),
|
||||
tenants_pending: std::collections::VecDeque::new(),
|
||||
tenants_uploading: HashMap::new(),
|
||||
task_result_tx: result_tx,
|
||||
task_result_rx: result_rx,
|
||||
concurrent_uploads,
|
||||
scheduling_interval: DEFAULT_SCHEDULING_INTERVAL,
|
||||
};
|
||||
|
||||
tracing::info!("Waiting for background_jobs_can start...");
|
||||
background_jobs_can_start.wait().await;
|
||||
tracing::info!("background_jobs_can is ready, proceeding.");
|
||||
|
||||
while !cancel.is_cancelled() {
|
||||
// Look for new work: this is relatively expensive because we have to go acquire the lock on
|
||||
// the tenant manager to retrieve tenants, and then iterate over them to figure out which ones
|
||||
// require an upload.
|
||||
uploader.schedule_iteration().await?;
|
||||
|
||||
// Between scheduling iterations, we will:
|
||||
// - Drain any complete tasks and spawn pending tasks
|
||||
// - Handle incoming administrative commands
|
||||
// - Check our cancellation token
|
||||
let next_scheduling_iteration = Instant::now()
|
||||
.checked_add(uploader.scheduling_interval)
|
||||
.unwrap_or_else(|| {
|
||||
tracing::warn!(
|
||||
"Scheduling interval invalid ({}s), running immediately!",
|
||||
uploader.scheduling_interval.as_secs_f64()
|
||||
);
|
||||
Instant::now()
|
||||
});
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => {
|
||||
// We do not simply drop the JoinSet, in order to have an orderly shutdown without cancellation.
|
||||
tracing::info!("Heatmap uploader joining tasks");
|
||||
while let Some(_r) = uploader.tasks.join_next().await {};
|
||||
tracing::info!("Heatmap uploader terminating");
|
||||
|
||||
break;
|
||||
},
|
||||
_ = tokio::time::sleep(next_scheduling_iteration.duration_since(Instant::now())) => {
|
||||
tracing::debug!("heatmap_uploader_task: woke for scheduling interval");
|
||||
break;},
|
||||
cmd = command_queue.recv() => {
|
||||
tracing::debug!("heatmap_uploader_task: woke for command queue");
|
||||
let cmd = match cmd {
|
||||
Some(c) =>c,
|
||||
None => {
|
||||
// SecondaryController was destroyed, and this has raced with
|
||||
// our CancellationToken
|
||||
tracing::info!("Heatmap uploader terminating");
|
||||
cancel.cancel();
|
||||
break;
|
||||
}
|
||||
};
|
||||
|
||||
let CommandRequest{
|
||||
response_tx,
|
||||
payload
|
||||
} = cmd;
|
||||
uploader.handle_command(payload, response_tx);
|
||||
},
|
||||
_ = uploader.process_next_completion() => {
|
||||
if !cancel.is_cancelled() {
|
||||
uploader.spawn_pending();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
impl HeatmapUploader {
|
||||
/// Periodic execution phase: inspect all attached tenants and schedule any work they require.
|
||||
async fn schedule_iteration(&mut self) -> anyhow::Result<()> {
|
||||
// Cull any entries in self.tenants whose Arc<Tenant> is gone
|
||||
self.tenants
|
||||
.retain(|_k, v| v.tenant.upgrade().is_some() && v.next_upload.is_some());
|
||||
|
||||
// The priority order of previously scheduled work may be invalidated by current state: drop
|
||||
// all pending work (it will be re-scheduled if still needed)
|
||||
self.tenants_pending.clear();
|
||||
|
||||
// Use a fixed 'now' throughout the following loop, for efficiency and fairness.
|
||||
let now = Instant::now();
|
||||
|
||||
let mut result = SchedulingResult {
|
||||
jobs: Vec::new(),
|
||||
want_interval: None,
|
||||
};
|
||||
// While iterating over the potentially-long list of tenants, we will periodically yield
|
||||
// to avoid blocking executor.
|
||||
const YIELD_ITERATIONS: usize = 1000;
|
||||
|
||||
// Iterate over tenants looking for work to do.
|
||||
let tenants = self.tenant_manager.get_attached_active_tenant_shards();
|
||||
|
||||
yielding_loop(1000, &self.cancel, tenants.into_iter(), |tenant| {
|
||||
let period = match tenant.get_heatmap_period() {
|
||||
None => {
|
||||
// Heatmaps are disabled for this tenant
|
||||
return;
|
||||
}
|
||||
Some(period) => {
|
||||
// If any tenant has asked for uploads more frequent than our scheduling interval,
|
||||
// reduce it to match so that we can keep up. This is mainly useful in testing, where
|
||||
// we may set rather short intervals.
|
||||
result.want_interval = match result.want_interval {
|
||||
None => Some(period),
|
||||
Some(existing) => Some(std::cmp::min(period, existing)),
|
||||
};
|
||||
|
||||
period
|
||||
}
|
||||
};
|
||||
|
||||
// Stale attachments do not upload anything: if we are in this state, there is probably some
|
||||
// other attachment in mode Single or Multi running on another pageserver, and we don't
|
||||
// want to thrash and overwrite their heatmap uploads.
|
||||
if tenant.get_attach_mode() == AttachmentMode::Stale {
|
||||
return;
|
||||
for (i, tenant) in tenants.into_iter().enumerate() {
|
||||
// Process is shutting down, drop out
|
||||
if self.cancel.is_cancelled() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Create an entry in self.tenants if one doesn't already exist: this will later be updated
|
||||
// with the completion time in on_completion.
|
||||
let state = self
|
||||
.tenants
|
||||
.entry(*tenant.get_tenant_shard_id())
|
||||
.or_insert_with(|| {
|
||||
let jittered_period = rand::thread_rng().gen_range(Duration::ZERO..period);
|
||||
|
||||
UploaderTenantState {
|
||||
tenant: Arc::downgrade(&tenant),
|
||||
last_upload: None,
|
||||
next_upload: Some(now.checked_add(jittered_period).unwrap_or(now)),
|
||||
last_digest: None,
|
||||
}
|
||||
});
|
||||
|
||||
// Decline to do the upload if insufficient time has passed
|
||||
if state.next_upload.map(|nu| nu > now).unwrap_or(false) {
|
||||
return;
|
||||
// Skip tenants that already have a write in flight
|
||||
if self
|
||||
.tenants_uploading
|
||||
.contains_key(tenant.get_tenant_shard_id())
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
let last_digest = state.last_digest;
|
||||
result.jobs.push(UploadPending {
|
||||
tenant,
|
||||
last_digest,
|
||||
target_time: state.next_upload,
|
||||
period: Some(period),
|
||||
});
|
||||
})
|
||||
.await
|
||||
.ok();
|
||||
self.maybe_schedule_upload(&now, tenant);
|
||||
|
||||
result
|
||||
if (i + 1) % YIELD_ITERATIONS == 0 {
|
||||
tokio::task::yield_now().await;
|
||||
}
|
||||
}
|
||||
|
||||
// Spawn tasks for as many of our pending tenants as we can.
self.spawn_pending();

Ok(())
}

fn spawn(
&mut self,
job: UploadPending,
) -> (
WriteInProgress,
Pin<Box<dyn Future<Output = WriteComplete> + Send>>,
) {
let UploadPending {
///
/// Cancellation: this method is cancel-safe.
async fn process_next_completion(&mut self) {
match self.task_result_rx.recv().await {
Some(r) => {
self.on_completion(r);
}
None => {
unreachable!("Result sender is stored on Self");
}
}
}

/// The 'maybe' refers to the tenant's state: whether it is configured
/// for heatmap uploads at all, and whether sufficient time has passed
/// since the last upload.
fn maybe_schedule_upload(&mut self, now: &Instant, tenant: Arc<Tenant>) {
match tenant.get_heatmap_period() {
None => {
// Heatmaps are disabled for this tenant
return;
}
Some(period) => {
// If any tenant has asked for uploads more frequent than our scheduling interval,
// reduce it to match so that we can keep up. This is mainly useful in testing, where
// we may set rather short intervals.
if period < self.scheduling_interval {
self.scheduling_interval = std::cmp::max(period, MIN_SCHEDULING_INTERVAL);
}
}
}

// Stale attachments do not upload anything: if we are in this state, there is probably some
// other attachment in mode Single or Multi running on another pageserver, and we don't
// want to thrash and overwrite their heatmap uploads.
if tenant.get_attach_mode() == AttachmentMode::Stale {
return;
}

// Create an entry in self.tenants if one doesn't already exist: this will later be updated
// with the completion time in on_completion.
let state = self
.tenants
.entry(*tenant.get_tenant_shard_id())
.or_insert_with(|| UploaderTenantState {
tenant: Arc::downgrade(&tenant),
last_upload: None,
next_upload: Some(Instant::now()),
last_digest: None,
});

// Decline to do the upload if insufficient time has passed
if state.next_upload.map(|nu| &nu > now).unwrap_or(false) {
return;
}

let last_digest = state.last_digest;
self.tenants_pending.push_back(UploadPending {
tenant,
last_digest,
target_time,
period,
} = job;
})
}

fn spawn_pending(&mut self) {
while !self.tenants_pending.is_empty()
&& self.tenants_uploading.len() < self.concurrent_uploads
{
// unwrap: loop condition includes !is_empty()
let pending = self.tenants_pending.pop_front().unwrap();
self.spawn_upload(pending.tenant, pending.last_digest);
}
}

fn spawn_upload(&mut self, tenant: Arc<Tenant>, last_digest: Option<md5::Digest>) {
let remote_storage = self.remote_storage.clone();
let (completion, barrier) = utils::completion::channel();
let tenant_shard_id = *tenant.get_tenant_shard_id();
(WriteInProgress { barrier }, Box::pin(async move {
let (completion, barrier) = utils::completion::channel();
let result_tx = self.task_result_tx.clone();
self.tasks.spawn(async move {
// Guard for the barrier in [`WriteInProgress`]
let _completion = completion;

@@ -263,47 +362,22 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
};

let now = Instant::now();

// If the job had a target execution time, we may check our final execution
// time against that for observability purposes.
if let (Some(target_time), Some(period)) = (target_time, period) {
// Elapsed time includes any scheduling lag as well as the execution of the job
let elapsed = now.duration_since(target_time);

warn_when_period_overrun(elapsed, period, BackgroundLoopKind::HeatmapUpload);
}

let next_upload = tenant
.get_heatmap_period()
.and_then(|period| now.checked_add(period));

WriteComplete {
result_tx
.send(WriteComplete {
tenant_shard_id: *tenant.get_tenant_shard_id(),
completed_at: now,
digest,
next_upload,
}
}.instrument(info_span!(parent: None, "heatmap_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))))
}
})
.ok();
});

fn on_command(&mut self, command: UploadCommand) -> anyhow::Result<UploadPending> {
let tenant_shard_id = command.get_tenant_shard_id();

tracing::info!(
tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Starting heatmap write on command");
let tenant = self
.tenant_manager
.get_attached_tenant_shard(*tenant_shard_id, true)
.map_err(|e| anyhow::anyhow!(e))?;

Ok(UploadPending {
// Ignore our state for last digest: this forces an upload even if nothing has changed
last_digest: None,
tenant,
target_time: None,
period: None,
})
self.tenants_uploading
.insert(tenant_shard_id, WriteInProgress { barrier });
}

#[instrument(skip_all, fields(tenant_id=%completion.tenant_shard_id.tenant_id, shard_id=%completion.tenant_shard_id.shard_slug()))]
@@ -315,6 +389,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
digest,
next_upload,
} = completion;
self.tenants_uploading.remove(&tenant_shard_id);
use std::collections::hash_map::Entry;
match self.tenants.entry(tenant_shard_id) {
Entry::Vacant(_) => {
@@ -327,6 +402,69 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
}
}
}

fn handle_command(
&mut self,
command: UploadCommand,
response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
) {
match command {
UploadCommand::Upload(tenant_shard_id) => {
// If an upload was ongoing for this tenant, let it finish first.
let barrier = if let Some(writing_state) =
self.tenants_uploading.get(&tenant_shard_id)
{
tracing::info!(
tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Waiting for heatmap write to complete");
writing_state.barrier.clone()
} else {
// Spawn the upload then immediately wait for it. This will block processing of other commands and
// starting of other background work.
tracing::info!(
tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Starting heatmap write on command");
let tenant = match self
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id, true)
{
Ok(t) => t,
Err(e) => {
// Drop result of send: we don't care if caller dropped their receiver
drop(response_tx.send(CommandResponse {
result: Err(e.into()),
}));
return;
}
};
self.spawn_upload(tenant, None);
let writing_state = self
.tenants_uploading
.get(&tenant_shard_id)
.expect("We just inserted this");
tracing::info!(
tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Waiting for heatmap upload to complete");

writing_state.barrier.clone()
};

// This task does no I/O: it only listens for a barrier's completion and then
// sends to the command response channel. It is therefore safe to spawn this without
// any gates/task_mgr hooks.
tokio::task::spawn(async move {
barrier.wait().await;

tracing::info!(
tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Heatmap upload complete");

// Drop result of send: we don't care if caller dropped their receiver
drop(response_tx.send(CommandResponse { result: Ok(()) }))
});
}
}
}
}

enum UploadHeatmapOutcome {
@@ -349,6 +487,7 @@ enum UploadHeatmapError {

/// The inner upload operation. This will skip if `last_digest` is Some and matches the digest
/// of the object we would have uploaded.
#[instrument(skip_all, fields(tenant_id = %tenant.get_tenant_shard_id().tenant_id, shard_id = %tenant.get_tenant_shard_id().shard_slug()))]
async fn upload_tenant_heatmap(
remote_storage: GenericRemoteStorage,
tenant: &Arc<Tenant>,

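// A minimal sketch of the digest-based skip described in the doc comment above, assuming
// the heatmap has already been serialized to `bytes`. `digest_and_compare` is an
// illustrative helper name, not part of this diff; md5 is used here only as a cheap
// change detector, mirroring the `Option<md5::Digest>` that the uploader tracks per tenant.
fn digest_and_compare(bytes: &[u8], last_digest: Option<md5::Digest>) -> (md5::Digest, bool) {
    let digest = md5::compute(bytes);
    // `unchanged == true` means the caller may skip the upload entirely.
    let unchanged = last_digest.map(|prev| prev == digest).unwrap_or(false);
    (digest, unchanged)
}
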
@@ -1,359 +0,0 @@
use futures::Future;
use std::{
    collections::HashMap,
    marker::PhantomData,
    pin::Pin,
    time::{Duration, Instant},
};

use pageserver_api::shard::TenantShardId;
use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;
use utils::{completion::Barrier, yielding_loop::yielding_loop};

use super::{CommandRequest, CommandResponse};

/// Scheduling interval is the time between calls to JobGenerator::schedule.
/// When we schedule jobs, the job generator may provide a hint of its preferred
/// interval, which we will respect within these bounds.
const MAX_SCHEDULING_INTERVAL: Duration = Duration::from_secs(10);
const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_secs(1);

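// A minimal sketch of how a generator's `want_interval` hint could be folded into the
// bounds above, assuming second granularity like the clamping done in `schedule_iteration`
// further down. `clamp_interval` is an illustrative helper name, not part of this module.
fn clamp_interval(want: Duration) -> Duration {
    Duration::from_secs(
        want.as_secs()
            .clamp(MIN_SCHEDULING_INTERVAL.as_secs(), MAX_SCHEDULING_INTERVAL.as_secs()),
    )
}
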
/// Scheduling helper for background work across many tenants.
///
/// Systems that need to run background work across many tenants may use this type
/// to schedule jobs within a concurrency limit, along with their own [`JobGenerator`]
/// implementation to provide the work to execute. This is a simple scheduler that just
/// polls the generator for outstanding work, replacing its queue of pending work with
/// what the generator yields on each call: the job generator can change its mind about
/// the order of jobs between calls. The job generator is notified when jobs complete,
/// and additionally may expose a command hook to generate jobs on-demand (e.g. to implement
/// admin APIs).
///
/// For an example see [`crate::tenant::secondary::heatmap_uploader`]
///
/// G: A JobGenerator that this scheduler will poll to find pending jobs
/// PJ: 'Pending Job': type for job descriptors that are ready to run
/// RJ: 'Running Job': type for jobs that have been spawned
/// C: 'Completion': type that spawned jobs will send when they finish
/// CMD: 'Command': type that the job generator will accept to create jobs on-demand
pub(super) struct TenantBackgroundJobs<G, PJ, RJ, C, CMD>
where
    G: JobGenerator<PJ, RJ, C, CMD>,
    C: Completion,
    PJ: PendingJob,
    RJ: RunningJob,
{
    generator: G,

    /// Ready to run. Will progress to `running` once the concurrency limit is satisfied, or
    /// be removed on the next scheduling pass.
    pending: std::collections::VecDeque<PJ>,

    /// Tasks currently running in Self::tasks for these tenants. Check this map
    /// before pushing more work into pending for the same tenant.
    running: HashMap<TenantShardId, RJ>,

    tasks: JoinSet<C>,

    concurrency: usize,

    /// How often we would like `schedule_iteration` to be called.
    pub(super) scheduling_interval: Duration,

    _phantom: PhantomData<(PJ, RJ, C, CMD)>,
}

pub(crate) trait JobGenerator<PJ, RJ, C, CMD>
where
    C: Completion,
    PJ: PendingJob,
    RJ: RunningJob,
{
    /// Called at each scheduling interval. Return a list of jobs to run, most urgent first.
    ///
    /// This function may be expensive (e.g. walk all tenants), but should not do any I/O.
    /// Implementations should take care to yield the executor periodically if running
    /// very long loops.
    ///
    /// Yielding a job here does _not_ guarantee that it will run: if the queue of pending
    /// jobs is not drained by the next scheduling interval, pending jobs will be cleared
    /// and re-generated.
    async fn schedule(&mut self) -> SchedulingResult<PJ>;

    /// Called when a pending job is ready to be run.
    ///
    /// The job generator provides a future, and an RJ (Running Job) descriptor that tracks it.
    fn spawn(&mut self, pending_job: PJ) -> (RJ, Pin<Box<dyn Future<Output = C> + Send>>);

    /// Called when a job previously spawned with spawn() transmits its completion
    fn on_completion(&mut self, completion: C);

    /// Called when a command is received. A job will be spawned immediately if the return
    /// value is Ok, ignoring concurrency limits and the pending queue.
    fn on_command(&mut self, cmd: CMD) -> anyhow::Result<PJ>;
}

/// [`JobGenerator`] returns this to provide pending jobs, and hints about scheduling
pub(super) struct SchedulingResult<PJ> {
    pub(super) jobs: Vec<PJ>,
    /// The job generator would like to be called again this soon
    pub(super) want_interval: Option<Duration>,
}

/// See [`TenantBackgroundJobs`].
pub(super) trait PendingJob {
    fn get_tenant_shard_id(&self) -> &TenantShardId;
}

/// See [`TenantBackgroundJobs`].
pub(super) trait Completion: Send + 'static {
    fn get_tenant_shard_id(&self) -> &TenantShardId;
}

/// See [`TenantBackgroundJobs`].
pub(super) trait RunningJob {
    fn get_barrier(&self) -> Barrier;
}

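// A minimal sketch of a generator that could plug into `TenantBackgroundJobs`: a no-op job
// type that implements the three marker traits above plus `JobGenerator`. All names here
// (`NoopJob`, `NoopRunning`, `NoopComplete`, `NoopGenerator`) are illustrative and not part
// of this module; the real implementation lives in `heatmap_uploader` as referenced above.
struct NoopJob {
    tenant_shard_id: TenantShardId,
}

impl PendingJob for NoopJob {
    fn get_tenant_shard_id(&self) -> &TenantShardId {
        &self.tenant_shard_id
    }
}

struct NoopRunning {
    barrier: Barrier,
}

impl RunningJob for NoopRunning {
    fn get_barrier(&self) -> Barrier {
        self.barrier.clone()
    }
}

struct NoopComplete {
    tenant_shard_id: TenantShardId,
}

impl Completion for NoopComplete {
    fn get_tenant_shard_id(&self) -> &TenantShardId {
        &self.tenant_shard_id
    }
}

struct NoopGenerator;

impl JobGenerator<NoopJob, NoopRunning, NoopComplete, TenantShardId> for NoopGenerator {
    async fn schedule(&mut self) -> SchedulingResult<NoopJob> {
        // Nothing is pending; no preferred interval, so the scheduler keeps its default.
        SchedulingResult {
            jobs: Vec::new(),
            want_interval: None,
        }
    }

    fn spawn(
        &mut self,
        job: NoopJob,
    ) -> (NoopRunning, Pin<Box<dyn Future<Output = NoopComplete> + Send>>) {
        // The completion guard is held for the lifetime of the future; dropping it when the
        // future finishes releases the barrier stored in the running-job descriptor.
        let (completion, barrier) = utils::completion::channel();
        let tenant_shard_id = *job.get_tenant_shard_id();
        (
            NoopRunning { barrier },
            Box::pin(async move {
                let _completion = completion;
                NoopComplete { tenant_shard_id }
            }),
        )
    }

    fn on_completion(&mut self, _completion: NoopComplete) {}

    fn on_command(&mut self, tenant_shard_id: TenantShardId) -> anyhow::Result<NoopJob> {
        Ok(NoopJob { tenant_shard_id })
    }
}
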
impl<G, PJ, RJ, C, CMD> TenantBackgroundJobs<G, PJ, RJ, C, CMD>
where
    C: Completion,
    PJ: PendingJob,
    RJ: RunningJob,
    G: JobGenerator<PJ, RJ, C, CMD>,
{
    pub(super) fn new(generator: G, concurrency: usize) -> Self {
        Self {
            generator,
            pending: std::collections::VecDeque::new(),
            running: HashMap::new(),
            tasks: JoinSet::new(),
            concurrency,
            scheduling_interval: MAX_SCHEDULING_INTERVAL,
            _phantom: PhantomData,
        }
    }

    pub(super) async fn run(
        &mut self,
        mut command_queue: tokio::sync::mpsc::Receiver<CommandRequest<CMD>>,
        background_jobs_can_start: Barrier,
        cancel: CancellationToken,
    ) {
        tracing::info!("Waiting for background_jobs_can_start...");
        background_jobs_can_start.wait().await;
        tracing::info!("background_jobs_can_start is ready, proceeding.");

        while !cancel.is_cancelled() {
            // Look for new work: this is relatively expensive because we have to go acquire the lock on
            // the tenant manager to retrieve tenants, and then iterate over them to figure out which ones
            // require an upload.
            self.schedule_iteration(&cancel).await;

            if cancel.is_cancelled() {
                return;
            }

            // Schedule some work, if concurrency limit permits it
            self.spawn_pending();

            // Between scheduling iterations, we will:
            // - Drain any complete tasks and spawn pending tasks
            // - Handle incoming administrative commands
            // - Check our cancellation token
            let next_scheduling_iteration = Instant::now()
                .checked_add(self.scheduling_interval)
                .unwrap_or_else(|| {
                    tracing::warn!(
                        "Scheduling interval invalid ({}s)",
                        self.scheduling_interval.as_secs_f64()
                    );
                    // unwrap(): this constant is small, cannot fail to add to time unless
                    // we are close to the end of the universe.
                    Instant::now().checked_add(MIN_SCHEDULING_INTERVAL).unwrap()
                });
            loop {
                tokio::select! {
                    _ = cancel.cancelled() => {
                        tracing::info!("joining tasks");
                        // We do not simply drop the JoinSet, in order to have an orderly shutdown without cancellation.
                        // It is the caller's responsibility to make sure that the tasks they scheduled
                        // respect an appropriate cancellation token, to shut down promptly. It is only
                        // safe to wait on joining these tasks because we can see the cancellation token
                        // has been set.
                        while let Some(_r) = self.tasks.join_next().await {}
                        tracing::info!("terminating on cancellation token.");

                        break;
                    },
                    _ = tokio::time::sleep(next_scheduling_iteration.duration_since(Instant::now())) => {
                        tracing::debug!("woke for scheduling interval");
                        break;
                    },
                    cmd = command_queue.recv() => {
                        tracing::debug!("woke for command queue");
                        let cmd = match cmd {
                            Some(c) => c,
                            None => {
                                // SecondaryController was destroyed, and this has raced with
                                // our CancellationToken
                                tracing::info!("terminating on command queue destruction");
                                cancel.cancel();
                                break;
                            }
                        };

                        let CommandRequest {
                            response_tx,
                            payload
                        } = cmd;
                        self.handle_command(payload, response_tx);
                    },
                    _ = async {
                        let completion = self.process_next_completion().await;
                        match completion {
                            Some(c) => {
                                self.generator.on_completion(c);
                                if !cancel.is_cancelled() {
                                    self.spawn_pending();
                                }
                            },
                            None => {
                                // Nothing is running, so just wait: expect that this future
                                // will be dropped when something in the outer select! fires.
                                cancel.cancelled().await;
                            }
                        }

                    } => {}
                }
            }
        }
    }

    fn do_spawn(&mut self, job: PJ) {
        let tenant_shard_id = *job.get_tenant_shard_id();
        let (in_progress, fut) = self.generator.spawn(job);

        self.tasks.spawn(fut);

        self.running.insert(tenant_shard_id, in_progress);
    }

    /// For all pending tenants that are eligible for execution, spawn their task.
    ///
    /// Caller provides the spawn operation, we track the resulting execution.
    fn spawn_pending(&mut self) {
        while !self.pending.is_empty() && self.running.len() < self.concurrency {
            // unwrap: loop condition includes !is_empty()
            let pending = self.pending.pop_front().unwrap();
            self.do_spawn(pending);
        }
    }

    /// For administrative commands: skip the pending queue, ignore concurrency limits
    fn spawn_now(&mut self, job: PJ) -> &RJ {
        let tenant_shard_id = *job.get_tenant_shard_id();
        self.do_spawn(job);
        self.running
            .get(&tenant_shard_id)
            .expect("We just inserted this")
    }

    /// Wait until the next task completes, and handle its completion
    ///
    /// Cancellation: this method is cancel-safe.
    async fn process_next_completion(&mut self) -> Option<C> {
        match self.tasks.join_next().await {
            Some(r) => {
                // We use a channel to drive completions, but also
                // need to drain the JoinSet to avoid completed tasks
                // accumulating. These calls are 1:1 because every task
                // we spawn into this joinset submits its result to the channel.
                let completion = r.expect("Panic in background task");

                self.running.remove(completion.get_tenant_shard_id());
                Some(completion)
            }
            None => {
                // Nothing is running, so we have nothing to wait for. We may drop out: the
                // main event loop will call us again after the next time it has run something.
                None
            }
        }
    }

    /// Convert the command into a pending job, spawn it, and when the spawned
    /// job completes, send the result down `response_tx`.
    fn handle_command(
        &mut self,
        cmd: CMD,
        response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
    ) {
        let job = match self.generator.on_command(cmd) {
            Ok(j) => j,
            Err(e) => {
                response_tx.send(CommandResponse { result: Err(e) }).ok();
                return;
            }
        };

        let tenant_shard_id = job.get_tenant_shard_id();
        let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
            barrier
        } else {
            let running = self.spawn_now(job);
            running.get_barrier().clone()
        };

        // This task does no I/O: it only listens for a barrier's completion and then
        // sends to the command response channel. It is therefore safe to spawn this without
        // any gates/task_mgr hooks.
        tokio::task::spawn(async move {
            barrier.wait().await;

            response_tx.send(CommandResponse { result: Ok(()) }).ok();
        });
    }

    fn get_running(&self, tenant_shard_id: &TenantShardId) -> Option<Barrier> {
        self.running.get(tenant_shard_id).map(|r| r.get_barrier())
    }

    /// Periodic execution phase: inspect all attached tenants and schedule any work they require.
    ///
    /// The type in `tenants` should be a tenant-like structure, e.g. [`crate::tenant::Tenant`] or [`crate::tenant::secondary::SecondaryTenant`]
    ///
    /// This function resets the pending list: it is assumed that the caller may change their mind about
    /// which tenants need work between calls to schedule_iteration.
    async fn schedule_iteration(&mut self, cancel: &CancellationToken) {
        let SchedulingResult {
            jobs,
            want_interval,
        } = self.generator.schedule().await;

        // Adjust interval based on feedback from the job generator
        if let Some(want_interval) = want_interval {
            // Calculation uses second granularity: this scheduler is not intended for high frequency tasks
            self.scheduling_interval = Duration::from_secs(std::cmp::min(
                std::cmp::max(MIN_SCHEDULING_INTERVAL.as_secs(), want_interval.as_secs()),
                MAX_SCHEDULING_INTERVAL.as_secs(),
            ));
        }

        // The priority order of previously scheduled work may be invalidated by current state: drop
        // all pending work (it will be re-scheduled if still needed)
        self.pending.clear();

        // While iterating over the potentially-long list of tenants, we will periodically yield
        // to avoid blocking the executor.
        yielding_loop(1000, cancel, jobs.into_iter(), |job| {
            // Skip tenants that already have a write in flight
            if !self.running.contains_key(job.get_tenant_shard_id()) {
                self.pending.push_back(job);
            }
        })
        .await
        .ok();
    }
}
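
// A minimal usage sketch for the scheduler above, reusing the illustrative `NoopGenerator`
// from the earlier sketch: build the command channel, a start barrier, and a cancellation
// token, then drive `run()`. The concurrency limit (8) and channel capacity (16) are
// arbitrary illustrative values, and this assumes the code lives where `CommandRequest`
// and the scheduler's `pub(super)` items are visible.
async fn run_noop_scheduler() {
    let (command_tx, command_rx) =
        tokio::sync::mpsc::channel::<CommandRequest<TenantShardId>>(16);
    let (startup_completion, background_jobs_can_start) = utils::completion::channel();
    let cancel = CancellationToken::new();

    let mut scheduler: TenantBackgroundJobs<
        NoopGenerator,
        NoopJob,
        NoopRunning,
        NoopComplete,
        TenantShardId,
    > = TenantBackgroundJobs::new(NoopGenerator, 8);

    // Dropping the completion guard releases the barrier, letting background jobs start.
    drop(startup_completion);

    // `command_tx` would be handed to whoever submits on-demand work as
    // `CommandRequest { payload, response_tx }` pairs; it is unused in this sketch.
    let _ = command_tx;

    // Runs until the cancellation token fires or the command channel is closed.
    scheduler.run(command_rx, background_jobs_can_start, cancel).await;
}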