mirror of
https://github.com/neondatabase/neon.git
synced 2026-02-05 03:30:36 +00:00
Compare commits
101 Commits
proxy-test
...
jcsp/contr
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6686367452 | ||
|
|
9085b0b7a2 | ||
|
|
24b01c1652 | ||
|
|
ee558cb77b | ||
|
|
94f6b488ed | ||
|
|
a12e4261a3 | ||
|
|
cd449d66ea | ||
|
|
6f8f7c7de9 | ||
|
|
12487e662d | ||
|
|
5bcae3a86e | ||
|
|
47657f2df4 | ||
|
|
d669dacd71 | ||
|
|
837988b6c9 | ||
|
|
9c6145f0a9 | ||
|
|
2424d90883 | ||
|
|
cf3baf6039 | ||
|
|
9c48b5c4ab | ||
|
|
c671aeacd4 | ||
|
|
bc7a82caf2 | ||
|
|
b5246753bf | ||
|
|
c1095f4c52 | ||
|
|
1718c0b59b | ||
|
|
8107ae8377 | ||
|
|
555ee9fdd0 | ||
|
|
6921577cec | ||
|
|
20fff05699 | ||
|
|
f2767d2056 | ||
|
|
76b92e3389 | ||
|
|
03f8a42ed9 | ||
|
|
60e5a56a5a | ||
|
|
afda4420bd | ||
|
|
ce1673a8c4 | ||
|
|
532b0fa52b | ||
|
|
4de2f0f3e0 | ||
|
|
41464325c7 | ||
|
|
7257ffbf75 | ||
|
|
84f027357d | ||
|
|
428d9fe69e | ||
|
|
e0af945f8f | ||
|
|
e7452d3756 | ||
|
|
5d6083bfc6 | ||
|
|
3882f57001 | ||
|
|
04190a1fea | ||
|
|
fcbe9fb184 | ||
|
|
cbb599f353 | ||
|
|
e49602ecf5 | ||
|
|
eb02f4619e | ||
|
|
9b8df2634f | ||
|
|
d152d4f16f | ||
|
|
b467d8067b | ||
|
|
a48b23d777 | ||
|
|
21a86487a2 | ||
|
|
686b3c79c8 | ||
|
|
02a8b7fbe0 | ||
|
|
feb359b459 | ||
|
|
0c105ef352 | ||
|
|
4f7704af24 | ||
|
|
e0c12faabd | ||
|
|
2f8a2681b8 | ||
|
|
7e4280955e | ||
|
|
349b375010 | ||
|
|
d0d4871682 | ||
|
|
587cb705b8 | ||
|
|
4d2bf55e6c | ||
|
|
5667372c61 | ||
|
|
61f99d703d | ||
|
|
24014d8383 | ||
|
|
e3ded64d1b | ||
|
|
9b714c8572 | ||
|
|
29fb675432 | ||
|
|
ca07fa5f8b | ||
|
|
5d039c6e9b | ||
|
|
36e1100949 | ||
|
|
59c5b374de | ||
|
|
0f3b87d023 | ||
|
|
c19625a29c | ||
|
|
f2e5212fed | ||
|
|
568bc1fde3 | ||
|
|
45e929c069 | ||
|
|
6b980f38da | ||
|
|
f0d8bd7855 | ||
|
|
046d9c69e6 | ||
|
|
c72cb44213 | ||
|
|
cd3e4ac18d | ||
|
|
9ad940086c | ||
|
|
936f2ee2a5 | ||
|
|
1af047dd3e | ||
|
|
5fa747e493 | ||
|
|
80854b98ff | ||
|
|
024372a3db | ||
|
|
fff2468aa2 | ||
|
|
c7538a2c20 | ||
|
|
a2d0d44b42 | ||
|
|
7d3cdc05d4 | ||
|
|
840abe3954 | ||
|
|
774a6e7475 | ||
|
|
df5d588f63 | ||
|
|
f39b0fce9b | ||
|
|
a9ec4eb4fc | ||
|
|
a97b54e3b9 | ||
|
|
a5114a99b2 |
@@ -39,7 +39,7 @@ runs:
|
||||
PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
|
||||
if [ "${PR_NUMBER}" != "null" ]; then
|
||||
BRANCH_OR_PR=pr-${PR_NUMBER}
|
||||
elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then
|
||||
elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then
|
||||
# Shortcut for special branches
|
||||
BRANCH_OR_PR=${GITHUB_REF_NAME}
|
||||
else
|
||||
@@ -59,7 +59,7 @@ runs:
|
||||
BUCKET: neon-github-public-dev
|
||||
|
||||
# TODO: We can replace with a special docker image with Java and Allure pre-installed
|
||||
- uses: actions/setup-java@v3
|
||||
- uses: actions/setup-java@v4
|
||||
with:
|
||||
distribution: 'temurin'
|
||||
java-version: '17'
|
||||
@@ -76,8 +76,8 @@ runs:
|
||||
rm -f ${ALLURE_ZIP}
|
||||
fi
|
||||
env:
|
||||
ALLURE_VERSION: 2.24.0
|
||||
ALLURE_ZIP_SHA256: 60b1d6ce65d9ef24b23cf9c2c19fd736a123487c38e54759f1ed1a7a77353c90
|
||||
ALLURE_VERSION: 2.27.0
|
||||
ALLURE_ZIP_SHA256: b071858fb2fa542c65d8f152c5c40d26267b2dfb74df1f1608a589ecca38e777
|
||||
|
||||
# Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this
|
||||
- name: Acquire lock
|
||||
@@ -180,7 +180,7 @@ runs:
|
||||
fi
|
||||
|
||||
- name: Cache poetry deps
|
||||
uses: actions/cache@v3
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
|
||||
@@ -215,7 +215,7 @@ runs:
|
||||
rm -rf ${WORKDIR}
|
||||
fi
|
||||
|
||||
- uses: actions/github-script@v6
|
||||
- uses: actions/github-script@v7
|
||||
if: always()
|
||||
env:
|
||||
REPORT_URL: ${{ steps.generate-report.outputs.report-url }}
|
||||
|
||||
@@ -19,7 +19,7 @@ runs:
|
||||
PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
|
||||
if [ "${PR_NUMBER}" != "null" ]; then
|
||||
BRANCH_OR_PR=pr-${PR_NUMBER}
|
||||
elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then
|
||||
elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then
|
||||
# Shortcut for special branches
|
||||
BRANCH_OR_PR=${GITHUB_REF_NAME}
|
||||
else
|
||||
|
||||
@@ -80,13 +80,13 @@ runs:
|
||||
|
||||
- name: Checkout
|
||||
if: inputs.needs_postgres_source == 'true'
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Cache poetry deps
|
||||
uses: actions/cache@v3
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
|
||||
|
||||
8
.github/workflows/actionlint.yml
vendored
8
.github/workflows/actionlint.yml
vendored
@@ -16,8 +16,14 @@ concurrency:
|
||||
cancel-in-progress: ${{ github.event_name == 'pull_request' }}
|
||||
|
||||
jobs:
|
||||
actionlint:
|
||||
check-permissions:
|
||||
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
|
||||
uses: ./.github/workflows/check-permissions.yml
|
||||
with:
|
||||
github-event-name: ${{ github.event_name}}
|
||||
|
||||
actionlint:
|
||||
needs: [ check-permissions ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
2
.github/workflows/approved-for-ci-run.yml
vendored
2
.github/workflows/approved-for-ci-run.yml
vendored
@@ -64,7 +64,7 @@ jobs:
|
||||
steps:
|
||||
- run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
|
||||
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
ref: main
|
||||
token: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
|
||||
10
.github/workflows/benchmarking.yml
vendored
10
.github/workflows/benchmarking.yml
vendored
@@ -66,7 +66,7 @@ jobs:
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
@@ -221,7 +221,7 @@ jobs:
|
||||
timeout-minutes: 480
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
@@ -366,7 +366,7 @@ jobs:
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
@@ -465,7 +465,7 @@ jobs:
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
@@ -562,7 +562,7 @@ jobs:
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
|
||||
379
.github/workflows/build_and_test.yml
vendored
379
.github/workflows/build_and_test.yml
vendored
@@ -5,6 +5,7 @@ on:
|
||||
branches:
|
||||
- main
|
||||
- release
|
||||
- release-proxy
|
||||
pull_request:
|
||||
|
||||
defaults:
|
||||
@@ -27,24 +28,9 @@ env:
|
||||
jobs:
|
||||
check-permissions:
|
||||
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Disallow PRs from forks
|
||||
if: |
|
||||
github.event_name == 'pull_request' &&
|
||||
github.event.pull_request.head.repo.full_name != github.repository
|
||||
|
||||
run: |
|
||||
if [ "${{ contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association) }}" = "true" ]; then
|
||||
MESSAGE="Please create a PR from a branch of ${GITHUB_REPOSITORY} instead of a fork"
|
||||
else
|
||||
MESSAGE="The PR should be reviewed and labelled with 'approved-for-ci-run' to trigger a CI run"
|
||||
fi
|
||||
|
||||
echo >&2 "We don't run CI for PRs from forks"
|
||||
echo >&2 "${MESSAGE}"
|
||||
|
||||
exit 1
|
||||
uses: ./.github/workflows/check-permissions.yml
|
||||
with:
|
||||
github-event-name: ${{ github.event_name}}
|
||||
|
||||
cancel-previous-e2e-tests:
|
||||
needs: [ check-permissions ]
|
||||
@@ -69,7 +55,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -82,6 +68,8 @@ jobs:
|
||||
echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
|
||||
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
|
||||
echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
|
||||
echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT
|
||||
@@ -106,13 +94,13 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: false
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Cache poetry deps
|
||||
uses: actions/cache@v3
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
|
||||
@@ -138,7 +126,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
@@ -146,7 +134,7 @@ jobs:
|
||||
# Disabled for now
|
||||
# - name: Restore cargo deps cache
|
||||
# id: cache_cargo
|
||||
# uses: actions/cache@v3
|
||||
# uses: actions/cache@v4
|
||||
# with:
|
||||
# path: |
|
||||
# !~/.cargo/registry/src
|
||||
@@ -231,7 +219,7 @@ jobs:
|
||||
done
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
@@ -253,7 +241,7 @@ jobs:
|
||||
done
|
||||
|
||||
if [ "${FAILED}" = "true" ]; then
|
||||
echo >&2 "Please update vendors/revisions.json if these changes are intentional"
|
||||
echo >&2 "Please update vendor/revisions.json if these changes are intentional"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
@@ -303,7 +291,7 @@ jobs:
|
||||
# compressed crates.
|
||||
# - name: Cache cargo deps
|
||||
# id: cache_cargo
|
||||
# uses: actions/cache@v3
|
||||
# uses: actions/cache@v4
|
||||
# with:
|
||||
# path: |
|
||||
# ~/.cargo/registry/
|
||||
@@ -317,21 +305,21 @@ jobs:
|
||||
|
||||
- name: Cache postgres v14 build
|
||||
id: cache_pg_14
|
||||
uses: actions/cache@v3
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v14
|
||||
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Cache postgres v15 build
|
||||
id: cache_pg_15
|
||||
uses: actions/cache@v3
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v15
|
||||
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Cache postgres v16 build
|
||||
id: cache_pg_16
|
||||
uses: actions/cache@v3
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v16
|
||||
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
@@ -451,7 +439,7 @@ jobs:
|
||||
pg_version: [ v14, v15, v16 ]
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
@@ -472,9 +460,14 @@ jobs:
|
||||
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
|
||||
BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
|
||||
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
|
||||
PAGESERVER_GET_VECTORED_IMPL: vectored
|
||||
|
||||
# Temporary disable this step until we figure out why it's so flaky
|
||||
# Ref https://github.com/neondatabase/neon/issues/4540
|
||||
- name: Merge and upload coverage data
|
||||
if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'
|
||||
if: |
|
||||
false &&
|
||||
matrix.build_type == 'debug' && matrix.pg_version == 'v14'
|
||||
uses: ./.github/actions/save-coverage-data
|
||||
|
||||
get-benchmarks-durations:
|
||||
@@ -488,10 +481,10 @@ jobs:
|
||||
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Cache poetry deps
|
||||
uses: actions/cache@v3
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
|
||||
@@ -525,7 +518,7 @@ jobs:
|
||||
build_type: [ release ]
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Pytest benchmarks
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
@@ -554,7 +547,7 @@ jobs:
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Create Allure report
|
||||
if: ${{ !cancelled() }}
|
||||
@@ -565,7 +558,7 @@ jobs:
|
||||
env:
|
||||
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
||||
|
||||
- uses: actions/github-script@v6
|
||||
- uses: actions/github-script@v7
|
||||
if: ${{ !cancelled() }}
|
||||
with:
|
||||
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
|
||||
@@ -605,7 +598,7 @@ jobs:
|
||||
coverage-json: ${{ steps.upload-coverage-report-new.outputs.summary-json }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 0
|
||||
@@ -674,7 +667,7 @@ jobs:
|
||||
REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/lcov/summary.json
|
||||
echo "summary-json=${REPORT_URL}" >> $GITHUB_OUTPUT
|
||||
|
||||
- uses: actions/github-script@v6
|
||||
- uses: actions/github-script@v7
|
||||
env:
|
||||
REPORT_URL_NEW: ${{ steps.upload-coverage-report-new.outputs.report-url }}
|
||||
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
@@ -692,7 +685,7 @@ jobs:
|
||||
})
|
||||
|
||||
trigger-e2e-tests:
|
||||
if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' }}
|
||||
if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' }}
|
||||
needs: [ check-permissions, promote-images, tag ]
|
||||
uses: ./.github/workflows/trigger-e2e-tests.yml
|
||||
secrets: inherit
|
||||
@@ -700,158 +693,173 @@ jobs:
|
||||
neon-image:
|
||||
needs: [ check-permissions, build-buildtools-image, tag ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
container: gcr.io/kaniko-project/executor:v1.9.2-debug
|
||||
defaults:
|
||||
run:
|
||||
shell: sh -eu {0}
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v1 # v3 won't work with kaniko
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Configure ECR and Docker Hub login
|
||||
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
|
||||
# The default value is ~/.docker
|
||||
- name: Set custom docker config directory
|
||||
run: |
|
||||
DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
|
||||
echo "::add-mask::${DOCKERHUB_AUTH}"
|
||||
mkdir -p .docker-custom
|
||||
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
|
||||
- uses: docker/setup-buildx-action@v3
|
||||
|
||||
cat <<-EOF > /kaniko/.docker/config.json
|
||||
{
|
||||
"auths": {
|
||||
"https://index.docker.io/v1/": {
|
||||
"auth": "${DOCKERHUB_AUTH}"
|
||||
}
|
||||
},
|
||||
"credHelpers": {
|
||||
"369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
|
||||
}
|
||||
}
|
||||
EOF
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- name: Kaniko build neon
|
||||
run:
|
||||
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
|
||||
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
|
||||
--context .
|
||||
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
--build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }}
|
||||
--build-arg TAG=${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
|
||||
--destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
# Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
|
||||
- name: Cleanup ECR folder
|
||||
run: rm -rf ~/.ecr
|
||||
- uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
build-args: |
|
||||
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
|
||||
TAG=${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
provenance: false
|
||||
push: true
|
||||
pull: true
|
||||
file: Dockerfile
|
||||
cache-from: type=registry,ref=neondatabase/neon:cache
|
||||
cache-to: type=registry,ref=neondatabase/neon:cache,mode=max
|
||||
tags: |
|
||||
369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
|
||||
neondatabase/neon:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
- name: Remove custom docker config directory
|
||||
if: always()
|
||||
run: |
|
||||
rm -rf .docker-custom
|
||||
|
||||
compute-tools-image:
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
needs: [ check-permissions, build-buildtools-image, tag ]
|
||||
container: gcr.io/kaniko-project/executor:v1.9.2-debug
|
||||
defaults:
|
||||
run:
|
||||
shell: sh -eu {0}
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v1 # v3 won't work with kaniko
|
||||
|
||||
- name: Configure ECR and Docker Hub login
|
||||
run: |
|
||||
DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
|
||||
echo "::add-mask::${DOCKERHUB_AUTH}"
|
||||
|
||||
cat <<-EOF > /kaniko/.docker/config.json
|
||||
{
|
||||
"auths": {
|
||||
"https://index.docker.io/v1/": {
|
||||
"auth": "${DOCKERHUB_AUTH}"
|
||||
}
|
||||
},
|
||||
"credHelpers": {
|
||||
"369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
|
||||
}
|
||||
}
|
||||
EOF
|
||||
|
||||
- name: Kaniko build compute tools
|
||||
run:
|
||||
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
|
||||
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
|
||||
--context .
|
||||
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
|
||||
--build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
|
||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
--dockerfile Dockerfile.compute-tools
|
||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
|
||||
--destination neondatabase/compute-tools:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
# Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
|
||||
- name: Cleanup ECR folder
|
||||
run: rm -rf ~/.ecr
|
||||
|
||||
compute-node-image:
|
||||
needs: [ check-permissions, build-buildtools-image, tag ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
container:
|
||||
image: gcr.io/kaniko-project/executor:v1.9.2-debug
|
||||
# Workaround for "Resolving download.osgeo.org (download.osgeo.org)... failed: Temporary failure in name resolution.""
|
||||
# Should be prevented by https://github.com/neondatabase/neon/issues/4281
|
||||
options: --add-host=download.osgeo.org:140.211.15.30
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
version: [ v14, v15, v16 ]
|
||||
defaults:
|
||||
run:
|
||||
shell: sh -eu {0}
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v1 # v3 won't work with kaniko
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Configure ECR and Docker Hub login
|
||||
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
|
||||
# The default value is ~/.docker
|
||||
- name: Set custom docker config directory
|
||||
run: |
|
||||
DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
|
||||
echo "::add-mask::${DOCKERHUB_AUTH}"
|
||||
mkdir -p .docker-custom
|
||||
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
|
||||
- uses: docker/setup-buildx-action@v3
|
||||
|
||||
cat <<-EOF > /kaniko/.docker/config.json
|
||||
{
|
||||
"auths": {
|
||||
"https://index.docker.io/v1/": {
|
||||
"auth": "${DOCKERHUB_AUTH}"
|
||||
}
|
||||
},
|
||||
"credHelpers": {
|
||||
"369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
|
||||
}
|
||||
}
|
||||
EOF
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- name: Kaniko build compute node with extensions
|
||||
run:
|
||||
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
|
||||
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
|
||||
--context .
|
||||
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
--build-arg PG_VERSION=${{ matrix.version }}
|
||||
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
|
||||
--build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
|
||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
--dockerfile Dockerfile.compute-node
|
||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
--destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
--cleanup
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
# Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
|
||||
- name: Cleanup ECR folder
|
||||
run: rm -rf ~/.ecr
|
||||
- uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
build-args: |
|
||||
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
BUILD_TAG=${{needs.tag.outputs.build-tag}}
|
||||
TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
|
||||
REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
provenance: false
|
||||
push: true
|
||||
pull: true
|
||||
file: Dockerfile.compute-tools
|
||||
cache-from: type=registry,ref=neondatabase/compute-tools:cache
|
||||
cache-to: type=registry,ref=neondatabase/compute-tools:cache,mode=max
|
||||
tags: |
|
||||
369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
|
||||
neondatabase/compute-tools:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
- name: Remove custom docker config directory
|
||||
if: always()
|
||||
run: |
|
||||
rm -rf .docker-custom
|
||||
|
||||
compute-node-image:
|
||||
needs: [ check-permissions, build-buildtools-image, tag ]
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
version: [ v14, v15, v16 ]
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 0
|
||||
|
||||
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
|
||||
# The default value is ~/.docker
|
||||
- name: Set custom docker config directory
|
||||
run: |
|
||||
mkdir -p .docker-custom
|
||||
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
|
||||
- uses: docker/setup-buildx-action@v3
|
||||
with:
|
||||
# Disable parallelism for docker buildkit.
|
||||
# As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner.
|
||||
config-inline: |
|
||||
[worker.oci]
|
||||
max-parallelism = 1
|
||||
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
- uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
build-args: |
|
||||
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
|
||||
PG_VERSION=${{ matrix.version }}
|
||||
BUILD_TAG=${{needs.tag.outputs.build-tag}}
|
||||
TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
|
||||
REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
provenance: false
|
||||
push: true
|
||||
pull: true
|
||||
file: Dockerfile.compute-node
|
||||
cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache
|
||||
cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache,mode=max
|
||||
tags: |
|
||||
369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
- name: Remove custom docker config directory
|
||||
if: always()
|
||||
run: |
|
||||
rm -rf .docker-custom
|
||||
|
||||
vm-compute-node-image:
|
||||
needs: [ check-permissions, tag, compute-node-image ]
|
||||
@@ -900,7 +908,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -962,9 +970,7 @@ jobs:
|
||||
crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} vm-compute-node-v16
|
||||
|
||||
- name: Add latest tag to images
|
||||
if: |
|
||||
(github.ref_name == 'main' || github.ref_name == 'release') &&
|
||||
github.event_name != 'workflow_dispatch'
|
||||
if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy'
|
||||
run: |
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
|
||||
@@ -976,9 +982,7 @@ jobs:
|
||||
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest
|
||||
|
||||
- name: Push images to production ECR
|
||||
if: |
|
||||
(github.ref_name == 'main' || github.ref_name == 'release') &&
|
||||
github.event_name != 'workflow_dispatch'
|
||||
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
|
||||
run: |
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
|
||||
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
|
||||
@@ -1002,9 +1006,7 @@ jobs:
|
||||
crane push vm-compute-node-v16 neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
- name: Push latest tags to Docker Hub
|
||||
if: |
|
||||
(github.ref_name == 'main' || github.ref_name == 'release') &&
|
||||
github.event_name != 'workflow_dispatch'
|
||||
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
|
||||
run: |
|
||||
crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
|
||||
crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
|
||||
@@ -1094,7 +1096,7 @@ jobs:
|
||||
|
||||
deploy:
|
||||
needs: [ check-permissions, promote-images, tag, regress-tests, trigger-custom-extensions-build-and-wait ]
|
||||
if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
|
||||
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
|
||||
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
|
||||
@@ -1114,7 +1116,7 @@ jobs:
|
||||
done
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: false
|
||||
fetch-depth: 0
|
||||
@@ -1129,15 +1131,29 @@ jobs:
|
||||
# TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging support different compute tag prefixes for different regions
|
||||
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}}
|
||||
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
|
||||
-f deployPgSniRouter=false \
|
||||
-f deployProxy=false \
|
||||
-f deployStorage=true \
|
||||
-f deployStorageBroker=true \
|
||||
-f branch=main \
|
||||
-f dockerTag=${{needs.tag.outputs.build-tag}}
|
||||
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
|
||||
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
|
||||
-f deployPgSniRouter=true \
|
||||
-f deployProxy=true \
|
||||
-f deployStorage=false \
|
||||
-f deployStorageBroker=false \
|
||||
-f branch=main \
|
||||
-f dockerTag=${{needs.tag.outputs.build-tag}}
|
||||
else
|
||||
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Create git tag
|
||||
if: github.ref_name == 'release'
|
||||
uses: actions/github-script@v6
|
||||
if: github.ref_name == 'release' || github.ref_name == 'release-proxy'
|
||||
uses: actions/github-script@v7
|
||||
with:
|
||||
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
|
||||
retries: 5
|
||||
@@ -1149,9 +1165,10 @@ jobs:
|
||||
sha: context.sha,
|
||||
})
|
||||
|
||||
# TODO: check how GitHub releases looks for proxy releases and enable it if it's ok
|
||||
- name: Create GitHub release
|
||||
if: github.ref_name == 'release'
|
||||
uses: actions/github-script@v6
|
||||
uses: actions/github-script@v7
|
||||
with:
|
||||
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
|
||||
retries: 5
|
||||
|
||||
36
.github/workflows/check-permissions.yml
vendored
Normal file
36
.github/workflows/check-permissions.yml
vendored
Normal file
@@ -0,0 +1,36 @@
|
||||
name: Check Permissions
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
github-event-name:
|
||||
required: true
|
||||
type: string
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euo pipefail {0}
|
||||
|
||||
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
check-permissions:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Disallow CI runs on PRs from forks
|
||||
if: |
|
||||
inputs.github-event-name == 'pull_request' &&
|
||||
github.event.pull_request.head.repo.full_name != github.repository
|
||||
run: |
|
||||
if [ "${{ contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association) }}" = "true" ]; then
|
||||
MESSAGE="Please create a PR from a branch of ${GITHUB_REPOSITORY} instead of a fork"
|
||||
else
|
||||
MESSAGE="The PR should be reviewed and labelled with 'approved-for-ci-run' to trigger a CI run"
|
||||
fi
|
||||
|
||||
# TODO: use actions/github-script to post this message as a PR comment
|
||||
echo >&2 "We don't run CI for PRs from forks"
|
||||
echo >&2 "${MESSAGE}"
|
||||
|
||||
exit 1
|
||||
28
.github/workflows/neon_extra_builds.yml
vendored
28
.github/workflows/neon_extra_builds.yml
vendored
@@ -20,7 +20,14 @@ env:
|
||||
COPT: '-Werror'
|
||||
|
||||
jobs:
|
||||
check-permissions:
|
||||
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
|
||||
uses: ./.github/workflows/check-permissions.yml
|
||||
with:
|
||||
github-event-name: ${{ github.event_name}}
|
||||
|
||||
check-macos-build:
|
||||
needs: [ check-permissions ]
|
||||
if: |
|
||||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') ||
|
||||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
|
||||
@@ -57,21 +64,21 @@ jobs:
|
||||
|
||||
- name: Cache postgres v14 build
|
||||
id: cache_pg_14
|
||||
uses: actions/cache@v3
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v14
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Cache postgres v15 build
|
||||
id: cache_pg_15
|
||||
uses: actions/cache@v3
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v15
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Cache postgres v16 build
|
||||
id: cache_pg_16
|
||||
uses: actions/cache@v3
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v16
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
@@ -82,7 +89,7 @@ jobs:
|
||||
echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV
|
||||
|
||||
- name: Cache cargo deps
|
||||
uses: actions/cache@v3
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: |
|
||||
~/.cargo/registry
|
||||
@@ -116,8 +123,8 @@ jobs:
|
||||
run: ./run_clippy.sh
|
||||
|
||||
check-linux-arm-build:
|
||||
needs: [ check-permissions ]
|
||||
timeout-minutes: 90
|
||||
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
|
||||
runs-on: [ self-hosted, dev, arm64 ]
|
||||
|
||||
env:
|
||||
@@ -172,21 +179,21 @@ jobs:
|
||||
|
||||
- name: Cache postgres v14 build
|
||||
id: cache_pg_14
|
||||
uses: actions/cache@v3
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v14
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Cache postgres v15 build
|
||||
id: cache_pg_15
|
||||
uses: actions/cache@v3
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v15
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Cache postgres v16 build
|
||||
id: cache_pg_16
|
||||
uses: actions/cache@v3
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v16
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
@@ -237,8 +244,8 @@ jobs:
|
||||
cargo nextest run --package remote_storage --test test_real_azure
|
||||
|
||||
check-codestyle-rust-arm:
|
||||
needs: [ check-permissions ]
|
||||
timeout-minutes: 90
|
||||
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
|
||||
runs-on: [ self-hosted, dev, arm64 ]
|
||||
|
||||
container:
|
||||
@@ -309,6 +316,7 @@ jobs:
|
||||
run: cargo deny check
|
||||
|
||||
gather-rust-build-stats:
|
||||
needs: [ check-permissions ]
|
||||
if: |
|
||||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
|
||||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
|
||||
@@ -356,7 +364,7 @@ jobs:
|
||||
echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Publish build stats report
|
||||
uses: actions/github-script@v6
|
||||
uses: actions/github-script@v7
|
||||
env:
|
||||
REPORT_URL: ${{ steps.upload-stats.outputs.report-url }}
|
||||
SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
|
||||
6
.github/workflows/pg_clients.yml
vendored
6
.github/workflows/pg_clients.yml
vendored
@@ -28,7 +28,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- uses: actions/setup-python@v4
|
||||
with:
|
||||
@@ -38,7 +38,7 @@ jobs:
|
||||
uses: snok/install-poetry@v1
|
||||
|
||||
- name: Cache poetry deps
|
||||
uses: actions/cache@v3
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: v2-${{ runner.os }}-python-deps-ubunutu-latest-${{ hashFiles('poetry.lock') }}
|
||||
@@ -82,7 +82,7 @@ jobs:
|
||||
# It will be fixed after switching to gen2 runner
|
||||
- name: Upload python test logs
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v3
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
retention-days: 7
|
||||
name: python-test-pg_clients-${{ runner.os }}-stage-logs
|
||||
|
||||
83
.github/workflows/release.yml
vendored
83
.github/workflows/release.yml
vendored
@@ -2,12 +2,31 @@ name: Create Release Branch
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: '0 6 * * 1'
|
||||
# It should be kept in sync with if-condition in jobs
|
||||
- cron: '0 6 * * MON' # Storage release
|
||||
- cron: '0 6 * * THU' # Proxy release
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
create-storage-release-branch:
|
||||
type: boolean
|
||||
description: 'Create Storage release PR'
|
||||
required: false
|
||||
create-proxy-release-branch:
|
||||
type: boolean
|
||||
description: 'Create Proxy release PR'
|
||||
required: false
|
||||
|
||||
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
|
||||
permissions: {}
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euo pipefail {0}
|
||||
|
||||
jobs:
|
||||
create_release_branch:
|
||||
runs-on: [ ubuntu-latest ]
|
||||
create-storage-release-branch:
|
||||
if: ${{ github.event.schedule == '0 6 * * MON' || format('{0}', inputs.create-storage-release-branch) == 'true' }}
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
permissions:
|
||||
contents: write # for `git push`
|
||||
@@ -18,27 +37,67 @@ jobs:
|
||||
with:
|
||||
ref: main
|
||||
|
||||
- name: Get current date
|
||||
id: date
|
||||
run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT
|
||||
- name: Set environment variables
|
||||
run: |
|
||||
echo "RELEASE_DATE=$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
|
||||
echo "RELEASE_BRANCH=rc/$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
|
||||
|
||||
- name: Create release branch
|
||||
run: git checkout -b releases/${{ steps.date.outputs.date }}
|
||||
run: git checkout -b $RELEASE_BRANCH
|
||||
|
||||
- name: Push new branch
|
||||
run: git push origin releases/${{ steps.date.outputs.date }}
|
||||
run: git push origin $RELEASE_BRANCH
|
||||
|
||||
- name: Create pull request into release
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
run: |
|
||||
cat << EOF > body.md
|
||||
## Release ${{ steps.date.outputs.date }}
|
||||
## Release ${RELEASE_DATE}
|
||||
|
||||
**Please merge this PR using 'Create a merge commit'!**
|
||||
**Please merge this Pull Request using 'Create a merge commit' button**
|
||||
EOF
|
||||
|
||||
gh pr create --title "Release ${{ steps.date.outputs.date }}" \
|
||||
gh pr create --title "Release ${RELEASE_DATE}" \
|
||||
--body-file "body.md" \
|
||||
--head "releases/${{ steps.date.outputs.date }}" \
|
||||
--head "${RELEASE_BRANCH}" \
|
||||
--base "release"
|
||||
|
||||
create-proxy-release-branch:
|
||||
if: ${{ github.event.schedule == '0 6 * * THU' || format('{0}', inputs.create-proxy-release-branch) == 'true' }}
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
permissions:
|
||||
contents: write # for `git push`
|
||||
|
||||
steps:
|
||||
- name: Check out code
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: main
|
||||
|
||||
- name: Set environment variables
|
||||
run: |
|
||||
echo "RELEASE_DATE=$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
|
||||
echo "RELEASE_BRANCH=rc/proxy/$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
|
||||
|
||||
- name: Create release branch
|
||||
run: git checkout -b $RELEASE_BRANCH
|
||||
|
||||
- name: Push new branch
|
||||
run: git push origin $RELEASE_BRANCH
|
||||
|
||||
- name: Create pull request into release
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
run: |
|
||||
cat << EOF > body.md
|
||||
## Proxy release ${RELEASE_DATE}
|
||||
|
||||
**Please merge this Pull Request using 'Create a merge commit' button**
|
||||
EOF
|
||||
|
||||
gh pr create --title "Proxy release ${RELEASE_DATE}}" \
|
||||
--body-file "body.md" \
|
||||
--head "${RELEASE_BRANCH}" \
|
||||
--base "release-proxy"
|
||||
|
||||
7
.github/workflows/trigger-e2e-tests.yml
vendored
7
.github/workflows/trigger-e2e-tests.yml
vendored
@@ -9,7 +9,7 @@ on:
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
|
||||
env:
|
||||
# A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
|
||||
E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
|
||||
@@ -37,7 +37,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -51,6 +51,8 @@ jobs:
|
||||
echo "tag=$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
echo "tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
|
||||
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
|
||||
echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
|
||||
BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId')
|
||||
@@ -115,4 +117,3 @@ jobs:
|
||||
\"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
|
||||
}
|
||||
}"
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
/compute_tools/ @neondatabase/control-plane @neondatabase/compute
|
||||
/control_plane/ @neondatabase/compute @neondatabase/storage
|
||||
/libs/pageserver_api/ @neondatabase/compute @neondatabase/storage
|
||||
/control_plane/attachment_service @neondatabase/storage
|
||||
/libs/pageserver_api/ @neondatabase/storage
|
||||
/libs/postgres_ffi/ @neondatabase/compute
|
||||
/libs/remote_storage/ @neondatabase/storage
|
||||
/libs/safekeeper_api/ @neondatabase/safekeepers
|
||||
/libs/vm_monitor/ @neondatabase/autoscaling @neondatabase/compute
|
||||
/libs/vm_monitor/ @neondatabase/autoscaling
|
||||
/pageserver/ @neondatabase/storage
|
||||
/pgxn/ @neondatabase/compute
|
||||
/proxy/ @neondatabase/proxy
|
||||
|
||||
21
Cargo.lock
generated
21
Cargo.lock
generated
@@ -284,8 +284,10 @@ dependencies = [
|
||||
"diesel_migrations",
|
||||
"futures",
|
||||
"git-version",
|
||||
"humantime",
|
||||
"hyper",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"pageserver_api",
|
||||
"pageserver_client",
|
||||
"postgres_connection",
|
||||
@@ -1813,6 +1815,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e875f1719c16de097dee81ed675e2d9bb63096823ed3f0ca827b7dea3028bbbb"
|
||||
dependencies = [
|
||||
"enumset_derive",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2757,6 +2760,17 @@ version = "1.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
|
||||
|
||||
[[package]]
|
||||
name = "leaky-bucket"
|
||||
version = "1.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8eb491abd89e9794d50f93c8db610a29509123e3fbbc9c8c67a528e9391cd853"
|
||||
dependencies = [
|
||||
"parking_lot 0.12.1",
|
||||
"tokio",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.150"
|
||||
@@ -3448,6 +3462,7 @@ name = "pageserver"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"arc-swap",
|
||||
"async-compression",
|
||||
"async-stream",
|
||||
"async-trait",
|
||||
@@ -3475,6 +3490,7 @@ dependencies = [
|
||||
"humantime-serde",
|
||||
"hyper",
|
||||
"itertools",
|
||||
"leaky-bucket",
|
||||
"md5",
|
||||
"metrics",
|
||||
"nix 0.27.1",
|
||||
@@ -3536,7 +3552,9 @@ dependencies = [
|
||||
"const_format",
|
||||
"enum-map",
|
||||
"hex",
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"itertools",
|
||||
"postgres_ffi",
|
||||
"rand 0.8.5",
|
||||
"serde",
|
||||
@@ -4436,6 +4454,7 @@ dependencies = [
|
||||
"futures",
|
||||
"futures-util",
|
||||
"http-types",
|
||||
"humantime",
|
||||
"hyper",
|
||||
"itertools",
|
||||
"metrics",
|
||||
@@ -4447,6 +4466,7 @@ dependencies = [
|
||||
"serde_json",
|
||||
"test-context",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"toml_edit",
|
||||
"tracing",
|
||||
@@ -6345,6 +6365,7 @@ dependencies = [
|
||||
"hex-literal",
|
||||
"hyper",
|
||||
"jsonwebtoken",
|
||||
"leaky-bucket",
|
||||
"metrics",
|
||||
"nix 0.27.1",
|
||||
"once_cell",
|
||||
|
||||
@@ -97,6 +97,7 @@ ipnet = "2.9.0"
|
||||
itertools = "0.10"
|
||||
jsonwebtoken = "9"
|
||||
lasso = "0.7"
|
||||
leaky-bucket = "1.0.1"
|
||||
libc = "0.2"
|
||||
md5 = "0.7.0"
|
||||
memoffset = "0.8"
|
||||
|
||||
@@ -47,7 +47,7 @@ COPY --chown=nonroot . .
|
||||
# Show build caching stats to check if it was used in the end.
|
||||
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
|
||||
RUN set -e \
|
||||
&& mold -run cargo build \
|
||||
&& RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \
|
||||
--bin pg_sni_router \
|
||||
--bin pageserver \
|
||||
--bin pagectl \
|
||||
|
||||
@@ -769,6 +769,40 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg_ivm"
|
||||
# compile pg_ivm extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-ivm-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \
|
||||
echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_ivm-src && cd pg_ivm-src && tar xvzf ../pg_ivm.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_ivm.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg_partman"
|
||||
# compile pg_partman extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-partman-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \
|
||||
echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_partman-src && cd pg_partman-src && tar xvzf ../pg_partman.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "neon-pg-ext-build"
|
||||
@@ -810,6 +844,8 @@ COPY --from=pg-semver-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
|
||||
COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY pgxn/ pgxn/
|
||||
|
||||
RUN make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
@@ -820,6 +856,10 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
|
||||
-C pgxn/neon_utils \
|
||||
-s install && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
|
||||
-C pgxn/neon_test_utils \
|
||||
-s install && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
|
||||
-C pgxn/neon_rmgr \
|
||||
|
||||
16
Makefile
16
Makefile
@@ -159,8 +159,8 @@ neon-pg-ext-%: postgres-%
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install
|
||||
|
||||
.PHONY: neon-pg-ext-clean-%
|
||||
neon-pg-ext-clean-%:
|
||||
.PHONY: neon-pg-clean-ext-%
|
||||
neon-pg-clean-ext-%:
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-$* \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
|
||||
@@ -216,11 +216,11 @@ neon-pg-ext: \
|
||||
neon-pg-ext-v15 \
|
||||
neon-pg-ext-v16
|
||||
|
||||
.PHONY: neon-pg-ext-clean
|
||||
neon-pg-ext-clean: \
|
||||
neon-pg-ext-clean-v14 \
|
||||
neon-pg-ext-clean-v15 \
|
||||
neon-pg-ext-clean-v16
|
||||
.PHONY: neon-pg-clean-ext
|
||||
neon-pg-clean-ext: \
|
||||
neon-pg-clean-ext-v14 \
|
||||
neon-pg-clean-ext-v15 \
|
||||
neon-pg-clean-ext-v16
|
||||
|
||||
# shorthand to build all Postgres versions
|
||||
.PHONY: postgres
|
||||
@@ -249,7 +249,7 @@ postgres-check: \
|
||||
|
||||
# This doesn't remove the effects of 'configure'.
|
||||
.PHONY: clean
|
||||
clean: postgres-clean neon-pg-ext-clean
|
||||
clean: postgres-clean neon-pg-clean-ext
|
||||
$(CARGO_CMD_PREFIX) cargo clean
|
||||
|
||||
# This removes everything
|
||||
|
||||
12
README.md
12
README.md
@@ -5,7 +5,7 @@
|
||||
Neon is a serverless open-source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes the PostgreSQL storage layer by redistributing data across a cluster of nodes.
|
||||
|
||||
## Quick start
|
||||
Try the [Neon Free Tier](https://neon.tech/docs/introduction/technical-preview-free-tier/) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions.
|
||||
Try the [Neon Free Tier](https://neon.tech) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions.
|
||||
|
||||
Alternatively, compile and run the project [locally](#running-local-installation).
|
||||
|
||||
@@ -249,6 +249,16 @@ testing locally, it is convenient to run just one set of permutations, like this
|
||||
DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest
|
||||
```
|
||||
|
||||
## Flamegraphs
|
||||
|
||||
You may find yourself in need of flamegraphs for software in this repository.
|
||||
You can use [`flamegraph-rs`](https://github.com/flamegraph-rs/flamegraph) or the original [`flamegraph.pl`](https://github.com/brendangregg/FlameGraph). Your choice!
|
||||
|
||||
>[!IMPORTANT]
|
||||
> If you're using `lld` or `mold`, you need the `--no-rosegment` linker argument.
|
||||
> It's a [general thing with Rust / lld / mold](https://crbug.com/919499#c16), not specific to this repository.
|
||||
> See [this PR for further instructions](https://github.com/neondatabase/neon/pull/6764).
|
||||
|
||||
## Documentation
|
||||
|
||||
[docs](/docs) Contains a top-level overview of all available markdown documentation.
|
||||
|
||||
@@ -45,7 +45,6 @@ use std::{thread, time::Duration};
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::Utc;
|
||||
use clap::Arg;
|
||||
use nix::sys::signal::{kill, Signal};
|
||||
use signal_hook::consts::{SIGQUIT, SIGTERM};
|
||||
use signal_hook::{consts::SIGINT, iterator::Signals};
|
||||
use tracing::{error, info};
|
||||
@@ -53,7 +52,9 @@ use url::Url;
|
||||
|
||||
use compute_api::responses::ComputeStatus;
|
||||
|
||||
use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec, PG_PID, SYNC_SAFEKEEPERS_PID};
|
||||
use compute_tools::compute::{
|
||||
forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID,
|
||||
};
|
||||
use compute_tools::configurator::launch_configurator;
|
||||
use compute_tools::extension_server::get_pg_version;
|
||||
use compute_tools::http::api::launch_http_server;
|
||||
@@ -394,6 +395,15 @@ fn main() -> Result<()> {
|
||||
info!("synced safekeepers at lsn {lsn}");
|
||||
}
|
||||
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
if state.status == ComputeStatus::TerminationPending {
|
||||
state.status = ComputeStatus::Terminated;
|
||||
compute.state_changed.notify_all();
|
||||
// we were asked to terminate gracefully, don't exit to avoid restart
|
||||
delay_exit = true
|
||||
}
|
||||
drop(state);
|
||||
|
||||
if let Err(err) = compute.check_for_core_dumps() {
|
||||
error!("error while checking for core dumps: {err:?}");
|
||||
}
|
||||
@@ -523,16 +533,7 @@ fn cli() -> clap::Command {
|
||||
/// wait for termination which would be easy then.
|
||||
fn handle_exit_signal(sig: i32) {
|
||||
info!("received {sig} termination signal");
|
||||
let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst);
|
||||
if ss_pid != 0 {
|
||||
let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32);
|
||||
kill(ss_pid, Signal::SIGTERM).ok();
|
||||
}
|
||||
let pg_pid = PG_PID.load(Ordering::SeqCst);
|
||||
if pg_pid != 0 {
|
||||
let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
|
||||
kill(pg_pid, Signal::SIGTERM).ok();
|
||||
}
|
||||
forward_termination_signal();
|
||||
exit(1);
|
||||
}
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@ use std::collections::HashMap;
|
||||
use std::env;
|
||||
use std::fs;
|
||||
use std::io::BufRead;
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
use std::os::unix::fs::{symlink, PermissionsExt};
|
||||
use std::path::Path;
|
||||
use std::process::{Command, Stdio};
|
||||
use std::str::FromStr;
|
||||
@@ -28,6 +28,8 @@ use compute_api::responses::{ComputeMetrics, ComputeStatus};
|
||||
use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec};
|
||||
use utils::measured_stream::MeasuredReader;
|
||||
|
||||
use nix::sys::signal::{kill, Signal};
|
||||
|
||||
use remote_storage::{DownloadError, RemotePath};
|
||||
|
||||
use crate::checker::create_availability_check_data;
|
||||
@@ -324,7 +326,8 @@ impl ComputeNode {
|
||||
let spec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
let start_time = Instant::now();
|
||||
|
||||
let mut config = postgres::Config::from_str(&spec.pageserver_connstr)?;
|
||||
let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
|
||||
let mut config = postgres::Config::from_str(shard0_connstr)?;
|
||||
|
||||
// Use the storage auth token from the config file, if given.
|
||||
// Note: this overrides any password set in the connection string.
|
||||
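The change above makes the basebackup connection target shard 0 only: for a sharded tenant, `pageserver_connstr` is assumed to be a comma-separated list of per-shard connection strings, and the first entry is shard 0. A minimal sketch of that convention (values are made up):

```rust
// Sketch only: illustrates the comma-separated connstring convention assumed above.
fn main() {
    let pageserver_connstr = "postgresql://ps-1:6400,postgresql://ps-2:6400";
    let shard0_connstr = pageserver_connstr.split(',').next().unwrap();
    assert_eq!(shard0_connstr, "postgresql://ps-1:6400");
    println!("basebackup would be requested from {shard0_connstr}");
}
```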
@@ -634,6 +637,48 @@ impl ComputeNode {
|
||||
// Update pg_hba.conf received with basebackup.
|
||||
update_pg_hba(pgdata_path)?;
|
||||
|
||||
// Place pg_dynshmem under /dev/shm. This allows us to use
|
||||
// 'dynamic_shared_memory_type = mmap' so that the files are placed in
|
||||
// /dev/shm, similar to how 'dynamic_shared_memory_type = posix' works.
|
||||
//
|
||||
// Why on earth don't we just stick to the 'posix' default, you might
|
||||
// ask. It turns out that making large allocations with 'posix' doesn't
|
||||
// work very well with autoscaling. The behavior we want is that:
|
||||
//
|
||||
// 1. You can make large DSM allocations, larger than the current RAM
|
||||
// size of the VM, without errors
|
||||
//
|
||||
// 2. If the allocated memory is really used, the VM is scaled up
|
||||
// automatically to accommodate that
|
||||
//
|
||||
// We try to make that possible by having swap in the VM. But with the
|
||||
// default 'posix' DSM implementation, we fail step 1, even when there's
|
||||
// plenty of swap available. PostgreSQL uses posix_fallocate() to create
|
||||
// the shmem segment, which is really just a file in /dev/shm in Linux,
|
||||
// but posix_fallocate() on tmpfs returns ENOMEM if the size is larger
|
||||
// than available RAM.
|
||||
//
|
||||
// Using 'dynamic_shared_memory_type = mmap' works around that, because
|
||||
// the Postgres 'mmap' DSM implementation doesn't use
|
||||
// posix_fallocate(). Instead, it uses repeated calls to write(2) to
|
||||
// fill the file with zeros. It's weird that that differs between
|
||||
// 'posix' and 'mmap', but we take advantage of it. When the file is
|
||||
// filled slowly with write(2), the kernel allows it to grow larger, as
|
||||
// long as there's swap available.
|
||||
//
|
||||
// In short, using 'dynamic_shared_memory_type = mmap' allows one DSM
// segment to be larger than the currently available RAM. And because we
// don't want to store it in a real file, which the kernel would try to
// flush to disk, we symlink pg_dynshmem to /dev/shm.
|
||||
//
|
||||
// We don't set 'dynamic_shared_memory_type = mmap' here, we let the
|
||||
// control plane control that option. If 'mmap' is not used, this
|
||||
// symlink doesn't affect anything.
|
||||
//
|
||||
// See https://github.com/neondatabase/autoscaling/issues/800
|
||||
std::fs::remove_dir(pgdata_path.join("pg_dynshmem"))?;
|
||||
symlink("/dev/shm/", pgdata_path.join("pg_dynshmem"))?;
|
||||
|
||||
match spec.mode {
|
||||
ComputeMode::Primary => {}
|
||||
ComputeMode::Replica | ComputeMode::Static(..) => {
|
||||
@@ -1279,3 +1324,17 @@ LIMIT 100",
|
||||
Ok(remote_ext_metrics)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn forward_termination_signal() {
|
||||
let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst);
|
||||
if ss_pid != 0 {
|
||||
let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32);
|
||||
kill(ss_pid, Signal::SIGTERM).ok();
|
||||
}
|
||||
let pg_pid = PG_PID.load(Ordering::SeqCst);
|
||||
if pg_pid != 0 {
|
||||
let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
|
||||
// use 'immediate' shutdown (SIGQUIT): https://www.postgresql.org/docs/current/server-shutdown.html
|
||||
kill(pg_pid, Signal::SIGQUIT).ok();
|
||||
}
|
||||
}
|
||||
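For orientation, `PG_PID` and `SYNC_SAFEKEEPERS_PID` are assumed to be atomic statics that the process-spawning code fills in once the child processes are started; `forward_termination_signal` then reads them back. A rough sketch of the registration side (everything except the two statics is hypothetical):

```rust
use std::process::{Child, Command};
use std::sync::atomic::{AtomicU32, Ordering};

// Assumed shape of the statics referenced by forward_termination_signal().
pub static PG_PID: AtomicU32 = AtomicU32::new(0);
pub static SYNC_SAFEKEEPERS_PID: AtomicU32 = AtomicU32::new(0);

// Hypothetical helper: remember the Postgres child's pid so that a later
// termination request can forward a signal to it.
fn spawn_postgres(pgbin: &str, pgdata: &str) -> std::io::Result<Child> {
    let child = Command::new(pgbin).args(["-D", pgdata]).spawn()?;
    PG_PID.store(child.id(), Ordering::SeqCst);
    Ok(child)
}
```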
|
||||
@@ -51,6 +51,9 @@ pub fn write_postgres_conf(
|
||||
if let Some(s) = &spec.pageserver_connstring {
|
||||
writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
|
||||
}
|
||||
if let Some(stripe_size) = spec.shard_stripe_size {
|
||||
writeln!(file, "neon.stripe_size={stripe_size}")?;
|
||||
}
|
||||
if !spec.safekeeper_connstrings.is_empty() {
|
||||
writeln!(
|
||||
file,
|
||||
@@ -79,6 +82,12 @@ pub fn write_postgres_conf(
|
||||
ComputeMode::Replica => {
|
||||
// hot_standby is 'on' by default, but let's be explicit
|
||||
writeln!(file, "hot_standby=on")?;
|
||||
|
||||
// Inform the replica about the primary state
|
||||
// Default is 'false'
|
||||
if let Some(primary_is_running) = spec.primary_is_running {
|
||||
writeln!(file, "neon.primary_is_running={}", primary_is_running)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@ use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
|
||||
use crate::compute::forward_termination_signal;
|
||||
use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
|
||||
use compute_api::requests::ConfigurationRequest;
|
||||
use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError};
|
||||
@@ -123,6 +124,17 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
|
||||
}
|
||||
}
|
||||
|
||||
(&Method::POST, "/terminate") => {
|
||||
info!("serving /terminate POST request");
|
||||
match handle_terminate_request(compute).await {
|
||||
Ok(()) => Response::new(Body::empty()),
|
||||
Err((msg, code)) => {
|
||||
error!("error handling /terminate request: {msg}");
|
||||
render_json_error(&msg, code)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// download extension files from remote extension storage on demand
|
||||
(&Method::POST, route) if route.starts_with("/extension_server/") => {
|
||||
info!("serving {:?} POST request", route);
|
||||
@@ -297,6 +309,49 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
async fn handle_terminate_request(compute: &Arc<ComputeNode>) -> Result<(), (String, StatusCode)> {
|
||||
{
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
if state.status == ComputeStatus::Terminated {
|
||||
return Ok(());
|
||||
}
|
||||
if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running {
|
||||
let msg = format!(
|
||||
"invalid compute status for termination request: {:?}",
|
||||
state.status.clone()
|
||||
);
|
||||
return Err((msg, StatusCode::PRECONDITION_FAILED));
|
||||
}
|
||||
state.status = ComputeStatus::TerminationPending;
|
||||
compute.state_changed.notify_all();
|
||||
drop(state);
|
||||
}
|
||||
forward_termination_signal();
|
||||
info!("sent signal and notified waiters");
|
||||
|
||||
// Spawn a blocking thread to wait for compute to become Terminated.
// This is needed so that we don't block the main pool of workers and
// can keep serving other requests while this one waits for the
// compute to shut down.
|
||||
let c = compute.clone();
|
||||
task::spawn_blocking(move || {
|
||||
let mut state = c.state.lock().unwrap();
|
||||
while state.status != ComputeStatus::Terminated {
|
||||
state = c.state_changed.wait(state).unwrap();
|
||||
info!(
|
||||
"waiting for compute to become Terminated, current status: {:?}",
|
||||
state.status
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
.unwrap()?;
|
||||
info!("terminated Postgres");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Main Hyper HTTP server function that runs it and blocks waiting on it forever.
|
||||
#[tokio::main]
|
||||
async fn serve(port: u16, state: Arc<ComputeNode>) {
|
||||
|
||||
@@ -168,6 +168,29 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/GenericError"
|
||||
|
||||
/terminate:
|
||||
post:
|
||||
tags:
|
||||
- Terminate
|
||||
summary: Terminate Postgres and wait for it to exit
|
||||
description: ""
|
||||
operationId: terminate
|
||||
responses:
|
||||
200:
|
||||
description: Result
|
||||
412:
|
||||
description: "wrong state"
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/GenericError"
|
||||
500:
|
||||
description: "Unexpected error"
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/GenericError"
|
||||
|
||||
components:
|
||||
securitySchemes:
|
||||
JWT:
|
||||
|
||||
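As a usage illustration for the `/terminate` endpoint specified above, a client call could look roughly like this (the listen address and the use of `reqwest` are assumptions, not part of this PR):

```rust
// Hypothetical client: POST /terminate and interpret the documented status codes.
#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    let resp = reqwest::Client::new()
        .post("http://127.0.0.1:3080/terminate")
        .send()
        .await?;
    match resp.status().as_u16() {
        200 => println!("Postgres terminated"),
        412 => println!("compute is not in a terminatable state"),
        other => println!("unexpected status {other}: {}", resp.text().await?),
    }
    Ok(())
}
```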
@@ -655,6 +655,9 @@ pub fn handle_grants(
|
||||
// remove this code if possible. The worst thing that could happen is that
// a user won't be able to use the public schema in NEW databases created in a
// very OLD project.
|
||||
//
|
||||
// Also, alter default permissions so that relations created by extensions can be
|
||||
// used by neon_superuser without permission issues.
|
||||
let grant_query = "DO $$\n\
|
||||
BEGIN\n\
|
||||
IF EXISTS(\n\
|
||||
@@ -673,6 +676,8 @@ pub fn handle_grants(
|
||||
GRANT CREATE ON SCHEMA public TO web_access;\n\
|
||||
END IF;\n\
|
||||
END IF;\n\
|
||||
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;\n\
|
||||
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION;\n\
|
||||
END\n\
|
||||
$$;"
|
||||
.to_string();
|
||||
@@ -777,6 +782,12 @@ BEGIN
|
||||
END
|
||||
$$;"#,
|
||||
"GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION",
|
||||
// Don't remove: these are placeholders for SQL statements that were originally applied as migrations but are now executed elsewhere; keeping the empty entries preserves the migration ids.
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
// Add new migrations below.
|
||||
];
|
||||
|
||||
let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
|
||||
@@ -803,8 +814,13 @@ $$;"#,
|
||||
client.simple_query(query)?;
|
||||
|
||||
while current_migration < migrations.len() {
|
||||
info!("Running migration:\n{}\n", migrations[current_migration]);
|
||||
client.simple_query(migrations[current_migration])?;
|
||||
let migration = &migrations[current_migration];
|
||||
if migration.is_empty() {
|
||||
info!("Skip migration id={}", current_migration);
|
||||
} else {
|
||||
info!("Running migration:\n{}\n", migration);
|
||||
client.simple_query(migration)?;
|
||||
}
|
||||
current_migration += 1;
|
||||
}
|
||||
let setval = format!(
|
||||
|
||||
@@ -4,6 +4,11 @@ version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[features]
|
||||
default = []
|
||||
# Enables test-only APIs and behaviors
|
||||
testing = []
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
aws-config.workspace = true
|
||||
@@ -13,6 +18,8 @@ clap.workspace = true
|
||||
futures.workspace = true
|
||||
git-version.workspace = true
|
||||
hyper.workspace = true
|
||||
humantime.workspace = true
|
||||
once_cell.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
pageserver_client.workspace = true
|
||||
postgres_connection.workspace = true
|
||||
|
||||
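The new `testing` feature above is an additive Cargo feature. A sketch of how such a flag is typically consumed in the crate (the gated item below is hypothetical, not something added by this PR):

```rust
// Compiled only when built with `--features testing`.
#[cfg(feature = "testing")]
pub mod test_hooks {
    /// Hypothetical test-only API, e.g. for failpoint-style behavior.
    pub fn enable_failpoint(name: &str) {
        println!("enabling test failpoint {name}");
    }
}
```

Built normally, the module is absent; enabling the feature at build time pulls it in without affecting production binaries.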
@@ -3,7 +3,7 @@ use std::{collections::HashMap, time::Duration};
|
||||
use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
|
||||
use control_plane::local_env::LocalEnv;
|
||||
use hyper::{Method, StatusCode};
|
||||
use pageserver_api::shard::{ShardCount, ShardIndex, ShardNumber, TenantShardId};
|
||||
use pageserver_api::shard::{ShardIndex, ShardNumber, TenantShardId};
|
||||
use postgres_connection::parse_host_port;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -77,7 +77,7 @@ impl ComputeHookTenant {
|
||||
self.shards
|
||||
.sort_by_key(|(shard, _node_id)| shard.shard_number);
|
||||
|
||||
if self.shards.len() == shard_count.0 as usize || shard_count == ShardCount(0) {
|
||||
if self.shards.len() == shard_count.count() as usize || shard_count.is_unsharded() {
|
||||
// We have pageservers for all the shards: emit a configuration update
|
||||
return Some(ComputeHookNotifyRequest {
|
||||
tenant_id,
|
||||
@@ -94,7 +94,7 @@ impl ComputeHookTenant {
|
||||
tracing::info!(
|
||||
"ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})",
|
||||
self.shards.len(),
|
||||
shard_count.0
|
||||
shard_count.count()
|
||||
);
|
||||
}
|
||||
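These hunks move from poking at the raw `ShardCount` tuple field to helper methods. The assumed contract (the real definitions live in `pageserver_api::shard` and may differ in detail) is sketched below: a literal of 0 encodes an unsharded tenant, which still counts as one shard.

```rust
/// Sketch of the assumed ShardCount semantics behind `.literal()`, `.count()`
/// and `.is_unsharded()` used above.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub struct ShardCount(u8);

impl ShardCount {
    pub fn new(n: u8) -> Self {
        Self(n)
    }

    /// Raw wire/database value: 0 means "unsharded, legacy layout".
    pub fn literal(&self) -> u8 {
        self.0
    }

    /// Number of shards that actually exist: an unsharded tenant has one.
    pub fn count(&self) -> u8 {
        if self.0 == 0 { 1 } else { self.0 }
    }

    pub fn is_unsharded(&self) -> bool {
        self.0 == 0
    }
}
```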
|
||||
@@ -155,7 +155,7 @@ impl ComputeHook {
|
||||
|
||||
for (endpoint_name, endpoint) in &cplane.endpoints {
|
||||
if endpoint.tenant_id == tenant_id && endpoint.status() == EndpointStatus::Running {
|
||||
tracing::info!("🔁 Reconfiguring endpoint {}", endpoint_name,);
|
||||
tracing::info!("Reconfiguring endpoint {}", endpoint_name,);
|
||||
endpoint.reconfigure(compute_pageservers.clone()).await?;
|
||||
}
|
||||
}
|
||||
@@ -177,7 +177,7 @@ impl ComputeHook {
|
||||
req
|
||||
};
|
||||
|
||||
tracing::debug!(
|
||||
tracing::info!(
|
||||
"Sending notify request to {} ({:?})",
|
||||
url,
|
||||
reconfigure_request
|
||||
@@ -266,7 +266,7 @@ impl ComputeHook {
|
||||
/// periods, but we don't retry forever. The **caller** is responsible for handling failures and
|
||||
/// ensuring that they eventually call again to ensure that the compute is eventually notified of
|
||||
/// the proper pageserver nodes for a tenant.
|
||||
#[tracing::instrument(skip_all, fields(tenant_shard_id, node_id))]
|
||||
#[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), node_id))]
|
||||
pub(super) async fn notify(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
@@ -298,7 +298,7 @@ impl ComputeHook {
|
||||
let Some(reconfigure_request) = reconfigure_request else {
|
||||
// The tenant doesn't yet have pageservers for all its shards: we won't notify anything
|
||||
// until it does.
|
||||
tracing::debug!("Tenant isn't yet ready to emit a notification",);
|
||||
tracing::info!("Tenant isn't yet ready to emit a notification");
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@ use hyper::{Body, Request, Response};
|
||||
use hyper::{StatusCode, Uri};
|
||||
use pageserver_api::models::{
|
||||
TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
|
||||
TimelineCreateRequest,
|
||||
TenantTimeTravelRequest, TimelineCreateRequest,
|
||||
};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client::mgmt_api;
|
||||
@@ -12,7 +12,7 @@ use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
use utils::auth::SwappableJwtAuth;
|
||||
use utils::http::endpoint::{auth_middleware, request_span};
|
||||
use utils::http::request::parse_request_param;
|
||||
use utils::http::request::{must_get_query_param, parse_request_param};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use utils::{
|
||||
@@ -66,14 +66,7 @@ fn get_state(request: &Request<Body>) -> &HttpState {
|
||||
async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let reattach_req = json_request::<ReAttachRequest>(&mut req).await?;
|
||||
let state = get_state(&req);
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
state
|
||||
.service
|
||||
.re_attach(reattach_req)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?,
|
||||
)
|
||||
json_response(StatusCode::OK, state.service.re_attach(reattach_req).await?)
|
||||
}
|
||||
|
||||
/// Pageserver calls into this before doing deletions, to confirm that it still
|
||||
@@ -114,7 +107,10 @@ async fn handle_tenant_create(
|
||||
mut req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
|
||||
json_response(StatusCode::OK, service.tenant_create(create_req).await?)
|
||||
json_response(
|
||||
StatusCode::CREATED,
|
||||
service.tenant_create(create_req).await?,
|
||||
)
|
||||
}
|
||||
|
||||
// For tenant and timeline deletions, which both implement an "initially return 202, then 404 once
|
||||
@@ -177,6 +173,39 @@ async fn handle_tenant_location_config(
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_tenant_time_travel_remote_storage(
|
||||
service: Arc<Service>,
|
||||
mut req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
let time_travel_req = json_request::<TenantTimeTravelRequest>(&mut req).await?;
|
||||
|
||||
let timestamp_raw = must_get_query_param(&req, "travel_to")?;
|
||||
let _timestamp = humantime::parse_rfc3339(×tamp_raw).map_err(|_e| {
|
||||
ApiError::BadRequest(anyhow::anyhow!(
|
||||
"Invalid time for travel_to: {timestamp_raw:?}"
|
||||
))
|
||||
})?;
|
||||
|
||||
let done_if_after_raw = must_get_query_param(&req, "done_if_after")?;
|
||||
let _done_if_after = humantime::parse_rfc3339(&done_if_after_raw).map_err(|_e| {
|
||||
ApiError::BadRequest(anyhow::anyhow!(
|
||||
"Invalid time for done_if_after: {done_if_after_raw:?}"
|
||||
))
|
||||
})?;
|
||||
|
||||
service
|
||||
.tenant_time_travel_remote_storage(
|
||||
&time_travel_req,
|
||||
tenant_id,
|
||||
timestamp_raw,
|
||||
done_if_after_raw,
|
||||
)
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn handle_tenant_delete(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
@@ -196,7 +225,7 @@ async fn handle_tenant_timeline_create(
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
let create_req = json_request::<TimelineCreateRequest>(&mut req).await?;
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
StatusCode::CREATED,
|
||||
service
|
||||
.tenant_timeline_create(tenant_id, create_req)
|
||||
.await?,
|
||||
@@ -296,7 +325,10 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
|
||||
}
|
||||
let state = get_state(&req);
|
||||
|
||||
json_response(StatusCode::OK, state.service.node_configure(config_req)?)
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
state.service.node_configure(config_req).await?,
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_tenant_shard_split(
|
||||
@@ -333,6 +365,22 @@ async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiErr
|
||||
json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?)
|
||||
}
|
||||
|
||||
async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let state = get_state(&req);
|
||||
state.service.tenants_dump()
|
||||
}
|
||||
|
||||
async fn handle_scheduler_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let state = get_state(&req);
|
||||
state.service.scheduler_dump()
|
||||
}
|
||||
|
||||
async fn handle_consistency_check(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let state = get_state(&req);
|
||||
|
||||
json_response(StatusCode::OK, state.service.consistency_check().await?)
|
||||
}
|
||||
|
||||
/// Status endpoint is just used for checking that our HTTP listener is up
|
||||
async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
json_response(StatusCode::OK, ())
|
||||
@@ -421,6 +469,13 @@ pub fn make_router(
|
||||
.post("/debug/v1/node/:node_id/drop", |r| {
|
||||
request_span(r, handle_node_drop)
|
||||
})
|
||||
.get("/debug/v1/tenant", |r| request_span(r, handle_tenants_dump))
|
||||
.get("/debug/v1/scheduler", |r| {
|
||||
request_span(r, handle_scheduler_dump)
|
||||
})
|
||||
.post("/debug/v1/consistency_check", |r| {
|
||||
request_span(r, handle_consistency_check)
|
||||
})
|
||||
.get("/control/v1/tenant/:tenant_id/locate", |r| {
|
||||
tenant_service_handler(r, handle_tenant_locate)
|
||||
})
|
||||
@@ -451,6 +506,9 @@ pub fn make_router(
|
||||
.put("/v1/tenant/:tenant_id/location_config", |r| {
|
||||
tenant_service_handler(r, handle_tenant_location_config)
|
||||
})
|
||||
.put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| {
|
||||
tenant_service_handler(r, handle_tenant_time_travel_remote_storage)
|
||||
})
|
||||
// Timeline operations
|
||||
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
|
||||
tenant_service_handler(r, handle_tenant_timeline_delete)
|
||||
|
||||
@@ -3,6 +3,7 @@ use utils::seqwait::MonotonicCounter;
|
||||
|
||||
mod compute_hook;
|
||||
pub mod http;
|
||||
pub mod metrics;
|
||||
mod node;
|
||||
pub mod persistence;
|
||||
mod reconciler;
|
||||
@@ -11,7 +12,7 @@ mod schema;
|
||||
pub mod service;
|
||||
mod tenant_state;
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize, Deserialize, Debug)]
|
||||
enum PlacementPolicy {
|
||||
/// Cheapest way to attach a tenant: just one pageserver, no secondary
|
||||
Single,
|
||||
@@ -22,7 +23,7 @@ enum PlacementPolicy {
|
||||
Detached,
|
||||
}
|
||||
|
||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone)]
|
||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Serialize)]
|
||||
struct Sequence(u64);
|
||||
|
||||
impl Sequence {
|
||||
@@ -37,6 +38,12 @@ impl std::fmt::Display for Sequence {
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for Sequence {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
write!(f, "{}", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl MonotonicCounter<Sequence> for Sequence {
|
||||
fn cnt_advance(&mut self, v: Sequence) {
|
||||
assert!(*self <= v);
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
///
|
||||
use anyhow::{anyhow, Context};
|
||||
use attachment_service::http::make_router;
|
||||
use attachment_service::metrics::preinitialize_metrics;
|
||||
use attachment_service::persistence::Persistence;
|
||||
use attachment_service::service::{Config, Service};
|
||||
use aws_config::{self, BehaviorVersion, Region};
|
||||
@@ -15,6 +16,7 @@ use diesel::Connection;
|
||||
use metrics::launch_timestamp::LaunchTimestamp;
|
||||
use std::sync::Arc;
|
||||
use tokio::signal::unix::SignalKind;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::auth::{JwtAuth, SwappableJwtAuth};
|
||||
use utils::logging::{self, LogFormat};
|
||||
|
||||
@@ -204,6 +206,8 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
logging::Output::Stdout,
|
||||
)?;
|
||||
|
||||
preinitialize_metrics();
|
||||
|
||||
let args = Cli::parse();
|
||||
tracing::info!(
|
||||
"version: {}, launch_timestamp: {}, build_tag {}, state at {}, listening on {}",
|
||||
@@ -237,15 +241,23 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
let auth = secrets
|
||||
.public_key
|
||||
.map(|jwt_auth| Arc::new(SwappableJwtAuth::new(jwt_auth)));
|
||||
let router = make_router(service, auth)
|
||||
let router = make_router(service.clone(), auth)
|
||||
.build()
|
||||
.map_err(|err| anyhow!(err))?;
|
||||
let router_service = utils::http::RouterService::new(router).unwrap();
|
||||
let server = hyper::Server::from_tcp(http_listener)?.serve(router_service);
|
||||
|
||||
// Start HTTP server
|
||||
let server_shutdown = CancellationToken::new();
|
||||
let server = hyper::Server::from_tcp(http_listener)?
|
||||
.serve(router_service)
|
||||
.with_graceful_shutdown({
|
||||
let server_shutdown = server_shutdown.clone();
|
||||
async move {
|
||||
server_shutdown.cancelled().await;
|
||||
}
|
||||
});
|
||||
tracing::info!("Serving on {0}", args.listen);
|
||||
|
||||
tokio::task::spawn(server);
|
||||
let server_task = tokio::task::spawn(server);
|
||||
|
||||
// Wait until we receive a signal
|
||||
let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?;
|
||||
@@ -266,5 +278,16 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
}
|
||||
}
|
||||
|
||||
// Stop HTTP server first, so that we don't have to service requests
|
||||
// while shutting down Service
|
||||
server_shutdown.cancel();
|
||||
if let Err(e) = server_task.await {
|
||||
tracing::error!("Error joining HTTP server task: {e}")
|
||||
}
|
||||
tracing::info!("Joined HTTP server task");
|
||||
|
||||
service.shutdown().await;
|
||||
tracing::info!("Service shutdown complete");
|
||||
|
||||
std::process::exit(0);
|
||||
}
|
||||
|
||||
32
control_plane/attachment_service/src/metrics.rs
Normal file
@@ -0,0 +1,32 @@
|
||||
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
pub(crate) struct ReconcilerMetrics {
|
||||
pub(crate) spawned: IntCounter,
|
||||
pub(crate) complete: IntCounterVec,
|
||||
}
|
||||
|
||||
impl ReconcilerMetrics {
|
||||
// Labels used on [`Self::complete`]
|
||||
pub(crate) const SUCCESS: &'static str = "ok";
pub(crate) const ERROR: &'static str = "error";
|
||||
pub(crate) const CANCEL: &'static str = "cancel";
|
||||
}
|
||||
|
||||
pub(crate) static RECONCILER: Lazy<ReconcilerMetrics> = Lazy::new(|| ReconcilerMetrics {
|
||||
spawned: register_int_counter!(
|
||||
"storage_controller_reconcile_spawn",
|
||||
"Count of how many times we spawn a reconcile task",
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
complete: register_int_counter_vec!(
|
||||
"storage_controller_reconcile_complete",
|
||||
"Reconciler tasks completed, broken down by success/failure/cancelled",
|
||||
&["status"],
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
});
|
||||
|
||||
pub fn preinitialize_metrics() {
|
||||
Lazy::force(&RECONCILER);
|
||||
}
|
||||
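For context, a sketch of how these counters are meant to be used from the reconcile path (the call sites below are illustrative; the actual wiring lives elsewhere in this PR):

```rust
// Hypothetical call sites for the metrics defined above.
fn on_reconcile_spawned() {
    crate::metrics::RECONCILER.spawned.inc();
}

fn on_reconcile_complete(outcome: Result<(), ()>, cancelled: bool) {
    use crate::metrics::{ReconcilerMetrics, RECONCILER};
    let label = if cancelled {
        ReconcilerMetrics::CANCEL
    } else if outcome.is_ok() {
        ReconcilerMetrics::SUCCESS
    } else {
        ReconcilerMetrics::ERROR
    };
    RECONCILER.complete.with_label_values(&[label]).inc();
}
```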
@@ -1,9 +1,16 @@
|
||||
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
|
||||
use serde::Serialize;
|
||||
use utils::id::NodeId;
|
||||
|
||||
use crate::persistence::NodePersistence;
|
||||
|
||||
#[derive(Clone)]
|
||||
/// Represents the in-memory description of a Node.
|
||||
///
|
||||
/// Scheduling statistics are maintained separately in [`crate::scheduler`].
|
||||
///
|
||||
/// The persistent subset of the Node is defined in [`crate::persistence::NodePersistence`]: the
|
||||
/// implementation of serialization on this type is only for debug dumps.
|
||||
#[derive(Clone, Serialize)]
|
||||
pub(crate) struct Node {
|
||||
pub(crate) id: NodeId,
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ use std::time::Duration;
|
||||
use self::split_state::SplitState;
|
||||
use camino::Utf8Path;
|
||||
use camino::Utf8PathBuf;
|
||||
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
|
||||
use control_plane::attachment_service::NodeSchedulingPolicy;
|
||||
use diesel::pg::PgConnection;
|
||||
use diesel::prelude::*;
|
||||
use diesel::Connection;
|
||||
@@ -130,24 +130,10 @@ impl Persistence {
|
||||
}
|
||||
|
||||
/// At startup, populate the list of nodes which our shards may be placed on
|
||||
pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<Node>> {
|
||||
let nodes: Vec<Node> = self
|
||||
pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<NodePersistence>> {
|
||||
let nodes: Vec<NodePersistence> = self
|
||||
.with_conn(move |conn| -> DatabaseResult<_> {
|
||||
Ok(crate::schema::nodes::table
|
||||
.load::<NodePersistence>(conn)?
|
||||
.into_iter()
|
||||
.map(|n| Node {
|
||||
id: NodeId(n.node_id as u64),
|
||||
// At startup we consider a node offline until proven otherwise.
|
||||
availability: NodeAvailability::Offline,
|
||||
scheduling: NodeSchedulingPolicy::from_str(&n.scheduling_policy)
|
||||
.expect("Bad scheduling policy in DB"),
|
||||
listen_http_addr: n.listen_http_addr,
|
||||
listen_http_port: n.listen_http_port as u16,
|
||||
listen_pg_addr: n.listen_pg_addr,
|
||||
listen_pg_port: n.listen_pg_port as u16,
|
||||
})
|
||||
.collect::<Vec<Node>>())
|
||||
Ok(crate::schema::nodes::table.load::<NodePersistence>(conn)?)
|
||||
})
|
||||
.await?;
|
||||
|
||||
@@ -156,6 +142,31 @@ impl Persistence {
|
||||
Ok(nodes)
|
||||
}
|
||||
|
||||
pub(crate) async fn update_node(
|
||||
&self,
|
||||
input_node_id: NodeId,
|
||||
input_scheduling: NodeSchedulingPolicy,
|
||||
) -> DatabaseResult<()> {
|
||||
use crate::schema::nodes::dsl::*;
|
||||
let updated = self
|
||||
.with_conn(move |conn| {
|
||||
let updated = diesel::update(nodes)
|
||||
.filter(node_id.eq(input_node_id.0 as i64))
|
||||
.set((scheduling_policy.eq(String::from(input_scheduling)),))
|
||||
.execute(conn)?;
|
||||
Ok(updated)
|
||||
})
|
||||
.await?;
|
||||
|
||||
if updated != 1 {
|
||||
Err(DatabaseError::Logical(format!(
|
||||
"Node {node_id:?} not found for update",
|
||||
)))
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// At startup, load the high level state for shards, such as their config + policy. This will
|
||||
/// be enriched at runtime with state discovered on pageservers.
|
||||
pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
|
||||
@@ -222,7 +233,7 @@ impl Persistence {
|
||||
let tenant_shard_id = TenantShardId {
|
||||
tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
|
||||
shard_number: ShardNumber(tsp.shard_number as u8),
|
||||
shard_count: ShardCount(tsp.shard_count as u8),
|
||||
shard_count: ShardCount::new(tsp.shard_count as u8),
|
||||
};
|
||||
|
||||
tenants_map.insert(tenant_shard_id, tsp);
|
||||
@@ -318,7 +329,7 @@ impl Persistence {
|
||||
tenant_id: TenantId::from_str(tsp.tenant_id.as_str())
|
||||
.map_err(|e| DatabaseError::Logical(format!("Malformed tenant id: {e}")))?,
|
||||
shard_number: ShardNumber(tsp.shard_number as u8),
|
||||
shard_count: ShardCount(tsp.shard_count as u8),
|
||||
shard_count: ShardCount::new(tsp.shard_count as u8),
|
||||
};
|
||||
result.insert(tenant_shard_id, Generation::new(tsp.generation as u32));
|
||||
}
|
||||
@@ -340,7 +351,7 @@ impl Persistence {
|
||||
let updated = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
||||
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
||||
.filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32))
|
||||
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
|
||||
.set((
|
||||
generation.eq(generation + 1),
|
||||
generation_pageserver.eq(node_id.0 as i64),
|
||||
@@ -362,7 +373,7 @@ impl Persistence {
|
||||
let updated = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
|
||||
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
|
||||
.filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32))
|
||||
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
|
||||
.set((
|
||||
generation_pageserver.eq(i64::MAX),
|
||||
placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()),
|
||||
@@ -381,7 +392,6 @@ impl Persistence {
|
||||
//
|
||||
// We create the child shards here, so that they will be available for increment_generation calls
|
||||
// if some pageserver holding a child shard needs to restart before the overall tenant split is complete.
|
||||
#[allow(dead_code)]
|
||||
pub(crate) async fn begin_shard_split(
|
||||
&self,
|
||||
old_shard_count: ShardCount,
|
||||
@@ -393,21 +403,19 @@ impl Persistence {
|
||||
conn.transaction(|conn| -> DatabaseResult<()> {
|
||||
// Mark parent shards as splitting
|
||||
|
||||
let expect_parent_records = std::cmp::max(1, old_shard_count.0);
|
||||
|
||||
let updated = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||
.filter(shard_count.eq(old_shard_count.0 as i32))
|
||||
.filter(shard_count.eq(old_shard_count.literal() as i32))
|
||||
.set((splitting.eq(1),))
|
||||
.execute(conn)?;
|
||||
if u8::try_from(updated)
|
||||
.map_err(|_| DatabaseError::Logical(
|
||||
format!("Overflow existing shard count {} while splitting", updated))
|
||||
)? != expect_parent_records {
|
||||
)? != old_shard_count.count() {
|
||||
// Perhaps a deletion or another split raced with this attempt to split, mutating
|
||||
// the parent shards that we intend to split. In this case the split request should fail.
|
||||
return Err(DatabaseError::Logical(
|
||||
format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {expect_parent_records})")
|
||||
format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {})", old_shard_count.count())
|
||||
));
|
||||
}
|
||||
|
||||
@@ -419,7 +427,7 @@ impl Persistence {
|
||||
let mut parent = crate::schema::tenant_shards::table
|
||||
.filter(tenant_id.eq(parent_shard_id.tenant_id.to_string()))
|
||||
.filter(shard_number.eq(parent_shard_id.shard_number.0 as i32))
|
||||
.filter(shard_count.eq(parent_shard_id.shard_count.0 as i32))
|
||||
.filter(shard_count.eq(parent_shard_id.shard_count.literal() as i32))
|
||||
.load::<TenantShardPersistence>(conn)?;
|
||||
let parent = if parent.len() != 1 {
|
||||
return Err(DatabaseError::Logical(format!(
|
||||
@@ -449,7 +457,6 @@ impl Persistence {
|
||||
|
||||
// When we finish shard splitting, we must atomically clean up the old shards
|
||||
// and insert the new shards, and clear the splitting marker.
|
||||
#[allow(dead_code)]
|
||||
pub(crate) async fn complete_shard_split(
|
||||
&self,
|
||||
split_tenant_id: TenantId,
|
||||
@@ -461,7 +468,7 @@ impl Persistence {
|
||||
// Drop parent shards
|
||||
diesel::delete(tenant_shards)
|
||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||
.filter(shard_count.eq(old_shard_count.0 as i32))
|
||||
.filter(shard_count.eq(old_shard_count.literal() as i32))
|
||||
.execute(conn)?;
|
||||
|
||||
// Clear sharding flag
|
||||
@@ -481,7 +488,7 @@ impl Persistence {
|
||||
}
|
||||
|
||||
/// Parts of [`crate::tenant_state::TenantState`] that are stored durably
|
||||
#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone)]
|
||||
#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)]
|
||||
#[diesel(table_name = crate::schema::tenant_shards)]
|
||||
pub(crate) struct TenantShardPersistence {
|
||||
#[serde(default)]
|
||||
@@ -510,7 +517,7 @@ pub(crate) struct TenantShardPersistence {
|
||||
}
|
||||
|
||||
/// Parts of [`crate::node::Node`] that are stored durably
|
||||
#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable)]
|
||||
#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable, Eq, PartialEq)]
|
||||
#[diesel(table_name = crate::schema::nodes)]
|
||||
pub(crate) struct NodePersistence {
|
||||
pub(crate) node_id: i64,
|
||||
|
||||
@@ -13,6 +13,7 @@ use tokio_util::sync::CancellationToken;
|
||||
use utils::generation::Generation;
|
||||
use utils::id::{NodeId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::sync::gate::GateGuard;
|
||||
|
||||
use crate::compute_hook::{ComputeHook, NotifyError};
|
||||
use crate::node::Node;
|
||||
@@ -26,7 +27,7 @@ pub(super) struct Reconciler {
|
||||
pub(super) tenant_shard_id: TenantShardId,
|
||||
pub(crate) shard: ShardIdentity,
|
||||
pub(crate) generation: Generation,
|
||||
pub(crate) intent: IntentState,
|
||||
pub(crate) intent: TargetState,
|
||||
pub(crate) config: TenantConfig,
|
||||
pub(crate) observed: ObservedState,
|
||||
|
||||
@@ -53,14 +54,46 @@ pub(super) struct Reconciler {
|
||||
/// the tenant is changed.
|
||||
pub(crate) cancel: CancellationToken,
|
||||
|
||||
/// Reconcilers are registered with a Gate so that during a graceful shutdown we
|
||||
/// can wait for all the reconcilers to respond to their cancellation tokens.
|
||||
pub(crate) _gate_guard: GateGuard,
|
||||
|
||||
/// Access to persistent storage for updating generation numbers
|
||||
pub(crate) persistence: Arc<Persistence>,
|
||||
}
|
||||
|
||||
/// This is a snapshot of [`crate::tenant_state::IntentState`], but it does not do any
|
||||
/// reference counting for Scheduler. The IntentState is what the scheduler works with,
|
||||
/// and the TargetState is just the instruction for a particular Reconciler run.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct TargetState {
|
||||
pub(crate) attached: Option<NodeId>,
|
||||
pub(crate) secondary: Vec<NodeId>,
|
||||
}
|
||||
|
||||
impl TargetState {
|
||||
pub(crate) fn from_intent(intent: &IntentState) -> Self {
|
||||
Self {
|
||||
attached: *intent.get_attached(),
|
||||
secondary: intent.get_secondary().clone(),
|
||||
}
|
||||
}
|
||||
|
||||
fn all_pageservers(&self) -> Vec<NodeId> {
|
||||
let mut result = self.secondary.clone();
|
||||
if let Some(node_id) = &self.attached {
|
||||
result.push(*node_id);
|
||||
}
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum ReconcileError {
|
||||
#[error(transparent)]
|
||||
Notify(#[from] NotifyError),
|
||||
#[error("Cancelled")]
|
||||
Cancel,
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
@@ -263,7 +296,7 @@ impl Reconciler {
|
||||
secondary_conf,
|
||||
tenant_conf: config.clone(),
|
||||
shard_number: shard.number.0,
|
||||
shard_count: shard.count.0,
|
||||
shard_count: shard.count.literal(),
|
||||
shard_stripe_size: shard.stripe_size.0,
|
||||
}
|
||||
}
|
||||
@@ -405,18 +438,49 @@ impl Reconciler {
|
||||
match self.observed.locations.get(&node_id) {
|
||||
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
|
||||
// Nothing to do
|
||||
tracing::info!("Observed configuration already correct.")
|
||||
tracing::info!(%node_id, "Observed configuration already correct.")
|
||||
}
|
||||
_ => {
|
||||
observed => {
|
||||
// In all cases other than a matching observed configuration, we will
|
||||
// reconcile this location. This includes locations with different configurations, as well
|
||||
// as locations with unknown (None) observed state.
|
||||
self.generation = self
|
||||
.persistence
|
||||
.increment_generation(self.tenant_shard_id, node_id)
|
||||
.await?;
|
||||
wanted_conf.generation = self.generation.into();
|
||||
tracing::info!("Observed configuration requires update.");
|
||||
|
||||
// The general case is to increment the generation. However, there are cases
|
||||
// where this is not necessary:
|
||||
// - if we are only updating the TenantConf part of the location
|
||||
// - if we are only changing the attachment mode (e.g. going to AttachedMulti or AttachedStale)
|
||||
// and the location was already in the correct generation
|
||||
let increment_generation = match observed {
|
||||
None => true,
|
||||
Some(ObservedStateLocation { conf: None }) => true,
|
||||
Some(ObservedStateLocation {
|
||||
conf: Some(observed),
|
||||
}) => {
|
||||
// If mode and generation are the same, it follows that only the configuration has changed
|
||||
let config_update = observed.generation == wanted_conf.generation
|
||||
&& observed.mode == wanted_conf.mode;
|
||||
|
||||
// Usually the short-lived attachment modes (multi and stale) are only used
|
||||
// in the case of [`Self::live_migrate`], but it is simple to handle them correctly
|
||||
// here too. Locations are allowed to go Single->Stale and Multi->Single within the same generation.
|
||||
let mode_transition = observed.generation == wanted_conf.generation
|
||||
&& ((observed.mode == LocationConfigMode::AttachedSingle
|
||||
&& wanted_conf.mode == LocationConfigMode::AttachedStale)
|
||||
|| (observed.mode == LocationConfigMode::AttachedMulti
|
||||
&& wanted_conf.mode == LocationConfigMode::AttachedSingle));
|
||||
|
||||
!(config_update || mode_transition)
|
||||
}
|
||||
};
|
||||
|
||||
if increment_generation {
|
||||
self.generation = self
|
||||
.persistence
|
||||
.increment_generation(self.tenant_shard_id, node_id)
|
||||
.await?;
|
||||
wanted_conf.generation = self.generation.into();
|
||||
}
|
||||
tracing::info!(%node_id, "Observed configuration requires update.");
|
||||
self.location_config(node_id, wanted_conf, None).await?;
|
||||
self.compute_notify().await?;
|
||||
}
|
||||
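Restating the rule above in isolation may help: a new generation is only issued when the change is more than a pure config update or one of the two allowed same-generation mode transitions. A simplified, self-contained sketch (types reduced to the essentials; not the real structs):

```rust
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum Mode {
    AttachedSingle,
    AttachedStale,
    AttachedMulti,
}

/// Returns true when applying `wanted` over `observed` requires incrementing
/// the generation, mirroring the logic in the hunk above.
fn needs_generation_increment(
    observed: Option<(Option<u32>, Mode)>,
    wanted: (Option<u32>, Mode),
) -> bool {
    match observed {
        None => true,
        Some((obs_gen, obs_mode)) => {
            // Same generation and mode: only the TenantConf changed.
            let config_update = obs_gen == wanted.0 && obs_mode == wanted.1;
            // Single->Stale and Multi->Single are allowed within a generation.
            let mode_transition = obs_gen == wanted.0
                && ((obs_mode == Mode::AttachedSingle && wanted.1 == Mode::AttachedStale)
                    || (obs_mode == Mode::AttachedMulti && wanted.1 == Mode::AttachedSingle));
            !(config_update || mode_transition)
        }
    }
}
```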
@@ -458,7 +522,7 @@ impl Reconciler {
|
||||
generation: None,
|
||||
secondary_conf: None,
|
||||
shard_number: self.shard.number.0,
|
||||
shard_count: self.shard.count.0,
|
||||
shard_count: self.shard.count.literal(),
|
||||
shard_stripe_size: self.shard.stripe_size.0,
|
||||
tenant_conf: self.config.clone(),
|
||||
},
|
||||
@@ -466,6 +530,9 @@ impl Reconciler {
|
||||
}
|
||||
|
||||
for (node_id, conf) in changes {
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(ReconcileError::Cancel);
|
||||
}
|
||||
self.location_config(node_id, conf, None).await?;
|
||||
}
|
||||
|
||||
@@ -506,7 +573,7 @@ pub(crate) fn attached_location_conf(
|
||||
generation: generation.into(),
|
||||
secondary_conf: None,
|
||||
shard_number: shard.number.0,
|
||||
shard_count: shard.count.0,
|
||||
shard_count: shard.count.literal(),
|
||||
shard_stripe_size: shard.stripe_size.0,
|
||||
tenant_conf: config.clone(),
|
||||
}
|
||||
@@ -521,7 +588,7 @@ pub(crate) fn secondary_location_conf(
|
||||
generation: None,
|
||||
secondary_conf: Some(LocationConfigSecondary { warm: true }),
|
||||
shard_number: shard.number.0,
|
||||
shard_count: shard.count.0,
|
||||
shard_count: shard.count.literal(),
|
||||
shard_stripe_size: shard.stripe_size.0,
|
||||
tenant_conf: config.clone(),
|
||||
}
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use utils::{http::error::ApiError, id::NodeId};
|
||||
|
||||
use crate::{node::Node, tenant_state::TenantState};
|
||||
use serde::Serialize;
|
||||
use std::collections::HashMap;
|
||||
use utils::{http::error::ApiError, id::NodeId};
|
||||
|
||||
/// Scenarios in which we cannot find a suitable location for a tenant shard
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
@@ -19,52 +18,203 @@ impl From<ScheduleError> for ApiError {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Eq, PartialEq)]
|
||||
struct SchedulerNode {
|
||||
/// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`].
|
||||
shard_count: usize,
|
||||
|
||||
/// Whether this node is currently eligible to have new shards scheduled (this is derived
|
||||
/// from a node's availability state and scheduling policy).
|
||||
may_schedule: bool,
|
||||
}
|
||||
|
||||
/// This type is responsible for selecting which node is used when a tenant shard needs to choose a pageserver
|
||||
/// on which to run.
|
||||
///
|
||||
/// The type has no persistent state of its own: this is all populated at startup. The Serialize
|
||||
/// impl is only for debug dumps.
|
||||
#[derive(Serialize)]
|
||||
pub(crate) struct Scheduler {
|
||||
tenant_counts: HashMap<NodeId, usize>,
|
||||
nodes: HashMap<NodeId, SchedulerNode>,
|
||||
}
|
||||
|
||||
impl Scheduler {
|
||||
pub(crate) fn new(
|
||||
tenants: &BTreeMap<TenantShardId, TenantState>,
|
||||
nodes: &HashMap<NodeId, Node>,
|
||||
) -> Self {
|
||||
let mut tenant_counts = HashMap::new();
|
||||
for node_id in nodes.keys() {
|
||||
tenant_counts.insert(*node_id, 0);
|
||||
pub(crate) fn new<'a>(nodes: impl Iterator<Item = &'a Node>) -> Self {
|
||||
let mut scheduler_nodes = HashMap::new();
|
||||
for node in nodes {
|
||||
scheduler_nodes.insert(
|
||||
node.id,
|
||||
SchedulerNode {
|
||||
shard_count: 0,
|
||||
may_schedule: node.may_schedule(),
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
for tenant in tenants.values() {
|
||||
if let Some(ps) = tenant.intent.attached {
|
||||
let entry = tenant_counts.entry(ps).or_insert(0);
|
||||
*entry += 1;
|
||||
}
|
||||
Self {
|
||||
nodes: scheduler_nodes,
|
||||
}
|
||||
|
||||
for (node_id, node) in nodes {
|
||||
if !node.may_schedule() {
|
||||
tenant_counts.remove(node_id);
|
||||
}
|
||||
}
|
||||
|
||||
Self { tenant_counts }
|
||||
}
|
||||
|
||||
pub(crate) fn schedule_shard(
|
||||
&mut self,
|
||||
hard_exclude: &[NodeId],
|
||||
) -> Result<NodeId, ScheduleError> {
|
||||
if self.tenant_counts.is_empty() {
|
||||
/// For debug/support: check that our internal statistics are in sync with the state of
|
||||
/// the nodes & tenant shards.
|
||||
///
|
||||
/// If anything is inconsistent, log details and return an error.
|
||||
pub(crate) fn consistency_check<'a>(
|
||||
&self,
|
||||
nodes: impl Iterator<Item = &'a Node>,
|
||||
shards: impl Iterator<Item = &'a TenantState>,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut expect_nodes: HashMap<NodeId, SchedulerNode> = HashMap::new();
|
||||
for node in nodes {
|
||||
expect_nodes.insert(
|
||||
node.id,
|
||||
SchedulerNode {
|
||||
shard_count: 0,
|
||||
may_schedule: node.may_schedule(),
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
for shard in shards {
|
||||
if let Some(node_id) = shard.intent.get_attached() {
|
||||
match expect_nodes.get_mut(node_id) {
|
||||
Some(node) => node.shard_count += 1,
|
||||
None => anyhow::bail!(
|
||||
"Tenant {} references nonexistent node {}",
|
||||
shard.tenant_shard_id,
|
||||
node_id
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
for node_id in shard.intent.get_secondary() {
|
||||
match expect_nodes.get_mut(node_id) {
|
||||
Some(node) => node.shard_count += 1,
|
||||
None => anyhow::bail!(
|
||||
"Tenant {} references nonexistent node {}",
|
||||
shard.tenant_shard_id,
|
||||
node_id
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (node_id, expect_node) in &expect_nodes {
|
||||
let Some(self_node) = self.nodes.get(node_id) else {
|
||||
anyhow::bail!("Node {node_id} not found in Self")
|
||||
};
|
||||
|
||||
if self_node != expect_node {
|
||||
tracing::error!("Inconsistency detected in scheduling state for node {node_id}");
|
||||
tracing::error!("Expected state: {}", serde_json::to_string(expect_node)?);
|
||||
tracing::error!("Self state: {}", serde_json::to_string(self_node)?);
|
||||
|
||||
anyhow::bail!("Inconsistent state on {node_id}");
|
||||
}
|
||||
}
|
||||
|
||||
if expect_nodes.len() != self.nodes.len() {
|
||||
// We just checked that all the expected nodes are present. If the lengths don't match,
|
||||
// it means that we have nodes in Self that are unexpected.
|
||||
for node_id in self.nodes.keys() {
|
||||
if !expect_nodes.contains_key(node_id) {
|
||||
anyhow::bail!("Node {node_id} found in Self but not in expected nodes");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Increment the reference count of a node. This reference count is used to guide scheduling
|
||||
/// decisions, not for memory management: it represents one tenant shard whose IntentState targets
|
||||
/// this node.
|
||||
///
|
||||
/// It is an error to call this for a node that is not known to the scheduler (i.e. passed into
|
||||
/// [`Self::new`] or [`Self::node_upsert`])
|
||||
pub(crate) fn node_inc_ref(&mut self, node_id: NodeId) {
|
||||
let Some(node) = self.nodes.get_mut(&node_id) else {
|
||||
tracing::error!("Scheduler missing node {node_id}");
|
||||
debug_assert!(false);
|
||||
return;
|
||||
};
|
||||
|
||||
node.shard_count += 1;
|
||||
}
|
||||
|
||||
/// Decrement a node's reference count. Inverse of [`Self::node_inc_ref`].
|
||||
pub(crate) fn node_dec_ref(&mut self, node_id: NodeId) {
|
||||
let Some(node) = self.nodes.get_mut(&node_id) else {
|
||||
debug_assert!(false);
|
||||
tracing::error!("Scheduler missing node {node_id}");
|
||||
return;
|
||||
};
|
||||
|
||||
node.shard_count -= 1;
|
||||
}
|
||||
|
||||
pub(crate) fn node_upsert(&mut self, node: &Node) {
|
||||
use std::collections::hash_map::Entry::*;
|
||||
match self.nodes.entry(node.id) {
|
||||
Occupied(mut entry) => {
|
||||
entry.get_mut().may_schedule = node.may_schedule();
|
||||
}
|
||||
Vacant(entry) => {
|
||||
entry.insert(SchedulerNode {
|
||||
shard_count: 0,
|
||||
may_schedule: node.may_schedule(),
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn node_remove(&mut self, node_id: NodeId) {
|
||||
if self.nodes.remove(&node_id).is_none() {
|
||||
tracing::warn!(node_id=%node_id, "Removed non-existent node from scheduler");
|
||||
}
|
||||
}
|
||||
|
||||
/// Where we have several nodes to choose from, for example when picking a secondary location
|
||||
/// to promote to an attached location, this method may be used to pick the best choice based
|
||||
/// on the scheduler's knowledge of utilization and availability.
|
||||
///
|
||||
/// If the input is empty, or none of the nodes are eligible for scheduling, return None: the
|
||||
/// caller can pick a node some other way.
|
||||
pub(crate) fn node_preferred(&self, nodes: &[NodeId]) -> Option<NodeId> {
|
||||
if nodes.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let node = nodes
|
||||
.iter()
|
||||
.map(|node_id| {
|
||||
let may_schedule = self
|
||||
.nodes
|
||||
.get(node_id)
|
||||
.map(|n| n.may_schedule)
|
||||
.unwrap_or(false);
|
||||
(*node_id, may_schedule)
|
||||
})
|
||||
.max_by_key(|(_n, may_schedule)| *may_schedule);
|
||||
|
||||
// If even the preferred node has may_schedule==false, return None
|
||||
node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None })
|
||||
}
|
||||
|
||||
pub(crate) fn schedule_shard(&self, hard_exclude: &[NodeId]) -> Result<NodeId, ScheduleError> {
|
||||
if self.nodes.is_empty() {
|
||||
return Err(ScheduleError::NoPageservers);
|
||||
}
|
||||
|
||||
let mut tenant_counts: Vec<(NodeId, usize)> = self
|
||||
.tenant_counts
|
||||
.nodes
|
||||
.iter()
|
||||
.filter_map(|(k, v)| {
|
||||
if hard_exclude.contains(k) {
|
||||
if hard_exclude.contains(k) || !v.may_schedule {
|
||||
None
|
||||
} else {
|
||||
Some((*k, *v))
|
||||
Some((*k, v.shard_count))
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
@@ -73,17 +223,109 @@ impl Scheduler {
|
||||
tenant_counts.sort_by_key(|i| (i.1, i.0));
|
||||
|
||||
if tenant_counts.is_empty() {
|
||||
// After applying constraints, no pageservers were left
|
||||
// After applying constraints, no pageservers were left. We log some detail about
|
||||
// the state of nodes to help understand why this happened. This is not logged as an error because
|
||||
// it is legitimately possible for enough nodes to be Offline to prevent scheduling a shard.
|
||||
tracing::info!("Scheduling failure, while excluding {hard_exclude:?}, node states:");
|
||||
for (node_id, node) in &self.nodes {
|
||||
tracing::info!(
|
||||
"Node {node_id}: may_schedule={} shards={}",
|
||||
node.may_schedule,
|
||||
node.shard_count
|
||||
);
|
||||
}
|
||||
|
||||
return Err(ScheduleError::ImpossibleConstraint);
|
||||
}
|
||||
|
||||
for (node_id, count) in &tenant_counts {
|
||||
tracing::info!("tenant_counts[{node_id}]={count}");
|
||||
}
|
||||
|
||||
let node_id = tenant_counts.first().unwrap().0;
|
||||
tracing::info!("scheduler selected node {node_id}");
|
||||
*self.tenant_counts.get_mut(&node_id).unwrap() += 1;
|
||||
tracing::info!(
|
||||
"scheduler selected node {node_id} (elegible nodes {:?}, exclude: {hard_exclude:?})",
|
||||
tenant_counts.iter().map(|i| i.0 .0).collect::<Vec<_>>()
|
||||
);
|
||||
|
||||
// Note that we do not update shard count here to reflect the scheduling: that
|
||||
// is IntentState's job when the scheduled location is used.
|
||||
|
||||
Ok(node_id)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) mod test_utils {
|
||||
|
||||
use crate::node::Node;
|
||||
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
|
||||
use std::collections::HashMap;
|
||||
use utils::id::NodeId;
|
||||
/// Test helper: synthesize the requested number of nodes, all in active state.
|
||||
///
|
||||
/// Node IDs start at one.
|
||||
pub(crate) fn make_test_nodes(n: u64) -> HashMap<NodeId, Node> {
|
||||
(1..n + 1)
|
||||
.map(|i| {
|
||||
(
|
||||
NodeId(i),
|
||||
Node {
|
||||
id: NodeId(i),
|
||||
availability: NodeAvailability::Active,
|
||||
scheduling: NodeSchedulingPolicy::Active,
|
||||
listen_http_addr: format!("httphost-{i}"),
|
||||
listen_http_port: 80 + i as u16,
|
||||
listen_pg_addr: format!("pghost-{i}"),
|
||||
listen_pg_port: 5432 + i as u16,
|
||||
},
|
||||
)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use utils::id::NodeId;
|
||||
|
||||
use crate::tenant_state::IntentState;
|
||||
#[test]
|
||||
fn scheduler_basic() -> anyhow::Result<()> {
|
||||
let nodes = test_utils::make_test_nodes(2);
|
||||
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
let mut t1_intent = IntentState::new();
|
||||
let mut t2_intent = IntentState::new();
|
||||
|
||||
let scheduled = scheduler.schedule_shard(&[])?;
|
||||
t1_intent.set_attached(&mut scheduler, Some(scheduled));
|
||||
let scheduled = scheduler.schedule_shard(&[])?;
|
||||
t2_intent.set_attached(&mut scheduler, Some(scheduled));
|
||||
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1);
|
||||
|
||||
let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers())?;
|
||||
t1_intent.push_secondary(&mut scheduler, scheduled);
|
||||
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 2);
|
||||
|
||||
t1_intent.clear(&mut scheduler);
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 0);
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1);
|
||||
|
||||
if cfg!(debug_assertions) {
|
||||
// Dropping an IntentState without clearing it causes a panic in debug mode,
|
||||
// because we have failed to properly update scheduler shard counts.
|
||||
let result = std::panic::catch_unwind(move || {
|
||||
drop(t2_intent);
|
||||
});
|
||||
assert!(result.is_err());
|
||||
} else {
|
||||
t2_intent.clear(&mut scheduler);
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 0);
|
||||
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 0);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
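The debug-mode panic exercised in the last test above is assumed to come from a `Drop` guard on `IntentState` (defined in `tenant_state.rs`, not shown in full here); roughly:

```rust
// Assumed shape of the guard: dropping an IntentState that still holds node
// references would leak the scheduler's shard counts, so debug builds panic.
impl Drop for IntentState {
    fn drop(&mut self) {
        if cfg!(debug_assertions) && (self.attached.is_some() || !self.secondary.is_empty()) {
            panic!("IntentState dropped without calling clear()");
        }
    }
}
```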
File diff suppressed because it is too large
@@ -1,27 +1,47 @@
|
||||
use std::{collections::HashMap, sync::Arc, time::Duration};
|
||||
|
||||
use crate::{metrics, persistence::TenantShardPersistence};
|
||||
use control_plane::attachment_service::NodeAvailability;
|
||||
use pageserver_api::{
|
||||
models::{LocationConfig, LocationConfigMode, TenantConfig},
|
||||
shard::{ShardIdentity, TenantShardId},
|
||||
};
|
||||
use serde::Serialize;
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{instrument, Instrument};
|
||||
use utils::{
|
||||
generation::Generation,
|
||||
id::NodeId,
|
||||
seqwait::{SeqWait, SeqWaitError},
|
||||
sync::gate::Gate,
|
||||
};
|
||||
|
||||
use crate::{
|
||||
compute_hook::ComputeHook,
|
||||
node::Node,
|
||||
persistence::Persistence,
|
||||
reconciler::{attached_location_conf, secondary_location_conf, ReconcileError, Reconciler},
|
||||
persistence::{split_state::SplitState, Persistence},
|
||||
reconciler::{
|
||||
attached_location_conf, secondary_location_conf, ReconcileError, Reconciler, TargetState,
|
||||
},
|
||||
scheduler::{ScheduleError, Scheduler},
|
||||
service, PlacementPolicy, Sequence,
|
||||
};
|
||||
|
||||
/// Serialization helper
|
||||
fn read_mutex_content<S, T>(v: &std::sync::Mutex<T>, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::ser::Serializer,
|
||||
T: Clone + std::fmt::Display,
|
||||
{
|
||||
serializer.collect_str(&v.lock().unwrap())
|
||||
}
|
||||
|
||||
/// In-memory state for a particular tenant shard.
|
||||
///
|
||||
/// This struct implements Serialize for debugging purposes, but is _not_ persisted
|
||||
/// itself: see [`crate::persistence`] for the subset of tenant shard state that is persisted.
|
||||
#[derive(Serialize)]
|
||||
pub(crate) struct TenantState {
|
||||
pub(crate) tenant_shard_id: TenantShardId,
|
||||
|
||||
@@ -56,20 +76,29 @@ pub(crate) struct TenantState {
|
||||
/// If a reconcile task is currently in flight, it may be joined here (it is
|
||||
/// only safe to join if either the result has been received or the reconciler's
|
||||
/// cancellation token has been fired)
|
||||
#[serde(skip)]
|
||||
pub(crate) reconciler: Option<ReconcilerHandle>,
|
||||
|
||||
/// If a tenant is being split, then all shards with that TenantId will have a
|
||||
/// SplitState set; this acts as a guard against other operations such as background
|
||||
/// reconciliation and timeline creation.
|
||||
pub(crate) splitting: SplitState,
|
||||
|
||||
/// Optionally wait for reconciliation to complete up to a particular
|
||||
/// sequence number.
|
||||
#[serde(skip)]
|
||||
pub(crate) waiter: std::sync::Arc<SeqWait<Sequence, Sequence>>,
|
||||
|
||||
/// Indicates sequence number for which we have encountered an error reconciling. If
|
||||
/// this advances ahead of [`Self::waiter`] then a reconciliation error has occurred,
|
||||
/// and callers should stop waiting for `waiter` and propagate the error.
|
||||
#[serde(skip)]
|
||||
pub(crate) error_waiter: std::sync::Arc<SeqWait<Sequence, Sequence>>,
|
||||
|
||||
/// The most recent error from a reconcile on this tenant
|
||||
/// TODO: generalize to an array of recent events
|
||||
/// TODO: use an ArcSwap instead of a mutex for faster reads?
|
||||
#[serde(serialize_with = "read_mutex_content")]
|
||||
pub(crate) last_error: std::sync::Arc<std::sync::Mutex<String>>,
|
||||
|
||||
/// If we have a pending compute notification that for some reason we weren't able to send,
|
||||
@@ -79,13 +108,131 @@ pub(crate) struct TenantState {
|
||||
pub(crate) pending_compute_notification: bool,
|
||||
}
|
||||
|
||||
#[derive(Default, Clone, Debug)]
|
||||
#[derive(Default, Clone, Debug, Serialize)]
|
||||
pub(crate) struct IntentState {
|
||||
pub(crate) attached: Option<NodeId>,
|
||||
pub(crate) secondary: Vec<NodeId>,
|
||||
attached: Option<NodeId>,
|
||||
secondary: Vec<NodeId>,
|
||||
}
|
||||
|
||||
#[derive(Default, Clone)]
|
||||
impl IntentState {
|
||||
pub(crate) fn new() -> Self {
|
||||
Self {
|
||||
attached: None,
|
||||
secondary: vec![],
|
||||
}
|
||||
}
|
||||
pub(crate) fn single(scheduler: &mut Scheduler, node_id: Option<NodeId>) -> Self {
|
||||
if let Some(node_id) = node_id {
|
||||
scheduler.node_inc_ref(node_id);
|
||||
}
|
||||
Self {
|
||||
attached: node_id,
|
||||
secondary: vec![],
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn set_attached(&mut self, scheduler: &mut Scheduler, new_attached: Option<NodeId>) {
|
||||
if self.attached != new_attached {
|
||||
if let Some(old_attached) = self.attached.take() {
|
||||
scheduler.node_dec_ref(old_attached);
|
||||
}
|
||||
if let Some(new_attached) = &new_attached {
|
||||
scheduler.node_inc_ref(*new_attached);
|
||||
}
|
||||
self.attached = new_attached;
|
||||
}
|
||||
}
|
||||
|
||||
/// Like set_attached, but the node is from [`Self::secondary`]. This swaps the node from
|
||||
/// secondary to attached while maintaining the scheduler's reference counts.
|
||||
pub(crate) fn promote_attached(
|
||||
&mut self,
|
||||
_scheduler: &mut Scheduler,
|
||||
promote_secondary: NodeId,
|
||||
) {
|
||||
// If we call this with a node that isn't in secondary, it would cause incorrect
|
||||
// scheduler reference counting, since we assume the node is already referenced as a secondary.
|
||||
debug_assert!(self.secondary.contains(&promote_secondary));
|
||||
|
||||
// TODO: when scheduler starts tracking attached + secondary counts separately, we will
|
||||
// need to call into it here.
|
||||
self.secondary.retain(|n| n != &promote_secondary);
|
||||
self.attached = Some(promote_secondary);
|
||||
}
|
||||
|
||||
pub(crate) fn push_secondary(&mut self, scheduler: &mut Scheduler, new_secondary: NodeId) {
|
||||
debug_assert!(!self.secondary.contains(&new_secondary));
|
||||
scheduler.node_inc_ref(new_secondary);
|
||||
self.secondary.push(new_secondary);
|
||||
}
|
||||
|
||||
/// It is legal to call this with a node that is not currently a secondary: that is a no-op
|
||||
pub(crate) fn remove_secondary(&mut self, scheduler: &mut Scheduler, node_id: NodeId) {
|
||||
let index = self.secondary.iter().position(|n| *n == node_id);
|
||||
if let Some(index) = index {
|
||||
scheduler.node_dec_ref(node_id);
|
||||
self.secondary.remove(index);
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn clear_secondary(&mut self, scheduler: &mut Scheduler) {
|
||||
for secondary in self.secondary.drain(..) {
|
||||
scheduler.node_dec_ref(secondary);
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn clear(&mut self, scheduler: &mut Scheduler) {
|
||||
if let Some(old_attached) = self.attached.take() {
|
||||
scheduler.node_dec_ref(old_attached);
|
||||
}
|
||||
|
||||
self.clear_secondary(scheduler);
|
||||
}
|
||||
|
||||
pub(crate) fn all_pageservers(&self) -> Vec<NodeId> {
|
||||
let mut result = Vec::new();
|
||||
if let Some(p) = self.attached {
|
||||
result.push(p)
|
||||
}
|
||||
|
||||
result.extend(self.secondary.iter().copied());
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
pub(crate) fn get_attached(&self) -> &Option<NodeId> {
|
||||
&self.attached
|
||||
}
|
||||
|
||||
pub(crate) fn get_secondary(&self) -> &Vec<NodeId> {
|
||||
&self.secondary
|
||||
}
|
||||
|
||||
/// When a node goes offline, we update intents to avoid using it
|
||||
/// as their attached pageserver.
|
||||
///
|
||||
/// Returns true if a change was made
|
||||
pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool {
|
||||
if self.attached == Some(node_id) {
|
||||
// TODO: when scheduler starts tracking attached + secondary counts separately, we will
|
||||
// need to call into it here.
|
||||
self.attached = None;
|
||||
self.secondary.push(node_id);
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for IntentState {
|
||||
fn drop(&mut self) {
|
||||
// Must clear before dropping, to avoid leaving stale refcounts in the Scheduler
|
||||
debug_assert!(self.attached.is_none() && self.secondary.is_empty());
|
||||
}
|
||||
}
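// --- Illustrative sketch, not part of the upstream diff ---
// The intended usage pattern for IntentState: every mutation goes through a
// `&mut Scheduler` so per-node reference counts stay balanced, and `clear` must be
// called before the value is dropped (the Drop impl above asserts this in debug
// builds). `make_test_nodes` is the test-only helper used later in this diff; exact
// signatures may differ between revisions.
#[cfg(test)]
fn intent_refcount_discipline_sketch() {
    use crate::scheduler::test_utils::make_test_nodes;

    let nodes = make_test_nodes(2);
    let mut scheduler = Scheduler::new(nodes.values());

    let mut intent = IntentState::new();
    intent.set_attached(&mut scheduler, Some(NodeId(1)));
    intent.push_secondary(&mut scheduler, NodeId(2));

    // ... use the intent ...

    // Without this, the debug_assert in Drop would fire.
    intent.clear(&mut scheduler);
}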
|
||||
|
||||
#[derive(Default, Clone, Serialize)]
|
||||
pub(crate) struct ObservedState {
|
||||
pub(crate) locations: HashMap<NodeId, ObservedStateLocation>,
|
||||
}
|
||||
@@ -99,7 +246,7 @@ pub(crate) struct ObservedState {
|
||||
/// what it is (e.g. we failed partway through configuring it)
|
||||
/// * Instance exists with conf==Some: this tells us what we last successfully configured on this node,
|
||||
/// and that configuration will still be present unless something external interfered.
|
||||
#[derive(Clone)]
|
||||
#[derive(Clone, Serialize)]
|
||||
pub(crate) struct ObservedStateLocation {
|
||||
/// If None, it means we do not know the status of this shard's location on this node, but
|
||||
/// we know that we might have some state on this node.
|
||||
@@ -175,46 +322,6 @@ pub(crate) struct ReconcileResult {
|
||||
pub(crate) pending_compute_notification: bool,
|
||||
}
|
||||
|
||||
impl IntentState {
|
||||
pub(crate) fn new() -> Self {
|
||||
Self {
|
||||
attached: None,
|
||||
secondary: vec![],
|
||||
}
|
||||
}
|
||||
pub(crate) fn all_pageservers(&self) -> Vec<NodeId> {
|
||||
let mut result = Vec::new();
|
||||
if let Some(p) = self.attached {
|
||||
result.push(p)
|
||||
}
|
||||
|
||||
result.extend(self.secondary.iter().copied());
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
pub(crate) fn single(node_id: Option<NodeId>) -> Self {
|
||||
Self {
|
||||
attached: node_id,
|
||||
secondary: vec![],
|
||||
}
|
||||
}
|
||||
|
||||
/// When a node goes offline, we update intents to avoid using it
|
||||
/// as their attached pageserver.
|
||||
///
|
||||
/// Returns true if a change was made
|
||||
pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool {
|
||||
if self.attached == Some(node_id) {
|
||||
self.attached = None;
|
||||
self.secondary.push(node_id);
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ObservedState {
|
||||
pub(crate) fn new() -> Self {
|
||||
Self {
|
||||
@@ -238,6 +345,7 @@ impl TenantState {
|
||||
observed: ObservedState::default(),
|
||||
config: TenantConfig::default(),
|
||||
reconciler: None,
|
||||
splitting: SplitState::Idle,
|
||||
sequence: Sequence(1),
|
||||
waiter: Arc::new(SeqWait::new(Sequence(0))),
|
||||
error_waiter: Arc::new(SeqWait::new(Sequence(0))),
|
||||
@@ -281,6 +389,9 @@ impl TenantState {
|
||||
// All remaining observed locations generate secondary intents. This includes None
|
||||
// observations, as these may well have some local content on disk that is usable (this
|
||||
// is an edge case that might occur if we restarted during a migration or other change)
|
||||
//
|
||||
// We may leave intent.attached empty if we didn't find any attached locations: [`Self::schedule`]
|
||||
// will take care of promoting one of these secondaries to be attached.
|
||||
self.observed.locations.keys().for_each(|node_id| {
|
||||
if Some(*node_id) != self.intent.attached {
|
||||
self.intent.secondary.push(*node_id);
|
||||
@@ -288,6 +399,33 @@ impl TenantState {
|
||||
});
|
||||
}
|
||||
|
||||
/// Part of [`Self::schedule`] that is used to choose exactly one node to act as the
|
||||
/// attached pageserver for a shard.
|
||||
///
|
||||
/// Returns whether we modified it, and the NodeId selected.
|
||||
fn schedule_attached(
|
||||
&mut self,
|
||||
scheduler: &mut Scheduler,
|
||||
) -> Result<(bool, NodeId), ScheduleError> {
|
||||
// No work to do if we already have an attached tenant
|
||||
if let Some(node_id) = self.intent.attached {
|
||||
return Ok((false, node_id));
|
||||
}
|
||||
|
||||
if let Some(promote_secondary) = scheduler.node_preferred(&self.intent.secondary) {
|
||||
// Promote a secondary
|
||||
tracing::debug!("Promoted secondary {} to attached", promote_secondary);
|
||||
self.intent.promote_attached(scheduler, promote_secondary);
|
||||
Ok((true, promote_secondary))
|
||||
} else {
|
||||
// Pick a fresh node: either we had no secondaries or none were schedulable
|
||||
let node_id = scheduler.schedule_shard(&self.intent.secondary)?;
|
||||
tracing::debug!("Selected {} as attached", node_id);
|
||||
self.intent.set_attached(scheduler, Some(node_id));
|
||||
Ok((true, node_id))
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn schedule(&mut self, scheduler: &mut Scheduler) -> Result<(), ScheduleError> {
|
||||
// TODO: before scheduling new nodes, check if any existing content in
|
||||
// self.intent refers to pageservers that are offline, and pick other
|
||||
@@ -298,36 +436,29 @@ impl TenantState {
|
||||
|
||||
// Build the set of pageservers already in use by this tenant, to avoid scheduling
|
||||
// more work on the same pageservers we're already using.
|
||||
let mut used_pageservers = self.intent.all_pageservers();
|
||||
let mut modified = false;
|
||||
|
||||
use PlacementPolicy::*;
|
||||
match self.policy {
|
||||
Single => {
|
||||
// Should have exactly one attached, and zero secondaries
|
||||
if self.intent.attached.is_none() {
|
||||
let node_id = scheduler.schedule_shard(&used_pageservers)?;
|
||||
self.intent.attached = Some(node_id);
|
||||
used_pageservers.push(node_id);
|
||||
modified = true;
|
||||
}
|
||||
let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?;
|
||||
modified |= modified_attached;
|
||||
|
||||
if !self.intent.secondary.is_empty() {
|
||||
self.intent.secondary.clear();
|
||||
self.intent.clear_secondary(scheduler);
|
||||
modified = true;
|
||||
}
|
||||
}
|
||||
Double(secondary_count) => {
|
||||
// Should have exactly one attached, and N secondaries
|
||||
if self.intent.attached.is_none() {
|
||||
let node_id = scheduler.schedule_shard(&used_pageservers)?;
|
||||
self.intent.attached = Some(node_id);
|
||||
used_pageservers.push(node_id);
|
||||
modified = true;
|
||||
}
|
||||
let (modified_attached, attached_node_id) = self.schedule_attached(scheduler)?;
|
||||
modified |= modified_attached;
|
||||
|
||||
let mut used_pageservers = vec![attached_node_id];
|
||||
while self.intent.secondary.len() < secondary_count {
|
||||
let node_id = scheduler.schedule_shard(&used_pageservers)?;
|
||||
self.intent.secondary.push(node_id);
|
||||
self.intent.push_secondary(scheduler, node_id);
|
||||
used_pageservers.push(node_id);
|
||||
modified = true;
|
||||
}
|
||||
@@ -335,12 +466,12 @@ impl TenantState {
|
||||
Detached => {
|
||||
// Should have no attached or secondary pageservers
|
||||
if self.intent.attached.is_some() {
|
||||
self.intent.attached = None;
|
||||
self.intent.set_attached(scheduler, None);
|
||||
modified = true;
|
||||
}
|
||||
|
||||
if !self.intent.secondary.is_empty() {
|
||||
self.intent.secondary.clear();
|
||||
self.intent.clear_secondary(scheduler);
|
||||
modified = true;
|
||||
}
|
||||
}
|
||||
@@ -406,6 +537,13 @@ impl TenantState {
|
||||
}
|
||||
}
|
||||
|
||||
for node_id in self.observed.locations.keys() {
|
||||
if self.intent.attached != Some(*node_id) && !self.intent.secondary.contains(node_id) {
|
||||
// We have observed state that isn't part of our intent: need to clean it up.
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Even if there is no pageserver work to be done, if we have a pending notification to computes,
|
||||
// wake up a reconciler to send it.
|
||||
if self.pending_compute_notification {
|
||||
@@ -415,6 +553,8 @@ impl TenantState {
|
||||
false
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
|
||||
pub(crate) fn maybe_reconcile(
|
||||
&mut self,
|
||||
result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
|
||||
@@ -422,6 +562,8 @@ impl TenantState {
|
||||
compute_hook: &Arc<ComputeHook>,
|
||||
service_config: &service::Config,
|
||||
persistence: &Arc<Persistence>,
|
||||
gate: &Gate,
|
||||
cancel: &CancellationToken,
|
||||
) -> Option<ReconcilerWaiter> {
|
||||
// If there are any ambiguous observed states, and the nodes they refer to are available,
|
||||
// we should reconcile to clean them up.
|
||||
@@ -443,6 +585,14 @@ impl TenantState {
|
||||
return None;
|
||||
}
|
||||
|
||||
// If we are currently splitting, then never start a reconciler task: the splitting logic
|
||||
// requires that shards are not interfered with while it runs. Do this check here rather than
|
||||
// up top, so that we only log this message if we would otherwise have done a reconciliation.
|
||||
if !matches!(self.splitting, SplitState::Idle) {
|
||||
tracing::info!("Refusing to reconcile, splitting in progress");
|
||||
return None;
|
||||
}
|
||||
|
||||
// Reconcile already in flight for the current sequence?
|
||||
if let Some(handle) = &self.reconciler {
|
||||
if handle.sequence == self.sequence {
|
||||
@@ -460,70 +610,105 @@ impl TenantState {
|
||||
// doing our sequence's work.
|
||||
let old_handle = self.reconciler.take();
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
let Ok(gate_guard) = gate.enter() else {
|
||||
// Shutting down, don't start a reconciler
|
||||
return None;
|
||||
};
|
||||
|
||||
// Advance the sequence before spawning a reconciler, so that sequence waiters
|
||||
// can distinguish between before+after the reconcile completes.
|
||||
self.sequence = self.sequence.next();
|
||||
|
||||
let reconciler_cancel = cancel.child_token();
|
||||
let mut reconciler = Reconciler {
|
||||
tenant_shard_id: self.tenant_shard_id,
|
||||
shard: self.shard,
|
||||
generation: self.generation,
|
||||
intent: self.intent.clone(),
|
||||
intent: TargetState::from_intent(&self.intent),
|
||||
config: self.config.clone(),
|
||||
observed: self.observed.clone(),
|
||||
pageservers: pageservers.clone(),
|
||||
compute_hook: compute_hook.clone(),
|
||||
service_config: service_config.clone(),
|
||||
cancel: cancel.clone(),
|
||||
_gate_guard: gate_guard,
|
||||
cancel: reconciler_cancel.clone(),
|
||||
persistence: persistence.clone(),
|
||||
compute_notify_failure: false,
|
||||
};
|
||||
|
||||
let reconcile_seq = self.sequence;
|
||||
|
||||
tracing::info!("Spawning Reconciler for sequence {}", self.sequence);
|
||||
tracing::info!(seq=%reconcile_seq, "Spawning Reconciler for sequence {}", self.sequence);
|
||||
let must_notify = self.pending_compute_notification;
|
||||
let join_handle = tokio::task::spawn(async move {
|
||||
// Wait for any previous reconcile task to complete before we start
|
||||
if let Some(old_handle) = old_handle {
|
||||
old_handle.cancel.cancel();
|
||||
if let Err(e) = old_handle.handle.await {
|
||||
// We can't do much with this other than log it: the task is done, so
|
||||
// we may proceed with our work.
|
||||
tracing::error!("Unexpected join error waiting for reconcile task: {e}");
|
||||
let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq,
|
||||
tenant_id=%reconciler.tenant_shard_id.tenant_id,
|
||||
shard_id=%reconciler.tenant_shard_id.shard_slug());
|
||||
metrics::RECONCILER.spawned.inc();
|
||||
let join_handle = tokio::task::spawn(
|
||||
async move {
|
||||
// Wait for any previous reconcile task to complete before we start
|
||||
if let Some(old_handle) = old_handle {
|
||||
old_handle.cancel.cancel();
|
||||
if let Err(e) = old_handle.handle.await {
|
||||
// We can't do much with this other than log it: the task is done, so
|
||||
// we may proceed with our work.
|
||||
tracing::error!("Unexpected join error waiting for reconcile task: {e}");
|
||||
}
|
||||
}
|
||||
|
||||
// Early check for cancellation before doing any work
|
||||
// TODO: wrap all remote API operations in cancellation check
|
||||
// as well.
|
||||
if reconciler.cancel.is_cancelled() {
|
||||
metrics::RECONCILER
|
||||
.complete
|
||||
.with_label_values(&[metrics::ReconcilerMetrics::CANCEL])
|
||||
.inc();
|
||||
return;
|
||||
}
|
||||
|
||||
// Attempt to make observed state match intent state
|
||||
let result = reconciler.reconcile().await;
|
||||
|
||||
// If we know we had a pending compute notification from some previous action, send a notification irrespective
|
||||
// of whether the above reconcile() did any work
|
||||
if result.is_ok() && must_notify {
|
||||
// If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`]
|
||||
reconciler.compute_notify().await.ok();
|
||||
}
|
||||
|
||||
// Update result counter
|
||||
match &result {
|
||||
Ok(_) => metrics::RECONCILER
|
||||
.complete
|
||||
.with_label_values(&[metrics::ReconcilerMetrics::SUCCESS]),
|
||||
Err(ReconcileError::Cancel) => metrics::RECONCILER
|
||||
.complete
|
||||
.with_label_values(&[metrics::ReconcilerMetrics::CANCEL]),
|
||||
Err(_) => metrics::RECONCILER
|
||||
.complete
|
||||
.with_label_values(&[metrics::ReconcilerMetrics::ERROR]),
|
||||
}
|
||||
.inc();
|
||||
|
||||
result_tx
|
||||
.send(ReconcileResult {
|
||||
sequence: reconcile_seq,
|
||||
result,
|
||||
tenant_shard_id: reconciler.tenant_shard_id,
|
||||
generation: reconciler.generation,
|
||||
observed: reconciler.observed,
|
||||
pending_compute_notification: reconciler.compute_notify_failure,
|
||||
})
|
||||
.ok();
|
||||
}
|
||||
|
||||
// Early check for cancellation before doing any work
|
||||
// TODO: wrap all remote API operations in cancellation check
|
||||
// as well.
|
||||
if reconciler.cancel.is_cancelled() {
|
||||
return;
|
||||
}
|
||||
|
||||
// Attempt to make observed state match intent state
|
||||
let result = reconciler.reconcile().await;
|
||||
|
||||
// If we know we had a pending compute notification from some previous action, send a notification irrespective
|
||||
// of whether the above reconcile() did any work
|
||||
if result.is_ok() && must_notify {
|
||||
// If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`]
|
||||
reconciler.compute_notify().await.ok();
|
||||
}
|
||||
|
||||
result_tx
|
||||
.send(ReconcileResult {
|
||||
sequence: reconcile_seq,
|
||||
result,
|
||||
tenant_shard_id: reconciler.tenant_shard_id,
|
||||
generation: reconciler.generation,
|
||||
observed: reconciler.observed,
|
||||
pending_compute_notification: reconciler.compute_notify_failure,
|
||||
})
|
||||
.ok();
|
||||
});
|
||||
.instrument(reconciler_span),
|
||||
);
|
||||
|
||||
self.reconciler = Some(ReconcilerHandle {
|
||||
sequence: self.sequence,
|
||||
handle: join_handle,
|
||||
cancel,
|
||||
cancel: reconciler_cancel,
|
||||
});
|
||||
|
||||
Some(ReconcilerWaiter {
|
||||
@@ -548,4 +733,103 @@ impl TenantState {
|
||||
|
||||
debug_assert!(!self.intent.all_pageservers().contains(&node_id));
|
||||
}
|
||||
|
||||
pub(crate) fn to_persistent(&self) -> TenantShardPersistence {
|
||||
TenantShardPersistence {
|
||||
tenant_id: self.tenant_shard_id.tenant_id.to_string(),
|
||||
shard_number: self.tenant_shard_id.shard_number.0 as i32,
|
||||
shard_count: self.tenant_shard_id.shard_count.literal() as i32,
|
||||
shard_stripe_size: self.shard.stripe_size.0 as i32,
|
||||
generation: self.generation.into().unwrap_or(0) as i32,
|
||||
generation_pageserver: self
|
||||
.intent
|
||||
.get_attached()
|
||||
.map(|n| n.0 as i64)
|
||||
.unwrap_or(i64::MAX),
|
||||
|
||||
placement_policy: serde_json::to_string(&self.policy).unwrap(),
|
||||
config: serde_json::to_string(&self.config).unwrap(),
|
||||
splitting: SplitState::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) mod tests {
|
||||
use pageserver_api::shard::{ShardCount, ShardNumber};
|
||||
use utils::id::TenantId;
|
||||
|
||||
use crate::scheduler::test_utils::make_test_nodes;
|
||||
|
||||
use super::*;
|
||||
|
||||
fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantState {
|
||||
let tenant_id = TenantId::generate();
|
||||
let shard_number = ShardNumber(0);
|
||||
let shard_count = ShardCount::new(1);
|
||||
|
||||
let tenant_shard_id = TenantShardId {
|
||||
tenant_id,
|
||||
shard_number,
|
||||
shard_count,
|
||||
};
|
||||
TenantState::new(
|
||||
tenant_shard_id,
|
||||
ShardIdentity::new(
|
||||
shard_number,
|
||||
shard_count,
|
||||
pageserver_api::shard::ShardStripeSize(32768),
|
||||
)
|
||||
.unwrap(),
|
||||
policy,
|
||||
)
|
||||
}
|
||||
|
||||
/// Test the scheduling behaviors used when a tenant configured for HA is subject
|
||||
/// to nodes being marked offline.
|
||||
#[test]
|
||||
fn tenant_ha_scheduling() -> anyhow::Result<()> {
|
||||
// Start with three nodes. Our tenant will only use two. The third one is
|
||||
// expected to remain unused.
|
||||
let mut nodes = make_test_nodes(3);
|
||||
|
||||
let mut scheduler = Scheduler::new(nodes.values());
|
||||
|
||||
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1));
|
||||
tenant_state
|
||||
.schedule(&mut scheduler)
|
||||
.expect("we have enough nodes, scheduling should work");
|
||||
|
||||
// Expect the shard to initially be scheduled onto two different nodes
|
||||
assert_eq!(tenant_state.intent.secondary.len(), 1);
|
||||
assert!(tenant_state.intent.attached.is_some());
|
||||
|
||||
let attached_node_id = tenant_state.intent.attached.unwrap();
|
||||
let secondary_node_id = *tenant_state.intent.secondary.iter().last().unwrap();
|
||||
assert_ne!(attached_node_id, secondary_node_id);
|
||||
|
||||
// Notifying that the attached node is offline should demote it to a secondary
|
||||
let changed = tenant_state.intent.notify_offline(attached_node_id);
|
||||
assert!(changed);
|
||||
|
||||
// Update the scheduler state to indicate the node is offline
|
||||
nodes.get_mut(&attached_node_id).unwrap().availability = NodeAvailability::Offline;
|
||||
scheduler.node_upsert(nodes.get(&attached_node_id).unwrap());
|
||||
|
||||
// Scheduling the shard should promote the still-available secondary node to attached
|
||||
tenant_state
|
||||
.schedule(&mut scheduler)
|
||||
.expect("active nodes are available");
|
||||
assert_eq!(tenant_state.intent.attached.unwrap(), secondary_node_id);
|
||||
|
||||
// The original attached node should have been retained as a secondary
|
||||
assert_eq!(
|
||||
*tenant_state.intent.secondary.iter().last().unwrap(),
|
||||
attached_node_id
|
||||
);
|
||||
|
||||
tenant_state.intent.clear(&mut scheduler);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -113,7 +113,7 @@ pub struct TenantShardMigrateRequest {
|
||||
pub node_id: NodeId,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Copy)]
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
|
||||
pub enum NodeAvailability {
|
||||
// Normal, happy state
|
||||
Active,
|
||||
@@ -137,7 +137,7 @@ impl FromStr for NodeAvailability {
|
||||
|
||||
/// FIXME: this is a duplicate of the type in the attachment_service crate, because the
|
||||
/// type needs to be defined with diesel traits in there.
|
||||
#[derive(Serialize, Deserialize, Clone, Copy)]
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
|
||||
pub enum NodeSchedulingPolicy {
|
||||
Active,
|
||||
Filling,
|
||||
|
||||
@@ -450,7 +450,7 @@ async fn handle_tenant(
|
||||
new_tenant_id: TenantShardId::unsharded(tenant_id),
|
||||
generation: None,
|
||||
shard_parameters: ShardParameters {
|
||||
count: ShardCount(shard_count),
|
||||
count: ShardCount::new(shard_count),
|
||||
stripe_size: shard_stripe_size
|
||||
.map(ShardStripeSize)
|
||||
.unwrap_or(ShardParameters::DEFAULT_STRIPE_SIZE),
|
||||
@@ -652,6 +652,10 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
||||
let name = import_match
|
||||
.get_one::<String>("node-name")
|
||||
.ok_or_else(|| anyhow!("No node name provided"))?;
|
||||
let update_catalog = import_match
|
||||
.get_one::<bool>("update-catalog")
|
||||
.cloned()
|
||||
.unwrap_or_default();
|
||||
|
||||
// Parse base inputs
|
||||
let base_tarfile = import_match
|
||||
@@ -694,6 +698,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
||||
None,
|
||||
pg_version,
|
||||
ComputeMode::Primary,
|
||||
!update_catalog,
|
||||
)?;
|
||||
println!("Done");
|
||||
}
|
||||
@@ -831,6 +836,10 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
.get_one::<String>("endpoint_id")
|
||||
.map(String::to_string)
|
||||
.unwrap_or_else(|| format!("ep-{branch_name}"));
|
||||
let update_catalog = sub_args
|
||||
.get_one::<bool>("update-catalog")
|
||||
.cloned()
|
||||
.unwrap_or_default();
|
||||
|
||||
let lsn = sub_args
|
||||
.get_one::<String>("lsn")
|
||||
@@ -880,6 +889,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
http_port,
|
||||
pg_version,
|
||||
mode,
|
||||
!update_catalog,
|
||||
)?;
|
||||
}
|
||||
"start" => {
|
||||
@@ -918,6 +928,11 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
.get(endpoint_id.as_str())
|
||||
.ok_or_else(|| anyhow::anyhow!("endpoint {endpoint_id} not found"))?;
|
||||
|
||||
let create_test_user = sub_args
|
||||
.get_one::<bool>("create-test-user")
|
||||
.cloned()
|
||||
.unwrap_or_default();
|
||||
|
||||
cplane.check_conflicting_endpoints(
|
||||
endpoint.mode,
|
||||
endpoint.tenant_id,
|
||||
@@ -972,6 +987,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
pageservers,
|
||||
remote_ext_config,
|
||||
stripe_size.0 as usize,
|
||||
create_test_user,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
@@ -1457,6 +1473,18 @@ fn cli() -> Command {
|
||||
.required(false)
|
||||
.default_value("1");
|
||||
|
||||
let update_catalog = Arg::new("update-catalog")
|
||||
.value_parser(value_parser!(bool))
|
||||
.long("update-catalog")
|
||||
.help("If set, will set up the catalog for neon_superuser")
|
||||
.required(false);
|
||||
|
||||
let create_test_user = Arg::new("create-test-user")
|
||||
.value_parser(value_parser!(bool))
|
||||
.long("create-test-user")
|
||||
.help("If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`")
|
||||
.required(false);
|
||||
|
||||
Command::new("Neon CLI")
|
||||
.arg_required_else_help(true)
|
||||
.version(GIT_VERSION)
|
||||
@@ -1517,6 +1545,7 @@ fn cli() -> Command {
|
||||
.arg(Arg::new("end-lsn").long("end-lsn")
|
||||
.help("Lsn the basebackup ends at"))
|
||||
.arg(pg_version_arg.clone())
|
||||
.arg(update_catalog.clone())
|
||||
)
|
||||
).subcommand(
|
||||
Command::new("tenant")
|
||||
@@ -1630,6 +1659,7 @@ fn cli() -> Command {
|
||||
.required(false))
|
||||
.arg(pg_version_arg.clone())
|
||||
.arg(hot_standby_arg.clone())
|
||||
.arg(update_catalog)
|
||||
)
|
||||
.subcommand(Command::new("start")
|
||||
.about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
|
||||
@@ -1637,6 +1667,7 @@ fn cli() -> Command {
|
||||
.arg(endpoint_pageserver_id_arg.clone())
|
||||
.arg(safekeepers_arg)
|
||||
.arg(remote_ext_config_args)
|
||||
.arg(create_test_user)
|
||||
)
|
||||
.subcommand(Command::new("reconfigure")
|
||||
.about("Reconfigure the endpoint")
|
||||
|
||||
@@ -41,11 +41,15 @@ use std::net::SocketAddr;
|
||||
use std::net::TcpStream;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use compute_api::spec::Database;
|
||||
use compute_api::spec::PgIdent;
|
||||
use compute_api::spec::RemoteExtSpec;
|
||||
use compute_api::spec::Role;
|
||||
use nix::sys::signal::kill;
|
||||
use nix::sys::signal::Signal;
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -122,6 +126,7 @@ impl ComputeControlPlane {
|
||||
http_port: Option<u16>,
|
||||
pg_version: u32,
|
||||
mode: ComputeMode,
|
||||
skip_pg_catalog_updates: bool,
|
||||
) -> Result<Arc<Endpoint>> {
|
||||
let pg_port = pg_port.unwrap_or_else(|| self.get_port());
|
||||
let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
|
||||
@@ -140,7 +145,7 @@ impl ComputeControlPlane {
|
||||
// before and after start are the same. So, skip catalog updates,
|
||||
// with this we basically test a case of waking up an idle compute, where
|
||||
// we also skip catalog updates in the cloud.
|
||||
skip_pg_catalog_updates: true,
|
||||
skip_pg_catalog_updates,
|
||||
features: vec![],
|
||||
});
|
||||
|
||||
@@ -155,7 +160,7 @@ impl ComputeControlPlane {
|
||||
http_port,
|
||||
pg_port,
|
||||
pg_version,
|
||||
skip_pg_catalog_updates: true,
|
||||
skip_pg_catalog_updates,
|
||||
features: vec![],
|
||||
})?,
|
||||
)?;
|
||||
@@ -500,6 +505,7 @@ impl Endpoint {
|
||||
pageservers: Vec<(Host, u16)>,
|
||||
remote_ext_config: Option<&String>,
|
||||
shard_stripe_size: usize,
|
||||
create_test_user: bool,
|
||||
) -> Result<()> {
|
||||
if self.status() == EndpointStatus::Running {
|
||||
anyhow::bail!("The endpoint is already running");
|
||||
@@ -551,8 +557,26 @@ impl Endpoint {
|
||||
cluster_id: None, // project ID: not used
|
||||
name: None, // project name: not used
|
||||
state: None,
|
||||
roles: vec![],
|
||||
databases: vec![],
|
||||
roles: if create_test_user {
|
||||
vec![Role {
|
||||
name: PgIdent::from_str("test").unwrap(),
|
||||
encrypted_password: None,
|
||||
options: None,
|
||||
}]
|
||||
} else {
|
||||
Vec::new()
|
||||
},
|
||||
databases: if create_test_user {
|
||||
vec![Database {
|
||||
name: PgIdent::from_str("neondb").unwrap(),
|
||||
owner: PgIdent::from_str("test").unwrap(),
|
||||
options: None,
|
||||
restrict_conn: false,
|
||||
invalid: false,
|
||||
}]
|
||||
} else {
|
||||
Vec::new()
|
||||
},
|
||||
settings: None,
|
||||
postgresql_conf: Some(postgresql_conf),
|
||||
},
|
||||
@@ -566,6 +590,7 @@ impl Endpoint {
|
||||
remote_extensions,
|
||||
pgbouncer_settings: None,
|
||||
shard_stripe_size: Some(shard_stripe_size),
|
||||
primary_is_running: None,
|
||||
};
|
||||
let spec_path = self.endpoint_path().join("spec.json");
|
||||
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
|
||||
@@ -577,11 +602,16 @@ impl Endpoint {
|
||||
.open(self.endpoint_path().join("compute.log"))?;
|
||||
|
||||
// Launch compute_ctl
|
||||
println!("Starting postgres node at '{}'", self.connstr());
|
||||
let conn_str = self.connstr("cloud_admin", "postgres");
|
||||
println!("Starting postgres node at '{}'", conn_str);
|
||||
if create_test_user {
|
||||
let conn_str = self.connstr("user", "neondb");
|
||||
println!("Also at '{}'", conn_str);
|
||||
}
|
||||
let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
|
||||
cmd.args(["--http-port", &self.http_address.port().to_string()])
|
||||
.args(["--pgdata", self.pgdata().to_str().unwrap()])
|
||||
.args(["--connstr", &self.connstr()])
|
||||
.args(["--connstr", &conn_str])
|
||||
.args([
|
||||
"--spec-path",
|
||||
self.endpoint_path().join("spec.json").to_str().unwrap(),
|
||||
@@ -652,7 +682,9 @@ impl Endpoint {
|
||||
}
|
||||
ComputeStatus::Empty
|
||||
| ComputeStatus::ConfigurationPending
|
||||
| ComputeStatus::Configuration => {
|
||||
| ComputeStatus::Configuration
|
||||
| ComputeStatus::TerminationPending
|
||||
| ComputeStatus::Terminated => {
|
||||
bail!("unexpected compute status: {:?}", state.status)
|
||||
}
|
||||
}
|
||||
@@ -783,13 +815,13 @@ impl Endpoint {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn connstr(&self) -> String {
|
||||
pub fn connstr(&self, user: &str, db_name: &str) -> String {
|
||||
format!(
|
||||
"postgresql://{}@{}:{}/{}",
|
||||
"cloud_admin",
|
||||
user,
|
||||
self.pg_address.ip(),
|
||||
self.pg_address.port(),
|
||||
"postgres"
|
||||
db_name
|
||||
)
|
||||
}
|
||||
}
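// --- Illustrative sketch, not part of the upstream diff ---
// Example of the two connection strings printed by `start` for a local endpoint;
// the address and port here are made-up values.
#[cfg(test)]
fn connstr_format_sketch() {
    let admin = format!("postgresql://{}@{}:{}/{}", "cloud_admin", "127.0.0.1", 55432, "postgres");
    let test_user = format!("postgresql://{}@{}:{}/{}", "user", "127.0.0.1", 55432, "neondb");
    assert_eq!(admin, "postgresql://cloud_admin@127.0.0.1:55432/postgres");
    assert_eq!(test_user, "postgresql://user@127.0.0.1:55432/neondb");
}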
|
||||
|
||||
@@ -210,6 +210,25 @@ impl PageServerNode {
|
||||
update_config: bool,
|
||||
register: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
// Register the node with the storage controller before starting pageserver: pageserver must be registered to
|
||||
// successfully call /re-attach and finish starting up.
|
||||
if register {
|
||||
let attachment_service = AttachmentService::from_env(&self.env);
|
||||
let (pg_host, pg_port) =
|
||||
parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
|
||||
let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr)
|
||||
.expect("Unable to parse listen_http_addr");
|
||||
attachment_service
|
||||
.node_register(NodeRegisterRequest {
|
||||
node_id: self.conf.id,
|
||||
listen_pg_addr: pg_host.to_string(),
|
||||
listen_pg_port: pg_port.unwrap_or(5432),
|
||||
listen_http_addr: http_host.to_string(),
|
||||
listen_http_port: http_port.unwrap_or(80),
|
||||
})
|
||||
.await?;
|
||||
}
|
||||
|
||||
// TODO: using a thread here because start_process() is not async but we need to call check_status()
|
||||
let datadir = self.repo_path();
|
||||
print!(
|
||||
@@ -248,23 +267,6 @@ impl PageServerNode {
|
||||
)
|
||||
.await?;
|
||||
|
||||
if register {
|
||||
let attachment_service = AttachmentService::from_env(&self.env);
|
||||
let (pg_host, pg_port) =
|
||||
parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
|
||||
let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr)
|
||||
.expect("Unable to parse listen_http_addr");
|
||||
attachment_service
|
||||
.node_register(NodeRegisterRequest {
|
||||
node_id: self.conf.id,
|
||||
listen_pg_addr: pg_host.to_string(),
|
||||
listen_pg_port: pg_port.unwrap_or(5432),
|
||||
listen_http_addr: http_host.to_string(),
|
||||
listen_http_port: http_port.unwrap_or(80),
|
||||
})
|
||||
.await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -400,6 +402,11 @@ impl PageServerNode {
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'lazy_slru_download' as bool")?,
|
||||
timeline_get_throttle: settings
|
||||
.remove("timeline_get_throttle")
|
||||
.map(serde_json::from_str)
|
||||
.transpose()
|
||||
.context("parse `timeline_get_throttle` from json")?,
|
||||
};
|
||||
if !settings.is_empty() {
|
||||
bail!("Unrecognized tenant settings: {settings:?}")
|
||||
@@ -505,6 +512,11 @@ impl PageServerNode {
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'lazy_slru_download' as bool")?,
|
||||
timeline_get_throttle: settings
|
||||
.remove("timeline_get_throttle")
|
||||
.map(serde_json::from_str)
|
||||
.transpose()
|
||||
.context("parse `timeline_get_throttle` from json")?,
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -52,6 +52,10 @@ pub enum ComputeStatus {
|
||||
// compute will exit soon or is waiting for
|
||||
// control-plane to terminate it.
|
||||
Failed,
|
||||
// Termination requested
|
||||
TerminationPending,
|
||||
// Terminated Postgres
|
||||
Terminated,
|
||||
}
|
||||
|
||||
fn rfc3339_serialize<S>(x: &Option<DateTime<Utc>>, s: S) -> Result<S::Ok, S::Error>
|
||||
|
||||
@@ -79,6 +79,12 @@ pub struct ComputeSpec {
|
||||
// Stripe size for pageserver sharding, in pages
|
||||
#[serde(default)]
|
||||
pub shard_stripe_size: Option<usize>,
|
||||
|
||||
// When we are starting a new replica in hot standby mode,
|
||||
// we need to know if the primary is running.
|
||||
// This is used to determine if replica should wait for
|
||||
// RUNNING_XACTS from primary or not.
|
||||
pub primary_is_running: Option<bool>,
|
||||
}
|
||||
|
||||
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
|
||||
|
||||
@@ -115,7 +115,6 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) {
|
||||
// performed by the process.
|
||||
// We know the size of the block, so we can determine the I/O bytes out of it.
|
||||
// The value might be not 100% exact, but should be fine for Prometheus metrics in this case.
|
||||
#[allow(clippy::unnecessary_cast)]
|
||||
fn update_rusage_metrics() {
|
||||
let rusage_stats = get_rusage_stats();
|
||||
|
||||
@@ -202,6 +201,11 @@ impl<P: Atomic> GenericCounterPairVec<P> {
|
||||
pub fn with_label_values(&self, vals: &[&str]) -> GenericCounterPair<P> {
|
||||
self.get_metric_with_label_values(vals).unwrap()
|
||||
}
|
||||
|
||||
pub fn remove_label_values(&self, res: &mut [Result<()>; 2], vals: &[&str]) {
|
||||
res[0] = self.inc.remove_label_values(vals);
|
||||
res[1] = self.dec.remove_label_values(vals);
|
||||
}
|
||||
}
|
||||
|
||||
impl<P: Atomic> GenericCounterPair<P> {
|
||||
@@ -248,6 +252,15 @@ impl<P: Atomic> GenericCounterPair<P> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<P: Atomic> Clone for GenericCounterPair<P> {
|
||||
fn clone(&self) -> Self {
|
||||
Self {
|
||||
inc: self.inc.clone(),
|
||||
dec: self.dec.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Guard returned by [`GenericCounterPair::guard`]
|
||||
pub struct GenericCounterPairGuard<P: Atomic>(GenericCounter<P>);
|
||||
|
||||
|
||||
@@ -18,9 +18,11 @@ enum-map.workspace = true
|
||||
strum.workspace = true
|
||||
strum_macros.workspace = true
|
||||
hex.workspace = true
|
||||
humantime.workspace = true
|
||||
thiserror.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
chrono.workspace = true
|
||||
itertools.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
|
||||
@@ -2,6 +2,7 @@ use postgres_ffi::BLCKSZ;
|
||||
use std::ops::Range;
|
||||
|
||||
use crate::key::Key;
|
||||
use itertools::Itertools;
|
||||
|
||||
///
|
||||
/// Represents a set of Keys, in a compact form.
|
||||
@@ -63,9 +64,36 @@ impl KeySpace {
|
||||
KeyPartitioning { parts }
|
||||
}
|
||||
|
||||
/// Update the keyspace such that it doesn't contain any range
|
||||
/// that is overlapping with `other`. This can involve splitting or
|
||||
/// removing of existing ranges.
|
||||
/// Merge another keyspace into the current one.
|
||||
/// Note: the keyspaces must not overlap (enforced via assertions)
|
||||
pub fn merge(&mut self, other: &KeySpace) {
|
||||
let all_ranges = self
|
||||
.ranges
|
||||
.iter()
|
||||
.merge_by(other.ranges.iter(), |lhs, rhs| lhs.start < rhs.start);
|
||||
|
||||
let mut accum = KeySpaceAccum::new();
|
||||
let mut prev: Option<&Range<Key>> = None;
|
||||
for range in all_ranges {
|
||||
if let Some(prev) = prev {
|
||||
let overlap =
|
||||
std::cmp::max(range.start, prev.start) < std::cmp::min(range.end, prev.end);
|
||||
assert!(
|
||||
!overlap,
|
||||
"Attempt to merge ovelapping keyspaces: {:?} overlaps {:?}",
|
||||
prev, range
|
||||
);
|
||||
}
|
||||
|
||||
accum.add_range(range.clone());
|
||||
prev = Some(range);
|
||||
}
|
||||
|
||||
self.ranges = accum.to_keyspace().ranges;
|
||||
}
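// --- Illustrative sketch, not part of the upstream diff ---
// Merging two disjoint keyspaces yields one sorted list of ranges; overlapping
// inputs trip the assertion above. `Key::from_i128` is assumed to exist in this
// crate's key module (adjust the constructor if it differs).
#[cfg(test)]
fn keyspace_merge_sketch() {
    let k = Key::from_i128;
    let mut a = KeySpace { ranges: vec![k(0)..k(10)] };
    let b = KeySpace { ranges: vec![k(20)..k(30)] };
    a.merge(&b);
    assert_eq!(a.ranges, vec![k(0)..k(10), k(20)..k(30)]);
}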
|
||||
|
||||
/// Remove all keys in `other` from `self`.
|
||||
/// This can involve splitting or removing of existing ranges.
|
||||
pub fn remove_overlapping_with(&mut self, other: &KeySpace) {
|
||||
let (self_start, self_end) = match (self.start(), self.end()) {
|
||||
(Some(start), Some(end)) => (start, end),
|
||||
@@ -220,16 +248,7 @@ impl KeySpaceAccum {
|
||||
}
|
||||
|
||||
pub fn consume_keyspace(&mut self) -> KeySpace {
|
||||
if let Some(accum) = self.accum.take() {
|
||||
self.ranges.push(accum);
|
||||
}
|
||||
|
||||
let mut prev_accum = KeySpaceAccum::new();
|
||||
std::mem::swap(self, &mut prev_accum);
|
||||
|
||||
KeySpace {
|
||||
ranges: prev_accum.ranges,
|
||||
}
|
||||
std::mem::take(self).to_keyspace()
|
||||
}
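// --- Illustrative note, not part of the upstream diff ---
// `std::mem::take(self)` above relies on KeySpaceAccum implementing Default: it swaps
// an empty accumulator into place and returns the previous contents, which is what the
// manual `std::mem::swap` did before this change. A minimal standalone example:
#[cfg(test)]
fn mem_take_idiom_sketch() {
    let mut v = vec![1, 2, 3];
    let taken = std::mem::take(&mut v); // `v` is now Default (empty)
    assert!(v.is_empty());
    assert_eq!(taken, vec![1, 2, 3]);
}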
|
||||
|
||||
pub fn size(&self) -> u64 {
|
||||
@@ -279,6 +298,13 @@ impl KeySpaceRandomAccum {
|
||||
}
|
||||
KeySpace { ranges }
|
||||
}
|
||||
|
||||
pub fn consume_keyspace(&mut self) -> KeySpace {
|
||||
let mut prev_accum = KeySpaceRandomAccum::new();
|
||||
std::mem::swap(self, &mut prev_accum);
|
||||
|
||||
prev_accum.to_keyspace()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn key_range_size(key_range: &Range<Key>) -> u32 {
|
||||
|
||||
@@ -1,4 +1,7 @@
|
||||
pub mod partitioning;
|
||||
pub mod utilization;
|
||||
|
||||
pub use utilization::PageserverUtilization;
|
||||
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
@@ -180,7 +183,7 @@ pub enum TimelineState {
|
||||
Broken { reason: String, backtrace: String },
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct TimelineCreateRequest {
|
||||
pub new_timeline_id: TimelineId,
|
||||
#[serde(default)]
|
||||
@@ -214,14 +217,14 @@ impl ShardParameters {
|
||||
pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
|
||||
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.count == ShardCount(0)
|
||||
self.count.is_unsharded()
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ShardParameters {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
count: ShardCount(0),
|
||||
count: ShardCount::new(0),
|
||||
stripe_size: Self::DEFAULT_STRIPE_SIZE,
|
||||
}
|
||||
}
|
||||
@@ -283,6 +286,15 @@ pub struct TenantConfig {
|
||||
pub gc_feedback: Option<bool>,
|
||||
pub heatmap_period: Option<String>,
|
||||
pub lazy_slru_download: Option<bool>,
|
||||
pub timeline_get_throttle: Option<ThrottleConfig>,
|
||||
}
|
||||
|
||||
impl TenantConfig {
|
||||
pub fn merge(&mut self, other: &TenantConfig) {
|
||||
if let Some(max_lsn_wal_lag) = &other.max_lsn_wal_lag {
|
||||
self.max_lsn_wal_lag = Some(*max_lsn_wal_lag);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
@@ -290,6 +302,7 @@ pub struct TenantConfig {
|
||||
pub enum EvictionPolicy {
|
||||
NoEviction,
|
||||
LayerAccessThreshold(EvictionPolicyLayerAccessThreshold),
|
||||
OnlyImitiate(EvictionPolicyLayerAccessThreshold),
|
||||
}
|
||||
|
||||
impl EvictionPolicy {
|
||||
@@ -297,6 +310,7 @@ impl EvictionPolicy {
|
||||
match self {
|
||||
EvictionPolicy::NoEviction => "NoEviction",
|
||||
EvictionPolicy::LayerAccessThreshold(_) => "LayerAccessThreshold",
|
||||
EvictionPolicy::OnlyImitiate(_) => "OnlyImitiate",
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -309,10 +323,39 @@ pub struct EvictionPolicyLayerAccessThreshold {
|
||||
pub threshold: Duration,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
|
||||
pub struct ThrottleConfig {
|
||||
pub task_kinds: Vec<String>, // TaskKind
|
||||
pub initial: usize,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub refill_interval: Duration,
|
||||
pub refill_amount: NonZeroUsize,
|
||||
pub max: usize,
|
||||
pub fair: bool,
|
||||
}
|
||||
|
||||
impl ThrottleConfig {
|
||||
pub fn disabled() -> Self {
|
||||
Self {
|
||||
task_kinds: vec![], // effectively disables the throttle
|
||||
// other values don't matter with empty `task_kinds`.
|
||||
initial: 0,
|
||||
refill_interval: Duration::from_millis(1),
|
||||
refill_amount: NonZeroUsize::new(1).unwrap(),
|
||||
max: 1,
|
||||
fair: true,
|
||||
}
|
||||
}
|
||||
/// The requests per second allowed by the given config.
|
||||
pub fn steady_rps(&self) -> f64 {
|
||||
(self.refill_amount.get() as f64) / (self.refill_interval.as_secs_f64())
|
||||
}
|
||||
}
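// --- Illustrative sketch, not part of the upstream diff ---
// Worked example for steady_rps(): refilling 100 tokens every 250ms sustains
// 100 / 0.25 = 400 requests per second. The task kind name below is hypothetical.
#[cfg(test)]
fn throttle_steady_rps_sketch() {
    let cfg = ThrottleConfig {
        task_kinds: vec!["SomeTaskKind".to_string()], // hypothetical TaskKind name
        initial: 0,
        refill_interval: std::time::Duration::from_millis(250),
        refill_amount: std::num::NonZeroUsize::new(100).unwrap(),
        max: 100,
        fair: true,
    };
    assert!((cfg.steady_rps() - 400.0).abs() < 1e-9);
}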
|
||||
|
||||
/// A flattened analog of a `pagesever::tenant::LocationMode`, which
|
||||
/// lists out all possible states (and the virtual "Detached" state)
|
||||
/// in a flat form rather than using rust-style enums.
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, Copy, Eq, PartialEq)]
|
||||
pub enum LocationConfigMode {
|
||||
AttachedSingle,
|
||||
AttachedMulti,
|
||||
@@ -376,6 +419,12 @@ pub struct TenantLocationConfigRequest {
|
||||
pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantTimeTravelRequest {
|
||||
pub shard_counts: Vec<ShardCount>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantShardLocation {
|
||||
|
||||
libs/pageserver_api/src/models/utilization.rs (new file, 70 lines)
@@ -0,0 +1,70 @@
|
||||
use std::time::SystemTime;
|
||||
|
||||
/// Pageserver current utilization and a score for how good a candidate this pageserver would be for
|
||||
/// the next tenant.
|
||||
///
|
||||
/// See and maintain pageserver openapi spec for `/v1/utilization_score` as the truth.
|
||||
///
|
||||
/// `format: int64` fields must use `ser_saturating_u63` because openapi generated clients might
|
||||
/// not handle full u64 values properly.
|
||||
#[derive(serde::Serialize, Debug)]
|
||||
pub struct PageserverUtilization {
|
||||
/// Used disk space
|
||||
#[serde(serialize_with = "ser_saturating_u63")]
|
||||
pub disk_usage_bytes: u64,
|
||||
/// Free disk space
|
||||
#[serde(serialize_with = "ser_saturating_u63")]
|
||||
pub free_space_bytes: u64,
|
||||
/// Lower-is-better score for how good a candidate this pageserver would be for the next tenant.
|
||||
#[serde(serialize_with = "ser_saturating_u63")]
|
||||
pub utilization_score: u64,
|
||||
/// When was this snapshot captured, pageserver local time.
|
||||
///
|
||||
/// Use millis to give confidence that the value is regenerated often enough.
|
||||
#[serde(serialize_with = "ser_rfc3339_millis")]
|
||||
pub captured_at: SystemTime,
|
||||
}
|
||||
|
||||
fn ser_rfc3339_millis<S: serde::Serializer>(
|
||||
ts: &SystemTime,
|
||||
serializer: S,
|
||||
) -> Result<S::Ok, S::Error> {
|
||||
serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
|
||||
}
|
||||
|
||||
/// openapi knows only `format: int64`, so avoid outputting a value that generated clients cannot parse.
|
||||
///
|
||||
/// Instead of a newtype, use this because a newtype would require handling deserializing values
|
||||
/// with the highest bit set which is properly parsed by serde formats, but would create a
|
||||
/// conundrum on how to handle and again serialize such values at type level. It will be a few
|
||||
/// years until we can use more than `i64::MAX` bytes on a disk.
|
||||
fn ser_saturating_u63<S: serde::Serializer>(value: &u64, serializer: S) -> Result<S::Ok, S::Error> {
|
||||
const MAX_FORMAT_INT64: u64 = i64::MAX as u64;
|
||||
|
||||
let value = (*value).min(MAX_FORMAT_INT64);
|
||||
|
||||
serializer.serialize_u64(value)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::time::Duration;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn u64_max_is_serialized_as_u63_max() {
|
||||
let doc = PageserverUtilization {
|
||||
disk_usage_bytes: u64::MAX,
|
||||
free_space_bytes: 0,
|
||||
utilization_score: u64::MAX,
|
||||
captured_at: SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
|
||||
};
|
||||
|
||||
let s = serde_json::to_string(&doc).unwrap();
|
||||
|
||||
let expected = r#"{"disk_usage_bytes":9223372036854775807,"free_space_bytes":0,"utilization_score":9223372036854775807,"captured_at":"2024-02-21T10:02:59.000Z"}"#;
|
||||
|
||||
assert_eq!(s, expected);
|
||||
}
|
||||
}
|
||||
@@ -13,10 +13,41 @@ use utils::id::TenantId;
|
||||
pub struct ShardNumber(pub u8);
|
||||
|
||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
||||
pub struct ShardCount(pub u8);
|
||||
pub struct ShardCount(u8);
|
||||
|
||||
impl ShardCount {
|
||||
pub const MAX: Self = Self(u8::MAX);
|
||||
|
||||
/// The internal value of a ShardCount may be zero, which means "1 shard, but use
|
||||
/// legacy format for TenantShardId that excludes the shard suffix", also known
|
||||
/// as `TenantShardId::unsharded`.
|
||||
///
|
||||
/// This method returns the actual number of shards, i.e. if our internal value is
|
||||
/// zero, we return 1 (unsharded tenants have 1 shard).
|
||||
pub fn count(&self) -> u8 {
|
||||
if self.0 > 0 {
|
||||
self.0
|
||||
} else {
|
||||
1
|
||||
}
|
||||
}
|
||||
|
||||
/// The literal internal value: this is **not** the number of shards in the
|
||||
/// tenant, as we have a special zero value for legacy unsharded tenants. Use
|
||||
/// [`Self::count`] if you want to know the cardinality of shards.
|
||||
pub fn literal(&self) -> u8 {
|
||||
self.0
|
||||
}
|
||||
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.0 == 0
|
||||
}
|
||||
|
||||
/// `val` may be zero, or the number of shards in the tenant. `val` is what
|
||||
/// [`Self::literal`] would return.
|
||||
pub fn new(val: u8) -> Self {
|
||||
Self(val)
|
||||
}
|
||||
}
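// --- Illustrative sketch, not part of the upstream diff ---
// The zero literal means "legacy unsharded", which still counts as one shard.
#[cfg(test)]
fn shard_count_semantics_sketch() {
    let unsharded = ShardCount::new(0);
    assert!(unsharded.is_unsharded());
    assert_eq!(unsharded.literal(), 0);
    assert_eq!(unsharded.count(), 1);

    let sharded = ShardCount::new(4);
    assert!(!sharded.is_unsharded());
    assert_eq!(sharded.count(), 4);
}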
|
||||
|
||||
impl ShardNumber {
|
||||
@@ -86,7 +117,7 @@ impl TenantShardId {
|
||||
}
|
||||
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
|
||||
self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
|
||||
}
|
||||
|
||||
/// Convenience for dropping the tenant_id and just getting the ShardIndex: this
|
||||
@@ -471,10 +502,12 @@ impl ShardIdentity {
|
||||
pub fn is_key_disposable(&self, key: &Key) -> bool {
|
||||
if key_is_shard0(key) {
|
||||
// Q: Why can't we dispose of shard0 content if we're not shard 0?
|
||||
// A: because the WAL ingestion logic currently ingests some shard 0
|
||||
// content on all shards, even though it's only read on shard 0. If we
|
||||
// dropped it, then subsequent WAL ingest to these keys would encounter
|
||||
// an error.
|
||||
// A1: because the WAL ingestion logic currently ingests some shard 0
|
||||
// content on all shards, even though it's only read on shard 0. If we
|
||||
// dropped it, then subsequent WAL ingest to these keys would encounter
|
||||
// an error.
|
||||
// A2: because key_is_shard0 also covers relation size keys, which are written
|
||||
// on all shards even though they're only maintained accurately on shard 0.
|
||||
false
|
||||
} else {
|
||||
!self.is_key_local(key)
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
#![allow(non_snake_case)]
|
||||
// bindgen creates some unsafe code with no doc comments.
|
||||
#![allow(clippy::missing_safety_doc)]
|
||||
// noted at 1.63 that in many cases there's a u32 -> u32 transmutes in bindgen code.
|
||||
// noted at 1.63 that in many cases there's u32 -> u32 transmutes in bindgen code.
|
||||
#![allow(clippy::useless_transmute)]
|
||||
// modules included with the postgres_ffi macro depend on the types of the specific version's
|
||||
// types, and trigger a too eager lint.
|
||||
|
||||
@@ -80,6 +80,9 @@ pub const XLOG_XACT_ABORT: u8 = 0x20;
|
||||
pub const XLOG_XACT_COMMIT_PREPARED: u8 = 0x30;
|
||||
pub const XLOG_XACT_ABORT_PREPARED: u8 = 0x40;
|
||||
|
||||
// From standbydefs.h
|
||||
pub const XLOG_RUNNING_XACTS: u8 = 0x10;
|
||||
|
||||
// From slru.h
|
||||
pub const SLRU_PAGES_PER_SEGMENT: u32 = 32;
|
||||
pub const SLRU_SEG_SIZE: usize = BLCKSZ as usize * SLRU_PAGES_PER_SEGMENT as usize;
|
||||
|
||||
@@ -119,11 +119,6 @@ pub fn generate_pg_control(
|
||||
// Generate new pg_control needed for bootstrap
|
||||
checkpoint.redo = normalize_lsn(lsn, WAL_SEGMENT_SIZE).0;
|
||||
|
||||
//reset some fields we don't want to preserve
|
||||
//TODO Check this.
|
||||
//We may need to determine the value from twophase data.
|
||||
checkpoint.oldestActiveXid = 0;
|
||||
|
||||
//save new values in pg_control
|
||||
pg_control.checkPoint = 0;
|
||||
pg_control.checkPointCopy = checkpoint;
|
||||
|
||||
@@ -15,11 +15,13 @@ aws-sdk-s3.workspace = true
|
||||
aws-credential-types.workspace = true
|
||||
bytes.workspace = true
|
||||
camino.workspace = true
|
||||
humantime.workspace = true
|
||||
hyper = { workspace = true, features = ["stream"] }
|
||||
futures.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
|
||||
tokio-stream.workspace = true
|
||||
tokio-util = { workspace = true, features = ["compat"] }
|
||||
toml_edit.workspace = true
|
||||
tracing.workspace = true
|
||||
|
||||
@@ -22,16 +22,15 @@ use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerCl
|
||||
use bytes::Bytes;
|
||||
use futures::stream::Stream;
|
||||
use futures_util::StreamExt;
|
||||
use futures_util::TryStreamExt;
|
||||
use http_types::{StatusCode, Url};
|
||||
use tokio::time::Instant;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::debug;
|
||||
|
||||
use crate::s3_bucket::RequestKind;
|
||||
use crate::TimeTravelError;
|
||||
use crate::{
|
||||
AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath,
|
||||
RemoteStorage, StorageMetadata,
|
||||
error::Cancelled, s3_bucket::RequestKind, AzureConfig, ConcurrencyLimiter, Download,
|
||||
DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
|
||||
TimeTravelError, TimeoutOrCancel,
|
||||
};
|
||||
|
||||
pub struct AzureBlobStorage {
|
||||
@@ -39,10 +38,12 @@ pub struct AzureBlobStorage {
|
||||
prefix_in_container: Option<String>,
|
||||
max_keys_per_list_response: Option<NonZeroU32>,
|
||||
concurrency_limiter: ConcurrencyLimiter,
|
||||
// Per-request timeout. Accessible for tests.
|
||||
pub timeout: Duration,
|
||||
}
|
||||
|
||||
impl AzureBlobStorage {
|
||||
pub fn new(azure_config: &AzureConfig) -> Result<Self> {
|
||||
pub fn new(azure_config: &AzureConfig, timeout: Duration) -> Result<Self> {
|
||||
debug!(
|
||||
"Creating azure remote storage for azure container {}",
|
||||
azure_config.container_name
|
||||
@@ -79,6 +80,7 @@ impl AzureBlobStorage {
|
||||
prefix_in_container: azure_config.prefix_in_container.to_owned(),
|
||||
max_keys_per_list_response,
|
||||
concurrency_limiter: ConcurrencyLimiter::new(azure_config.concurrency_limit.get()),
|
||||
timeout,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -121,8 +123,11 @@ impl AzureBlobStorage {
|
||||
async fn download_for_builder(
|
||||
&self,
|
||||
builder: GetBlobBuilder,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Download, DownloadError> {
|
||||
let mut response = builder.into_stream();
|
||||
let kind = RequestKind::Get;
|
||||
|
||||
let _permit = self.permit(kind, cancel).await?;
|
||||
|
||||
let mut etag = None;
|
||||
let mut last_modified = None;
|
||||
@@ -130,39 +135,70 @@ impl AzureBlobStorage {
|
||||
// TODO give proper streaming response instead of buffering into RAM
|
||||
// https://github.com/neondatabase/neon/issues/5563
|
||||
|
||||
let mut bufs = Vec::new();
|
||||
while let Some(part) = response.next().await {
|
||||
let part = part.map_err(to_download_error)?;
|
||||
let etag_str: &str = part.blob.properties.etag.as_ref();
|
||||
if etag.is_none() {
|
||||
etag = Some(etag.unwrap_or_else(|| etag_str.to_owned()));
|
||||
let download = async {
|
||||
let response = builder
|
||||
// convert to concrete Pageable
|
||||
.into_stream()
|
||||
// convert to TryStream
|
||||
.into_stream()
|
||||
.map_err(to_download_error);
|
||||
|
||||
// apply per request timeout
|
||||
let response = tokio_stream::StreamExt::timeout(response, self.timeout);
|
||||
|
||||
// flatten
|
||||
let response = response.map(|res| match res {
|
||||
Ok(res) => res,
|
||||
Err(_elapsed) => Err(DownloadError::Timeout),
|
||||
});
|
||||
|
||||
let mut response = std::pin::pin!(response);
|
||||
|
||||
let mut bufs = Vec::new();
|
||||
while let Some(part) = response.next().await {
|
||||
let part = part?;
|
||||
let etag_str: &str = part.blob.properties.etag.as_ref();
|
||||
if etag.is_none() {
|
||||
etag = Some(etag.unwrap_or_else(|| etag_str.to_owned()));
|
||||
}
|
||||
if last_modified.is_none() {
|
||||
last_modified = Some(part.blob.properties.last_modified.into());
|
||||
}
|
||||
if let Some(blob_meta) = part.blob.metadata {
|
||||
metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
|
||||
}
|
||||
let data = part
|
||||
.data
|
||||
.collect()
|
||||
.await
|
||||
.map_err(|e| DownloadError::Other(e.into()))?;
|
||||
bufs.push(data);
|
||||
}
|
||||
if last_modified.is_none() {
|
||||
last_modified = Some(part.blob.properties.last_modified.into());
|
||||
}
|
||||
if let Some(blob_meta) = part.blob.metadata {
|
||||
metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
|
||||
}
|
||||
let data = part
|
||||
.data
|
||||
.collect()
|
||||
.await
|
||||
.map_err(|e| DownloadError::Other(e.into()))?;
|
||||
bufs.push(data);
|
||||
Ok(Download {
|
||||
download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
|
||||
etag,
|
||||
last_modified,
|
||||
metadata: Some(StorageMetadata(metadata)),
|
||||
})
|
||||
};
|
||||
|
||||
tokio::select! {
|
||||
bufs = download => bufs,
|
||||
_ = cancel.cancelled() => Err(DownloadError::Cancelled),
|
||||
}
|
||||
Ok(Download {
|
||||
download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
|
||||
etag,
|
||||
last_modified,
|
||||
metadata: Some(StorageMetadata(metadata)),
|
||||
})
|
||||
}
|
||||
|
||||
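The hunk above replaces the manual error mapping with a per-item timeout applied to the Azure Pageable stream before draining it. Below is a minimal standalone sketch of that pattern, not the crate's code, with placeholder names (ChunkError, drain_with_item_timeout) and assuming tokio_stream's time feature is enabled as it is here:

use std::time::Duration;
use futures_util::StreamExt;

#[derive(Debug)]
enum ChunkError {
    Timeout,
    Other(String),
}

// Apply a per-item timeout to a fallible stream, then flatten the nested
// Result so downstream code only sees one error type.
async fn drain_with_item_timeout(
    source: impl futures_util::Stream<Item = Result<Vec<u8>, ChunkError>>,
    per_item: Duration,
) -> Result<usize, ChunkError> {
    // Each item must arrive within `per_item`; late items surface as Elapsed.
    let timed = tokio_stream::StreamExt::timeout(source, per_item);

    // Flatten Result<Result<_, ChunkError>, Elapsed> into Result<_, ChunkError>.
    let flattened = timed.map(|res| match res {
        Ok(inner) => inner,
        Err(_elapsed) => Err(ChunkError::Timeout),
    });

    let mut flattened = std::pin::pin!(flattened);

    let mut total = 0;
    while let Some(chunk) = flattened.next().await {
        total += chunk?.len();
    }
    Ok(total)
}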
async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
|
||||
self.concurrency_limiter
|
||||
.acquire(kind)
|
||||
.await
|
||||
.expect("semaphore is never closed")
|
||||
async fn permit(
|
||||
&self,
|
||||
kind: RequestKind,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<tokio::sync::SemaphorePermit<'_>, Cancelled> {
|
||||
let acquire = self.concurrency_limiter.acquire(kind);
|
||||
|
||||
tokio::select! {
|
||||
permit = acquire => Ok(permit.expect("never closed")),
|
||||
_ = cancel.cancelled() => Err(Cancelled),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
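The new permit() above makes permit acquisition cancellable, so callers never wait on the concurrency limiter during shutdown. A standalone sketch of the same race, using a plain tokio Semaphore in place of the crate's ConcurrencyLimiter; names are illustrative:

use tokio::sync::{Semaphore, SemaphorePermit};
use tokio_util::sync::CancellationToken;

// Marker error, standing in for the crate's `error::Cancelled`.
struct Cancelled;

// Race the semaphore acquire against the cancellation token; whichever
// completes first decides the outcome.
async fn acquire_or_cancel<'a>(
    limiter: &'a Semaphore,
    cancel: &CancellationToken,
) -> Result<SemaphorePermit<'a>, Cancelled> {
    tokio::select! {
        permit = limiter.acquire() => Ok(permit.expect("semaphore is never closed")),
        _ = cancel.cancelled() => Err(Cancelled),
    }
}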
@@ -192,66 +228,87 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
prefix: Option<&RemotePath>,
|
||||
mode: ListingMode,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<Listing, DownloadError> {
|
||||
// get the passed prefix or if it is not set use prefix_in_bucket value
|
||||
let list_prefix = prefix
|
||||
.map(|p| self.relative_path_to_name(p))
|
||||
.or_else(|| self.prefix_in_container.clone())
|
||||
.map(|mut p| {
|
||||
// required to end with a separator
|
||||
// otherwise request will return only the entry of a prefix
|
||||
if matches!(mode, ListingMode::WithDelimiter)
|
||||
&& !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
|
||||
{
|
||||
p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||
}
|
||||
p
|
||||
let _permit = self.permit(RequestKind::List, cancel).await?;
|
||||
|
||||
let op = async {
|
||||
// get the passed prefix or if it is not set use prefix_in_bucket value
|
||||
let list_prefix = prefix
|
||||
.map(|p| self.relative_path_to_name(p))
|
||||
.or_else(|| self.prefix_in_container.clone())
|
||||
.map(|mut p| {
|
||||
// required to end with a separator
|
||||
// otherwise request will return only the entry of a prefix
|
||||
if matches!(mode, ListingMode::WithDelimiter)
|
||||
&& !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
|
||||
{
|
||||
p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||
}
|
||||
p
|
||||
});
|
||||
|
||||
let mut builder = self.client.list_blobs();
|
||||
|
||||
if let ListingMode::WithDelimiter = mode {
|
||||
builder = builder.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
|
||||
}
|
||||
|
||||
if let Some(prefix) = list_prefix {
|
||||
builder = builder.prefix(Cow::from(prefix.to_owned()));
|
||||
}
|
||||
|
||||
if let Some(limit) = self.max_keys_per_list_response {
|
||||
builder = builder.max_results(MaxResults::new(limit));
|
||||
}
|
||||
|
||||
let response = builder.into_stream();
|
||||
let response = response.into_stream().map_err(to_download_error);
|
||||
let response = tokio_stream::StreamExt::timeout(response, self.timeout);
|
||||
let response = response.map(|res| match res {
|
||||
Ok(res) => res,
|
||||
Err(_elapsed) => Err(DownloadError::Timeout),
|
||||
});
|
||||
|
||||
let mut builder = self.client.list_blobs();
|
||||
let mut response = std::pin::pin!(response);
|
||||
|
||||
if let ListingMode::WithDelimiter = mode {
|
||||
builder = builder.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
|
||||
}
|
||||
let mut res = Listing::default();
|
||||
|
||||
if let Some(prefix) = list_prefix {
|
||||
builder = builder.prefix(Cow::from(prefix.to_owned()));
|
||||
}
|
||||
let mut max_keys = max_keys.map(|mk| mk.get());
|
||||
while let Some(entry) = response.next().await {
|
||||
let entry = entry?;
|
||||
let prefix_iter = entry
|
||||
.blobs
|
||||
.prefixes()
|
||||
.map(|prefix| self.name_to_relative_path(&prefix.name));
|
||||
res.prefixes.extend(prefix_iter);
|
||||
|
||||
if let Some(limit) = self.max_keys_per_list_response {
|
||||
builder = builder.max_results(MaxResults::new(limit));
|
||||
}
|
||||
let blob_iter = entry
|
||||
.blobs
|
||||
.blobs()
|
||||
.map(|k| self.name_to_relative_path(&k.name));
|
||||
|
||||
let mut response = builder.into_stream();
|
||||
let mut res = Listing::default();
|
||||
// NonZeroU32 doesn't support subtraction apparently
|
||||
let mut max_keys = max_keys.map(|mk| mk.get());
|
||||
while let Some(l) = response.next().await {
|
||||
let entry = l.map_err(to_download_error)?;
|
||||
let prefix_iter = entry
|
||||
.blobs
|
||||
.prefixes()
|
||||
.map(|prefix| self.name_to_relative_path(&prefix.name));
|
||||
res.prefixes.extend(prefix_iter);
|
||||
for key in blob_iter {
|
||||
res.keys.push(key);
|
||||
|
||||
let blob_iter = entry
|
||||
.blobs
|
||||
.blobs()
|
||||
.map(|k| self.name_to_relative_path(&k.name));
|
||||
|
||||
for key in blob_iter {
|
||||
res.keys.push(key);
|
||||
if let Some(mut mk) = max_keys {
|
||||
assert!(mk > 0);
|
||||
mk -= 1;
|
||||
if mk == 0 {
|
||||
return Ok(res); // limit reached
|
||||
if let Some(mut mk) = max_keys {
|
||||
assert!(mk > 0);
|
||||
mk -= 1;
|
||||
if mk == 0 {
|
||||
return Ok(res); // limit reached
|
||||
}
|
||||
max_keys = Some(mk);
|
||||
}
|
||||
max_keys = Some(mk);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(res)
|
||||
};
|
||||
|
||||
tokio::select! {
|
||||
res = op => res,
|
||||
_ = cancel.cancelled() => Err(DownloadError::Cancelled),
|
||||
}
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
@@ -260,35 +317,52 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
data_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let _permit = self.permit(RequestKind::Put).await;
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(to));
|
||||
let _permit = self.permit(RequestKind::Put, cancel).await?;
|
||||
|
||||
let from: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>> =
|
||||
Box::pin(from);
|
||||
let op = async {
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(to));
|
||||
|
||||
let from = NonSeekableStream::new(from, data_size_bytes);
|
||||
let from: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>> =
|
||||
Box::pin(from);
|
||||
|
||||
let body = azure_core::Body::SeekableStream(Box::new(from));
|
||||
let from = NonSeekableStream::new(from, data_size_bytes);
|
||||
|
||||
let mut builder = blob_client.put_block_blob(body);
|
||||
let body = azure_core::Body::SeekableStream(Box::new(from));
|
||||
|
||||
if let Some(metadata) = metadata {
|
||||
builder = builder.metadata(to_azure_metadata(metadata));
|
||||
let mut builder = blob_client.put_block_blob(body);
|
||||
|
||||
if let Some(metadata) = metadata {
|
||||
builder = builder.metadata(to_azure_metadata(metadata));
|
||||
}
|
||||
|
||||
let fut = builder.into_future();
|
||||
let fut = tokio::time::timeout(self.timeout, fut);
|
||||
|
||||
match fut.await {
|
||||
Ok(Ok(_response)) => Ok(()),
|
||||
Ok(Err(azure)) => Err(azure.into()),
|
||||
Err(_timeout) => Err(TimeoutOrCancel::Cancel.into()),
|
||||
}
|
||||
};
|
||||
|
||||
tokio::select! {
|
||||
res = op => res,
|
||||
_ = cancel.cancelled() => Err(TimeoutOrCancel::Cancel.into()),
|
||||
}
|
||||
|
||||
let _response = builder.into_future().await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
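The upload path above wraps the SDK future in a hard per-request timeout while still letting the caller's cancellation token abort early. A generic sketch of that shape, with assumed names (StopReason, run_bounded); in this sketch the elapsed case is reported as a timeout root cause:

use std::time::Duration;
use tokio_util::sync::CancellationToken;

// Stand-in for the crate's TimeoutOrCancel error roots, for this sketch only.
#[derive(Debug)]
enum StopReason {
    Timeout,
    Cancel,
}

impl std::fmt::Display for StopReason {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            StopReason::Timeout => write!(f, "timeout"),
            StopReason::Cancel => write!(f, "cancel"),
        }
    }
}

impl std::error::Error for StopReason {}

// Bound a single request future by a per-request timeout and by a
// cancellation token, converting both stop reasons into error roots.
async fn run_bounded<T, E>(
    request: impl std::future::Future<Output = Result<T, E>>,
    timeout: Duration,
    cancel: &CancellationToken,
) -> anyhow::Result<T>
where
    E: std::error::Error + Send + Sync + 'static,
{
    let op = async {
        match tokio::time::timeout(timeout, request).await {
            Ok(Ok(value)) => Ok(value),
            Ok(Err(e)) => Err(anyhow::Error::new(e)),
            Err(_elapsed) => Err(anyhow::Error::new(StopReason::Timeout)),
        }
    };

    tokio::select! {
        res = op => res,
        _ = cancel.cancelled() => Err(anyhow::Error::new(StopReason::Cancel)),
    }
}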
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
|
||||
let _permit = self.permit(RequestKind::Get).await;
|
||||
async fn download(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Download, DownloadError> {
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(from));
|
||||
|
||||
let builder = blob_client.get();
|
||||
|
||||
self.download_for_builder(builder).await
|
||||
self.download_for_builder(builder, cancel).await
|
||||
}
|
||||
|
||||
async fn download_byte_range(
|
||||
@@ -296,8 +370,8 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
from: &RemotePath,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Download, DownloadError> {
|
||||
let _permit = self.permit(RequestKind::Get).await;
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(from));
|
||||
|
||||
let mut builder = blob_client.get();
|
||||
@@ -309,82 +383,113 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
};
|
||||
builder = builder.range(range);
|
||||
|
||||
self.download_for_builder(builder).await
|
||||
self.download_for_builder(builder, cancel).await
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
|
||||
let _permit = self.permit(RequestKind::Delete).await;
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(path));
|
||||
async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> {
|
||||
self.delete_objects(std::array::from_ref(path), cancel)
|
||||
.await
|
||||
}
|
||||
|
||||
let builder = blob_client.delete();
|
||||
async fn delete_objects<'a>(
|
||||
&self,
|
||||
paths: &'a [RemotePath],
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let _permit = self.permit(RequestKind::Delete, cancel).await?;
|
||||
|
||||
match builder.into_future().await {
|
||||
Ok(_response) => Ok(()),
|
||||
Err(e) => {
|
||||
if let Some(http_err) = e.as_http_error() {
|
||||
if http_err.status() == StatusCode::NotFound {
|
||||
return Ok(());
|
||||
let op = async {
|
||||
// TODO batch requests are also not supported by the SDK
|
||||
// https://github.com/Azure/azure-sdk-for-rust/issues/1068
|
||||
// https://github.com/Azure/azure-sdk-for-rust/issues/1249
|
||||
for path in paths {
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(path));
|
||||
|
||||
let request = blob_client.delete().into_future();
|
||||
|
||||
let res = tokio::time::timeout(self.timeout, request).await;
|
||||
|
||||
match res {
|
||||
Ok(Ok(_response)) => continue,
|
||||
Ok(Err(e)) => {
|
||||
if let Some(http_err) = e.as_http_error() {
|
||||
if http_err.status() == StatusCode::NotFound {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
return Err(e.into());
|
||||
}
|
||||
Err(_elapsed) => return Err(TimeoutOrCancel::Timeout.into()),
|
||||
}
|
||||
Err(anyhow::Error::new(e))
|
||||
}
|
||||
|
||||
Ok(())
|
||||
};
|
||||
|
||||
tokio::select! {
|
||||
res = op => res,
|
||||
_ = cancel.cancelled() => Err(TimeoutOrCancel::Cancel.into()),
|
||||
}
|
||||
}
|
||||
|
||||
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
|
||||
// Permit is already obtained by inner delete function
|
||||
async fn copy(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
to: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let _permit = self.permit(RequestKind::Copy, cancel).await?;
|
||||
|
||||
// TODO batch requests are also not supported by the SDK
|
||||
// https://github.com/Azure/azure-sdk-for-rust/issues/1068
|
||||
// https://github.com/Azure/azure-sdk-for-rust/issues/1249
|
||||
for path in paths {
|
||||
self.delete(path).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
let timeout = tokio::time::sleep(self.timeout);
|
||||
|
||||
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
|
||||
let _permit = self.permit(RequestKind::Copy).await;
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(to));
|
||||
let mut copy_status = None;
|
||||
|
||||
let source_url = format!(
|
||||
"{}/{}",
|
||||
self.client.url()?,
|
||||
self.relative_path_to_name(from)
|
||||
);
|
||||
let builder = blob_client.copy(Url::from_str(&source_url)?);
|
||||
let op = async {
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(to));
|
||||
|
||||
let result = builder.into_future().await?;
|
||||
let source_url = format!(
|
||||
"{}/{}",
|
||||
self.client.url()?,
|
||||
self.relative_path_to_name(from)
|
||||
);
|
||||
|
||||
let mut copy_status = result.copy_status;
|
||||
let start_time = Instant::now();
|
||||
const MAX_WAIT_TIME: Duration = Duration::from_secs(60);
|
||||
loop {
|
||||
match copy_status {
|
||||
CopyStatus::Aborted => {
|
||||
anyhow::bail!("Received abort for copy from {from} to {to}.");
|
||||
let builder = blob_client.copy(Url::from_str(&source_url)?);
|
||||
let copy = builder.into_future();
|
||||
|
||||
let result = copy.await?;
|
||||
|
||||
copy_status = Some(result.copy_status);
|
||||
loop {
|
||||
match copy_status.as_ref().expect("we always set it to Some") {
|
||||
CopyStatus::Aborted => {
|
||||
anyhow::bail!("Received abort for copy from {from} to {to}.");
|
||||
}
|
||||
CopyStatus::Failed => {
|
||||
anyhow::bail!("Received failure response for copy from {from} to {to}.");
|
||||
}
|
||||
CopyStatus::Success => return Ok(()),
|
||||
CopyStatus::Pending => (),
|
||||
}
|
||||
CopyStatus::Failed => {
|
||||
anyhow::bail!("Received failure response for copy from {from} to {to}.");
|
||||
}
|
||||
CopyStatus::Success => return Ok(()),
|
||||
CopyStatus::Pending => (),
|
||||
// The copy is taking longer. Waiting a second and then re-trying.
|
||||
// TODO estimate time based on copy_progress and adjust time based on that
|
||||
tokio::time::sleep(Duration::from_millis(1000)).await;
|
||||
let properties = blob_client.get_properties().into_future().await?;
|
||||
let Some(status) = properties.blob.properties.copy_status else {
|
||||
tracing::warn!("copy_status for copy is None!, from={from}, to={to}");
|
||||
return Ok(());
|
||||
};
|
||||
copy_status = Some(status);
|
||||
}
|
||||
// The copy is taking longer. Waiting a second and then re-trying.
|
||||
// TODO estimate time based on copy_progress and adjust time based on that
|
||||
tokio::time::sleep(Duration::from_millis(1000)).await;
|
||||
let properties = blob_client.get_properties().into_future().await?;
|
||||
let Some(status) = properties.blob.properties.copy_status else {
|
||||
tracing::warn!("copy_status for copy is None!, from={from}, to={to}");
|
||||
return Ok(());
|
||||
};
|
||||
if start_time.elapsed() > MAX_WAIT_TIME {
|
||||
anyhow::bail!("Copy from from {from} to {to} took longer than limit MAX_WAIT_TIME={}s. copy_pogress={:?}.",
|
||||
MAX_WAIT_TIME.as_secs_f32(),
|
||||
properties.blob.properties.copy_progress,
|
||||
);
|
||||
}
|
||||
copy_status = status;
|
||||
};
|
||||
|
||||
tokio::select! {
|
||||
res = op => res,
|
||||
_ = cancel.cancelled() => Err(anyhow::Error::new(TimeoutOrCancel::Cancel)),
|
||||
_ = timeout => {
|
||||
let e = anyhow::Error::new(TimeoutOrCancel::Timeout);
|
||||
let e = e.context(format!("Timeout, last status: {copy_status:?}"));
|
||||
Err(e)
|
||||
},
|
||||
}
|
||||
}
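The copy path above polls the blob's copy status until it reaches a terminal state, bounded by one timeout future started before the loop and by the cancellation token. A simplified sketch of that shape with an illustrative synchronous status getter and a reduced Status enum (the real code polls the SDK's CopyStatus, which also has Aborted):

use std::time::Duration;
use tokio_util::sync::CancellationToken;

enum Status {
    Pending,
    Success,
    Failed,
}

// Poll until the copy reaches a terminal state, while an outer select bounds
// the whole loop with a cancellation token and a single overall timeout.
async fn wait_for_copy(
    mut poll: impl FnMut() -> Status,
    poll_interval: Duration,
    overall_timeout: Duration,
    cancel: &CancellationToken,
) -> anyhow::Result<()> {
    // Started once, before the loop, so repeated polling cannot extend it.
    let timeout = tokio::time::sleep(overall_timeout);

    let op = async {
        loop {
            match poll() {
                Status::Success => return Ok(()),
                Status::Failed => anyhow::bail!("copy failed"),
                Status::Pending => tokio::time::sleep(poll_interval).await,
            }
        }
    };

    tokio::select! {
        res = op => res,
        _ = cancel.cancelled() => anyhow::bail!("copy cancelled"),
        _ = timeout => anyhow::bail!("copy timed out"),
    }
}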

libs/remote_storage/src/error.rs (new file, 200 lines)
@@ -0,0 +1,200 @@
|
||||
/// Reasons for downloads or listings to fail.
|
||||
#[derive(Debug)]
|
||||
pub enum DownloadError {
|
||||
/// Validation or other error happened due to user input.
|
||||
BadInput(anyhow::Error),
|
||||
/// The file was not found in the remote storage.
|
||||
NotFound,
|
||||
/// A cancellation token aborted the download, typically during
|
||||
/// tenant detach or process shutdown.
|
||||
Cancelled,
|
||||
/// A timeout happened while executing the request. Possible reasons:
|
||||
/// - stuck tcp connection
|
||||
///
|
||||
/// Concurrency control is not timed within timeout.
|
||||
Timeout,
|
||||
/// The file was found in the remote storage, but the download failed.
|
||||
Other(anyhow::Error),
|
||||
}
|
||||
|
||||
impl std::fmt::Display for DownloadError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
DownloadError::BadInput(e) => {
|
||||
write!(f, "Failed to download a remote file due to user input: {e}")
|
||||
}
|
||||
DownloadError::NotFound => write!(f, "No file found for the remote object id given"),
|
||||
DownloadError::Cancelled => write!(f, "Cancelled, shutting down"),
|
||||
DownloadError::Timeout => write!(f, "timeout"),
|
||||
DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::error::Error for DownloadError {}
|
||||
|
||||
impl DownloadError {
|
||||
/// Returns true if the error should not be retried with backoff
|
||||
pub fn is_permanent(&self) -> bool {
|
||||
use DownloadError::*;
|
||||
match self {
|
||||
BadInput(_) | NotFound | Cancelled => true,
|
||||
Timeout | Other(_) => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
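A sketch of how a caller might consult is_permanent when retrying downloads; the backoff loop here is illustrative and not the crate's own retry helper:

// Retry transient failures only; permanent ones (bad input, not found,
// cancellation) are returned immediately.
async fn download_with_retries<T, Fut>(
    mut attempt: impl FnMut() -> Fut,
    max_attempts: u32,
) -> Result<T, DownloadError>
where
    Fut: std::future::Future<Output = Result<T, DownloadError>>,
{
    let mut tries = 0;
    loop {
        match attempt().await {
            Ok(value) => return Ok(value),
            Err(e) if e.is_permanent() => return Err(e),
            Err(e) if tries + 1 >= max_attempts => return Err(e),
            Err(_) => {
                tries += 1;
                // Simple linear backoff, purely for the sketch.
                tokio::time::sleep(std::time::Duration::from_millis(100 * tries as u64)).await;
            }
        }
    }
}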
|
||||
impl From<std::io::Error> for DownloadError {
|
||||
fn from(value: std::io::Error) -> Self {
|
||||
let needs_unwrap = value.kind() == std::io::ErrorKind::Other
|
||||
&& value
|
||||
.get_ref()
|
||||
.and_then(|x| x.downcast_ref::<DownloadError>())
|
||||
.is_some();
|
||||
|
||||
if needs_unwrap {
|
||||
*value
|
||||
.into_inner()
|
||||
.expect("just checked")
|
||||
.downcast::<DownloadError>()
|
||||
.expect("just checked")
|
||||
} else {
|
||||
DownloadError::Other(value.into())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum TimeTravelError {
|
||||
/// Validation or other error happened due to user input.
|
||||
BadInput(anyhow::Error),
|
||||
/// The used remote storage does not have time travel recovery implemented
|
||||
Unimplemented,
|
||||
/// The number of versions/deletion markers is above our limit.
|
||||
TooManyVersions,
|
||||
/// A cancellation token aborted the process, typically during
|
||||
/// request closure or process shutdown.
|
||||
Cancelled,
|
||||
/// Other errors
|
||||
Other(anyhow::Error),
|
||||
}
|
||||
|
||||
impl std::fmt::Display for TimeTravelError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
TimeTravelError::BadInput(e) => {
|
||||
write!(
|
||||
f,
|
||||
"Failed to time travel recover a prefix due to user input: {e}"
|
||||
)
|
||||
}
|
||||
TimeTravelError::Unimplemented => write!(
|
||||
f,
|
||||
"time travel recovery is not implemented for the current storage backend"
|
||||
),
|
||||
TimeTravelError::Cancelled => write!(f, "Cancelled, shutting down"),
|
||||
TimeTravelError::TooManyVersions => {
|
||||
write!(f, "Number of versions/delete markers above limit")
|
||||
}
|
||||
TimeTravelError::Other(e) => write!(f, "Failed to time travel recover a prefix: {e:?}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::error::Error for TimeTravelError {}
|
||||
|
||||
/// Plain cancelled error.
|
||||
///
|
||||
/// By design this type does not implement `std::error::Error` so it cannot be put as the root
|
||||
/// cause of `std::io::Error` or `anyhow::Error`. It should never need to be exposed out of this
|
||||
/// crate.
|
||||
///
|
||||
/// It exists to implement permit acquiring in `{Download,TimeTravel}Error` and `anyhow::Error` returning
|
||||
/// operations and ensuring that those get converted to proper versions with just `?`.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct Cancelled;
|
||||
|
||||
impl From<Cancelled> for anyhow::Error {
|
||||
fn from(_: Cancelled) -> Self {
|
||||
anyhow::Error::new(TimeoutOrCancel::Cancel)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Cancelled> for TimeTravelError {
|
||||
fn from(_: Cancelled) -> Self {
|
||||
TimeTravelError::Cancelled
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Cancelled> for TimeoutOrCancel {
|
||||
fn from(_: Cancelled) -> Self {
|
||||
TimeoutOrCancel::Cancel
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Cancelled> for DownloadError {
|
||||
fn from(_: Cancelled) -> Self {
|
||||
DownloadError::Cancelled
|
||||
}
|
||||
}
|
||||
|
||||
/// This type is used as the root cause for timeouts and cancellations with `anyhow::Error` returning
|
||||
/// RemoteStorage methods.
|
||||
///
|
||||
/// For use with `utils::backoff::retry` and `anyhow::Error` returning operations there is
|
||||
/// `TimeoutOrCancel::caused_by_cancel` method to query "proper form" errors.
|
||||
#[derive(Debug)]
|
||||
pub enum TimeoutOrCancel {
|
||||
Timeout,
|
||||
Cancel,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for TimeoutOrCancel {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
use TimeoutOrCancel::*;
|
||||
match self {
|
||||
Timeout => write!(f, "timeout"),
|
||||
Cancel => write!(f, "cancel"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::error::Error for TimeoutOrCancel {}
|
||||
|
||||
impl TimeoutOrCancel {
|
||||
/// Returns true if the error was caused by [`TimeoutOrCancel::Cancel`].
|
||||
pub fn caused_by_cancel(error: &anyhow::Error) -> bool {
|
||||
error
|
||||
.root_cause()
|
||||
.downcast_ref::<Self>()
|
||||
.is_some_and(Self::is_cancel)
|
||||
}
|
||||
|
||||
pub fn is_cancel(&self) -> bool {
|
||||
matches!(self, TimeoutOrCancel::Cancel)
|
||||
}
|
||||
|
||||
pub fn is_timeout(&self) -> bool {
|
||||
matches!(self, TimeoutOrCancel::Timeout)
|
||||
}
|
||||
}
|
||||
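A sketch of the intended call-site use of caused_by_cancel: because it inspects the root cause, context layers added with anyhow do not hide the classification. The function name and log messages are illustrative:

// Classify an `anyhow::Error` from an upload/delete call before logging it.
fn log_upload_result(result: anyhow::Result<()>) {
    if let Err(e) = result {
        if TimeoutOrCancel::caused_by_cancel(&e) {
            // Expected during tenant detach or shutdown; keep the log quiet.
            tracing::info!("upload stopped by cancellation: {e:#}");
        } else {
            tracing::error!("upload failed: {e:?}");
        }
    }
}

// For example, an error built like this is still classified as a cancellation:
// anyhow::Error::new(TimeoutOrCancel::Cancel).context("failed to upload layer file")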
|
||||
/// This conversion is used when [`crate::support::DownloadStream`] notices a cancellation or
|
||||
/// timeout to wrap it in an `std::io::Error`.
|
||||
impl From<TimeoutOrCancel> for std::io::Error {
|
||||
fn from(value: TimeoutOrCancel) -> Self {
|
||||
let e = DownloadError::from(value);
|
||||
std::io::Error::other(e)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<TimeoutOrCancel> for DownloadError {
|
||||
fn from(value: TimeoutOrCancel) -> Self {
|
||||
use TimeoutOrCancel::*;
|
||||
|
||||
match value {
|
||||
Timeout => DownloadError::Timeout,
|
||||
Cancel => DownloadError::Cancelled,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -10,6 +10,7 @@
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
|
||||
mod azure_blob;
|
||||
mod error;
|
||||
mod local_fs;
|
||||
mod s3_bucket;
|
||||
mod simulate_failures;
|
||||
@@ -21,7 +22,7 @@ use std::{
|
||||
num::{NonZeroU32, NonZeroUsize},
|
||||
pin::Pin,
|
||||
sync::Arc,
|
||||
time::SystemTime,
|
||||
time::{Duration, SystemTime},
|
||||
};
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
@@ -41,6 +42,8 @@ pub use self::{
|
||||
};
|
||||
use s3_bucket::RequestKind;
|
||||
|
||||
pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel};
|
||||
|
||||
/// Currently, sync happens with AWS S3, that has two limits on requests per second:
|
||||
/// ~200 RPS for IAM services
|
||||
/// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
|
||||
@@ -158,9 +161,10 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
let result = self
|
||||
.list(prefix, ListingMode::WithDelimiter, None)
|
||||
.list(prefix, ListingMode::WithDelimiter, None, cancel)
|
||||
.await?
|
||||
.prefixes;
|
||||
Ok(result)
|
||||
@@ -182,9 +186,10 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
let result = self
|
||||
.list(prefix, ListingMode::NoDelimiter, max_keys)
|
||||
.list(prefix, ListingMode::NoDelimiter, max_keys, cancel)
|
||||
.await?
|
||||
.keys;
|
||||
Ok(result)
|
||||
@@ -195,9 +200,13 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
prefix: Option<&RemotePath>,
|
||||
_mode: ListingMode,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Listing, DownloadError>;
|
||||
|
||||
/// Streams the local file contents into the remote storage entry.
|
||||
///
|
||||
/// If the operation fails because of timeout or cancellation, the root cause of the error will be
|
||||
/// set to `TimeoutOrCancel`.
|
||||
async fn upload(
|
||||
&self,
|
||||
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
@@ -206,27 +215,61 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
data_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()>;
|
||||
|
||||
/// Streams the remote storage entry contents into the buffered writer given, returns the filled writer.
|
||||
/// Streams the remote storage entry contents.
|
||||
///
|
||||
/// The returned download stream will obey initial timeout and cancellation signal by erroring
|
||||
/// on whichever happens first. Only one of the reasons will fail the stream, which is usually
|
||||
/// enough for `tokio::io::copy_buf` usage. If needed the error can be filtered out.
|
||||
///
|
||||
/// Returns the metadata, if any was stored with the file previously.
|
||||
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError>;
|
||||
async fn download(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Download, DownloadError>;
|
||||
|
||||
/// Streams a given byte range of the remote storage entry contents into the buffered writer given, returns the filled writer.
|
||||
/// Streams a given byte range of the remote storage entry contents.
|
||||
///
|
||||
/// The returned download stream will obey initial timeout and cancellation signal by erroring
|
||||
/// on whichever happens first. Only one of the reasons will fail the stream, which is usually
|
||||
/// enough for `tokio::io::copy_buf` usage. If needed the error can be filtered out.
|
||||
///
|
||||
/// Returns the metadata, if any was stored with the file previously.
|
||||
async fn download_byte_range(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Download, DownloadError>;
|
||||
|
||||
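A sketch of consuming a Download the way the docs above describe, assuming tokio-util's StreamReader is available (the crate already depends on tokio-util for ReaderStream); the function name and path handling are illustrative:

use tokio_util::io::StreamReader;

// The download stream yields std::io::Result<Bytes>, so it can be wrapped in a
// StreamReader and drained with tokio::io::copy_buf; a timeout or cancellation
// surfaces as an io::Error whose source is DownloadError.
async fn write_download_to_file(
    download: Download,
    path: &std::path::Path,
) -> std::io::Result<u64> {
    let mut reader = StreamReader::new(download.download_stream);
    let file = tokio::fs::File::create(path).await?;
    let mut writer = tokio::io::BufWriter::new(file);
    let copied = tokio::io::copy_buf(&mut reader, &mut writer).await?;
    tokio::io::AsyncWriteExt::flush(&mut writer).await?;
    Ok(copied)
}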
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>;
|
||||
/// Delete a single path from remote storage.
|
||||
///
|
||||
/// If the operation fails because of timeout or cancellation, the root cause of the error will be
|
||||
/// set to `TimeoutOrCancel`. In such situation it is unknown if the deletion went through.
|
||||
async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()>;
|
||||
|
||||
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>;
|
||||
/// Delete multiple paths from remote storage.
|
||||
///
|
||||
/// If the operation fails because of timeout or cancellation, the root cause of the error will be
|
||||
/// set to `TimeoutOrCancel`. In such situation it is unknown which deletions, if any, went
|
||||
/// through.
|
||||
async fn delete_objects<'a>(
|
||||
&self,
|
||||
paths: &'a [RemotePath],
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()>;
|
||||
|
||||
/// Copy a remote object inside a bucket from one path to another.
|
||||
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()>;
|
||||
async fn copy(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
to: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()>;
|
||||
|
||||
/// Resets the content of everything with the given prefix to the given state
|
||||
async fn time_travel_recover(
|
||||
@@ -238,7 +281,13 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
) -> Result<(), TimeTravelError>;
|
||||
}
|
||||
|
||||
pub type DownloadStream = Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>;
|
||||
/// DownloadStream is sensitive to the timeout and cancellation used with the original
|
||||
/// [`RemoteStorage::download`] request. The type yields `std::io::Result<Bytes>` to be compatible
|
||||
/// with `tokio::io::copy_buf`.
|
||||
// This has 'static because safekeepers do not use cancellation tokens (yet)
|
||||
pub type DownloadStream =
|
||||
Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>>;
|
||||
|
||||
pub struct Download {
|
||||
pub download_stream: DownloadStream,
|
||||
/// The last time the file was modified (`last-modified` HTTP header)
|
||||
@@ -257,86 +306,6 @@ impl Debug for Download {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum DownloadError {
|
||||
/// Validation or other error happened due to user input.
|
||||
BadInput(anyhow::Error),
|
||||
/// The file was not found in the remote storage.
|
||||
NotFound,
|
||||
/// A cancellation token aborted the download, typically during
|
||||
/// tenant detach or process shutdown.
|
||||
Cancelled,
|
||||
/// The file was found in the remote storage, but the download failed.
|
||||
Other(anyhow::Error),
|
||||
}
|
||||
|
||||
impl std::fmt::Display for DownloadError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
DownloadError::BadInput(e) => {
|
||||
write!(f, "Failed to download a remote file due to user input: {e}")
|
||||
}
|
||||
DownloadError::Cancelled => write!(f, "Cancelled, shutting down"),
|
||||
DownloadError::NotFound => write!(f, "No file found for the remote object id given"),
|
||||
DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::error::Error for DownloadError {}
|
||||
|
||||
impl DownloadError {
|
||||
/// Returns true if the error should not be retried with backoff
|
||||
pub fn is_permanent(&self) -> bool {
|
||||
use DownloadError::*;
|
||||
match self {
|
||||
BadInput(_) => true,
|
||||
NotFound => true,
|
||||
Cancelled => true,
|
||||
Other(_) => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum TimeTravelError {
|
||||
/// Validation or other error happened due to user input.
|
||||
BadInput(anyhow::Error),
|
||||
/// The used remote storage does not have time travel recovery implemented
|
||||
Unimplemented,
|
||||
/// The number of versions/deletion markers is above our limit.
|
||||
TooManyVersions,
|
||||
/// A cancellation token aborted the process, typically during
|
||||
/// request closure or process shutdown.
|
||||
Cancelled,
|
||||
/// Other errors
|
||||
Other(anyhow::Error),
|
||||
}
|
||||
|
||||
impl std::fmt::Display for TimeTravelError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
TimeTravelError::BadInput(e) => {
|
||||
write!(
|
||||
f,
|
||||
"Failed to time travel recover a prefix due to user input: {e}"
|
||||
)
|
||||
}
|
||||
TimeTravelError::Unimplemented => write!(
|
||||
f,
|
||||
"time travel recovery is not implemented for the current storage backend"
|
||||
),
|
||||
TimeTravelError::Cancelled => write!(f, "Cancelled, shutting down"),
|
||||
TimeTravelError::TooManyVersions => {
|
||||
write!(f, "Number of versions/delete markers above limit")
|
||||
}
|
||||
TimeTravelError::Other(e) => write!(f, "Failed to time travel recover a prefix: {e:?}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::error::Error for TimeTravelError {}
|
||||
|
||||
/// Every storage, currently supported.
|
||||
/// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.
|
||||
#[derive(Clone)]
|
||||
@@ -354,12 +323,13 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
||||
prefix: Option<&RemotePath>,
|
||||
mode: ListingMode,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<Listing, DownloadError> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.list(prefix, mode, max_keys).await,
|
||||
Self::AwsS3(s) => s.list(prefix, mode, max_keys).await,
|
||||
Self::AzureBlob(s) => s.list(prefix, mode, max_keys).await,
|
||||
Self::Unreliable(s) => s.list(prefix, mode, max_keys).await,
|
||||
Self::LocalFs(s) => s.list(prefix, mode, max_keys, cancel).await,
|
||||
Self::AwsS3(s) => s.list(prefix, mode, max_keys, cancel).await,
|
||||
Self::AzureBlob(s) => s.list(prefix, mode, max_keys, cancel).await,
|
||||
Self::Unreliable(s) => s.list(prefix, mode, max_keys, cancel).await,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -372,12 +342,13 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
||||
&self,
|
||||
folder: Option<&RemotePath>,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.list_files(folder, max_keys).await,
|
||||
Self::AwsS3(s) => s.list_files(folder, max_keys).await,
|
||||
Self::AzureBlob(s) => s.list_files(folder, max_keys).await,
|
||||
Self::Unreliable(s) => s.list_files(folder, max_keys).await,
|
||||
Self::LocalFs(s) => s.list_files(folder, max_keys, cancel).await,
|
||||
Self::AwsS3(s) => s.list_files(folder, max_keys, cancel).await,
|
||||
Self::AzureBlob(s) => s.list_files(folder, max_keys, cancel).await,
|
||||
Self::Unreliable(s) => s.list_files(folder, max_keys, cancel).await,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -387,36 +358,43 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
||||
pub async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.list_prefixes(prefix).await,
|
||||
Self::AwsS3(s) => s.list_prefixes(prefix).await,
|
||||
Self::AzureBlob(s) => s.list_prefixes(prefix).await,
|
||||
Self::Unreliable(s) => s.list_prefixes(prefix).await,
|
||||
Self::LocalFs(s) => s.list_prefixes(prefix, cancel).await,
|
||||
Self::AwsS3(s) => s.list_prefixes(prefix, cancel).await,
|
||||
Self::AzureBlob(s) => s.list_prefixes(prefix, cancel).await,
|
||||
Self::Unreliable(s) => s.list_prefixes(prefix, cancel).await,
|
||||
}
|
||||
}
|
||||
|
||||
/// See [`RemoteStorage::upload`]
|
||||
pub async fn upload(
|
||||
&self,
|
||||
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
data_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.upload(from, data_size_bytes, to, metadata).await,
|
||||
Self::AwsS3(s) => s.upload(from, data_size_bytes, to, metadata).await,
|
||||
Self::AzureBlob(s) => s.upload(from, data_size_bytes, to, metadata).await,
|
||||
Self::Unreliable(s) => s.upload(from, data_size_bytes, to, metadata).await,
|
||||
Self::LocalFs(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await,
|
||||
Self::AwsS3(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await,
|
||||
Self::AzureBlob(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await,
|
||||
Self::Unreliable(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
|
||||
pub async fn download(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Download, DownloadError> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.download(from).await,
|
||||
Self::AwsS3(s) => s.download(from).await,
|
||||
Self::AzureBlob(s) => s.download(from).await,
|
||||
Self::Unreliable(s) => s.download(from).await,
|
||||
Self::LocalFs(s) => s.download(from, cancel).await,
|
||||
Self::AwsS3(s) => s.download(from, cancel).await,
|
||||
Self::AzureBlob(s) => s.download(from, cancel).await,
|
||||
Self::Unreliable(s) => s.download(from, cancel).await,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -425,54 +403,72 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
||||
from: &RemotePath,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Download, DownloadError> {
|
||||
match self {
|
||||
Self::LocalFs(s) => {
|
||||
s.download_byte_range(from, start_inclusive, end_exclusive)
|
||||
s.download_byte_range(from, start_inclusive, end_exclusive, cancel)
|
||||
.await
|
||||
}
|
||||
Self::AwsS3(s) => {
|
||||
s.download_byte_range(from, start_inclusive, end_exclusive)
|
||||
s.download_byte_range(from, start_inclusive, end_exclusive, cancel)
|
||||
.await
|
||||
}
|
||||
Self::AzureBlob(s) => {
|
||||
s.download_byte_range(from, start_inclusive, end_exclusive)
|
||||
s.download_byte_range(from, start_inclusive, end_exclusive, cancel)
|
||||
.await
|
||||
}
|
||||
Self::Unreliable(s) => {
|
||||
s.download_byte_range(from, start_inclusive, end_exclusive)
|
||||
s.download_byte_range(from, start_inclusive, end_exclusive, cancel)
|
||||
.await
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
|
||||
/// See [`RemoteStorage::delete`]
|
||||
pub async fn delete(
|
||||
&self,
|
||||
path: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.delete(path).await,
|
||||
Self::AwsS3(s) => s.delete(path).await,
|
||||
Self::AzureBlob(s) => s.delete(path).await,
|
||||
Self::Unreliable(s) => s.delete(path).await,
|
||||
Self::LocalFs(s) => s.delete(path, cancel).await,
|
||||
Self::AwsS3(s) => s.delete(path, cancel).await,
|
||||
Self::AzureBlob(s) => s.delete(path, cancel).await,
|
||||
Self::Unreliable(s) => s.delete(path, cancel).await,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
|
||||
/// See [`RemoteStorage::delete_objects`]
|
||||
pub async fn delete_objects(
|
||||
&self,
|
||||
paths: &[RemotePath],
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.delete_objects(paths).await,
|
||||
Self::AwsS3(s) => s.delete_objects(paths).await,
|
||||
Self::AzureBlob(s) => s.delete_objects(paths).await,
|
||||
Self::Unreliable(s) => s.delete_objects(paths).await,
|
||||
Self::LocalFs(s) => s.delete_objects(paths, cancel).await,
|
||||
Self::AwsS3(s) => s.delete_objects(paths, cancel).await,
|
||||
Self::AzureBlob(s) => s.delete_objects(paths, cancel).await,
|
||||
Self::Unreliable(s) => s.delete_objects(paths, cancel).await,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn copy_object(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
|
||||
/// See [`RemoteStorage::copy`]
|
||||
pub async fn copy_object(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
to: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.copy(from, to).await,
|
||||
Self::AwsS3(s) => s.copy(from, to).await,
|
||||
Self::AzureBlob(s) => s.copy(from, to).await,
|
||||
Self::Unreliable(s) => s.copy(from, to).await,
|
||||
Self::LocalFs(s) => s.copy(from, to, cancel).await,
|
||||
Self::AwsS3(s) => s.copy(from, to, cancel).await,
|
||||
Self::AzureBlob(s) => s.copy(from, to, cancel).await,
|
||||
Self::Unreliable(s) => s.copy(from, to, cancel).await,
|
||||
}
|
||||
}
|
||||
|
||||
/// See [`RemoteStorage::time_travel_recover`].
|
||||
pub async fn time_travel_recover(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
@@ -503,10 +499,11 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
||||
|
||||
impl GenericRemoteStorage {
|
||||
pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result<Self> {
|
||||
let timeout = storage_config.timeout;
|
||||
Ok(match &storage_config.storage {
|
||||
RemoteStorageKind::LocalFs(root) => {
|
||||
info!("Using fs root '{root}' as a remote storage");
|
||||
Self::LocalFs(LocalFs::new(root.clone())?)
|
||||
RemoteStorageKind::LocalFs(path) => {
|
||||
info!("Using fs root '{path}' as a remote storage");
|
||||
Self::LocalFs(LocalFs::new(path.clone(), timeout)?)
|
||||
}
|
||||
RemoteStorageKind::AwsS3(s3_config) => {
|
||||
// The profile and access key id are only printed here for debugging purposes,
|
||||
@@ -516,12 +513,12 @@ impl GenericRemoteStorage {
|
||||
std::env::var("AWS_ACCESS_KEY_ID").unwrap_or_else(|_| "<none>".into());
|
||||
info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}",
|
||||
s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
|
||||
Self::AwsS3(Arc::new(S3Bucket::new(s3_config)?))
|
||||
Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout)?))
|
||||
}
|
||||
RemoteStorageKind::AzureContainer(azure_config) => {
|
||||
info!("Using azure container '{}' in region '{}' as a remote storage, prefix in container: '{:?}'",
|
||||
azure_config.container_name, azure_config.container_region, azure_config.prefix_in_container);
|
||||
Self::AzureBlob(Arc::new(AzureBlobStorage::new(azure_config)?))
|
||||
Self::AzureBlob(Arc::new(AzureBlobStorage::new(azure_config, timeout)?))
|
||||
}
|
||||
})
|
||||
}
|
||||
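A rough end-to-end sketch of the updated API surface: build a LocalFs-backed GenericRemoteStorage from a config carrying the new timeout, then upload and download one object under a cancellation token. Paths and sizes are placeholders, and the constructor and variant shapes are assumed from the hunks above rather than taken verbatim:

use std::time::Duration;
use bytes::Bytes;
use camino::{Utf8Path, Utf8PathBuf};
use tokio_util::sync::CancellationToken;

async fn roundtrip_demo() -> anyhow::Result<()> {
    let config = RemoteStorageConfig {
        storage: RemoteStorageKind::LocalFs(Utf8PathBuf::from("/tmp/remote-storage-demo")),
        timeout: Duration::from_secs(30),
    };
    let storage = GenericRemoteStorage::from_config(&config)?;
    let cancel = CancellationToken::new();

    // A one-chunk byte stream matching the upload signature.
    let body = Bytes::from_static(b"hello");
    let len = body.len();
    let stream = futures::stream::once(futures::future::ready(Ok::<_, std::io::Error>(body)));

    let path = RemotePath::new(Utf8Path::new("demo/object"))?;
    storage.upload_storage_object(stream, len, &path, &cancel).await?;

    // The returned stream obeys the configured timeout and the token.
    let _download = storage.download(&path, &cancel).await?;
    Ok(())
}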
@@ -530,18 +527,15 @@ impl GenericRemoteStorage {
|
||||
Self::Unreliable(Arc::new(UnreliableWrapper::new(s, fail_first)))
|
||||
}
|
||||
|
||||
/// Takes storage object contents and its size and uploads to remote storage,
|
||||
/// mapping `from_path` to the corresponding remote object id in the storage.
|
||||
///
|
||||
/// The storage object does not have to be present on the `from_path`,
|
||||
/// this path is used for the remote object id conversion only.
|
||||
/// See [`RemoteStorage::upload`], which this method calls with `None` as metadata.
|
||||
pub async fn upload_storage_object(
|
||||
&self,
|
||||
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
from_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
self.upload(from, from_size_bytes, to, None)
|
||||
self.upload(from, from_size_bytes, to, None, cancel)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to upload data of length {from_size_bytes} to storage path {to:?}")
|
||||
@@ -554,10 +548,11 @@ impl GenericRemoteStorage {
|
||||
&self,
|
||||
byte_range: Option<(u64, Option<u64>)>,
|
||||
from: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Download, DownloadError> {
|
||||
match byte_range {
|
||||
Some((start, end)) => self.download_byte_range(from, start, end).await,
|
||||
None => self.download(from).await,
|
||||
Some((start, end)) => self.download_byte_range(from, start, end, cancel).await,
|
||||
None => self.download(from, cancel).await,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -572,6 +567,9 @@ pub struct StorageMetadata(HashMap<String, String>);
|
||||
pub struct RemoteStorageConfig {
|
||||
/// The storage connection configuration.
|
||||
pub storage: RemoteStorageKind,
|
||||
/// A common timeout enforced for all requests after concurrency limiter permit has been
|
||||
/// acquired.
|
||||
pub timeout: Duration,
|
||||
}
|
||||
|
||||
/// A kind of a remote storage to connect to, with its connection configuration.
|
||||
@@ -656,6 +654,8 @@ impl Debug for AzureConfig {
|
||||
}
|
||||
|
||||
impl RemoteStorageConfig {
|
||||
pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120);
|
||||
|
||||
pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<Option<RemoteStorageConfig>> {
|
||||
let local_path = toml.get("local_path");
|
||||
let bucket_name = toml.get("bucket_name");
|
||||
@@ -685,6 +685,27 @@ impl RemoteStorageConfig {
|
||||
.map(|endpoint| parse_toml_string("endpoint", endpoint))
|
||||
.transpose()?;
|
||||
|
||||
let timeout = toml
|
||||
.get("timeout")
|
||||
.map(|timeout| {
|
||||
timeout
|
||||
.as_str()
|
||||
.ok_or_else(|| anyhow::Error::msg("timeout was not a string"))
|
||||
})
|
||||
.transpose()
|
||||
.and_then(|timeout| {
|
||||
timeout
|
||||
.map(humantime::parse_duration)
|
||||
.transpose()
|
||||
.map_err(anyhow::Error::new)
|
||||
})
|
||||
.context("parse timeout")?
|
||||
.unwrap_or(Self::DEFAULT_TIMEOUT);
|
||||
|
||||
if timeout < Duration::from_secs(1) {
|
||||
bail!("timeout was specified as {timeout:?} which is too low");
|
||||
}
|
||||
|
||||
let storage = match (
|
||||
local_path,
|
||||
bucket_name,
|
||||
@@ -746,7 +767,7 @@ impl RemoteStorageConfig {
|
||||
}
|
||||
};
|
||||
|
||||
Ok(Some(RemoteStorageConfig { storage }))
|
||||
Ok(Some(RemoteStorageConfig { storage, timeout }))
|
||||
}
|
||||
}
|
||||
|
||||
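A hypothetical companion to the new test below, exercising the same timeout key next to an S3-style section; the bucket name and region are placeholders, and it assumes bucket_name plus bucket_region is enough to select the AwsS3 variant:

fn parse_s3_config_with_timeout() -> anyhow::Result<()> {
    let input = "bucket_name = 'some-bucket'
bucket_region = 'eu-central-1'
timeout = '30s'";

    let toml = input.parse::<toml_edit::Document>()?;
    let config = RemoteStorageConfig::from_toml(toml.as_item())?.expect("config is present");

    // The humantime string is parsed into a Duration; unset falls back to DEFAULT_TIMEOUT.
    assert_eq!(config.timeout, std::time::Duration::from_secs(30));
    assert!(matches!(config.storage, RemoteStorageKind::AwsS3(_)));
    Ok(())
}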
@@ -842,4 +863,24 @@ mod tests {
|
||||
let err = RemotePath::new(Utf8Path::new("/")).expect_err("Should fail on absolute paths");
|
||||
assert_eq!(err.to_string(), "Path \"/\" is not relative");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_localfs_config_with_timeout() {
|
||||
let input = "local_path = '.'
|
||||
timeout = '5s'";
|
||||
|
||||
let toml = input.parse::<toml_edit::Document>().unwrap();
|
||||
|
||||
let config = RemoteStorageConfig::from_toml(toml.as_item())
|
||||
.unwrap()
|
||||
.expect("it exists");
|
||||
|
||||
assert_eq!(
|
||||
config,
|
||||
RemoteStorageConfig {
|
||||
storage: RemoteStorageKind::LocalFs(Utf8PathBuf::from(".")),
|
||||
timeout: Duration::from_secs(5)
|
||||
}
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,7 +5,12 @@
|
||||
//! volume is mounted to the local FS.
|
||||
|
||||
use std::{
|
||||
borrow::Cow, future::Future, io::ErrorKind, num::NonZeroU32, pin::Pin, time::SystemTime,
|
||||
borrow::Cow,
|
||||
future::Future,
|
||||
io::ErrorKind,
|
||||
num::NonZeroU32,
|
||||
pin::Pin,
|
||||
time::{Duration, SystemTime},
|
||||
};
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
@@ -20,7 +25,9 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken};
|
||||
use tracing::*;
|
||||
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
|
||||
|
||||
use crate::{Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError};
|
||||
use crate::{
|
||||
Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel,
|
||||
};
|
||||
|
||||
use super::{RemoteStorage, StorageMetadata};
|
||||
|
||||
@@ -29,12 +36,13 @@ const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp";
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct LocalFs {
|
||||
storage_root: Utf8PathBuf,
|
||||
timeout: Duration,
|
||||
}
|
||||
|
||||
impl LocalFs {
|
||||
/// Attempts to create local FS storage, along with its root directory.
|
||||
/// Storage root will be created (if does not exist) and transformed into an absolute path (if passed as relative).
|
||||
pub fn new(mut storage_root: Utf8PathBuf) -> anyhow::Result<Self> {
|
||||
pub fn new(mut storage_root: Utf8PathBuf, timeout: Duration) -> anyhow::Result<Self> {
|
||||
if !storage_root.exists() {
|
||||
std::fs::create_dir_all(&storage_root).with_context(|| {
|
||||
format!("Failed to create all directories in the given root path {storage_root:?}")
|
||||
@@ -46,7 +54,10 @@ impl LocalFs {
|
||||
})?;
|
||||
}
|
||||
|
||||
Ok(Self { storage_root })
|
||||
Ok(Self {
|
||||
storage_root,
|
||||
timeout,
|
||||
})
|
||||
}
|
||||
|
||||
// mirrors S3Bucket::s3_object_to_relative_path
|
||||
@@ -157,80 +168,14 @@ impl LocalFs {
|
||||
|
||||
Ok(files)
|
||||
}
|
||||
}
|
||||
|
||||
impl RemoteStorage for LocalFs {
|
||||
async fn list(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
mode: ListingMode,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
) -> Result<Listing, DownloadError> {
|
||||
let mut result = Listing::default();
|
||||
|
||||
if let ListingMode::NoDelimiter = mode {
|
||||
let keys = self
|
||||
.list_recursive(prefix)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
result.keys = keys
|
||||
.into_iter()
|
||||
.filter(|k| {
|
||||
let path = k.with_base(&self.storage_root);
|
||||
!path.is_dir()
|
||||
})
|
||||
.collect();
|
||||
if let Some(max_keys) = max_keys {
|
||||
result.keys.truncate(max_keys.get() as usize);
|
||||
}
|
||||
|
||||
return Ok(result);
|
||||
}
|
||||
|
||||
let path = match prefix {
|
||||
Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
|
||||
None => Cow::Borrowed(&self.storage_root),
|
||||
};
|
||||
|
||||
let prefixes_to_filter = get_all_files(path.as_ref(), false)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
// filter out empty directories to mirror s3 behavior.
|
||||
for prefix in prefixes_to_filter {
|
||||
if prefix.is_dir()
|
||||
&& is_directory_empty(&prefix)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
let stripped = prefix
|
||||
.strip_prefix(&self.storage_root)
|
||||
.context("Failed to strip prefix")
|
||||
.and_then(RemotePath::new)
|
||||
.expect(
|
||||
"We list files for storage root, hence should be able to remote the prefix",
|
||||
);
|
||||
|
||||
if prefix.is_dir() {
|
||||
result.prefixes.push(stripped);
|
||||
} else {
|
||||
result.keys.push(stripped);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
async fn upload0(
|
||||
&self,
|
||||
data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync,
|
||||
data_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let target_file_path = to.with_base(&self.storage_root);
|
||||
create_target_directory(&target_file_path).await?;
|
||||
@@ -265,9 +210,26 @@ impl RemoteStorage for LocalFs {
|
||||
let mut buffer_to_read = data.take(from_size_bytes);
|
||||
|
||||
// alternatively we could just write the bytes to a file, but local_fs is a testing utility
|
||||
let bytes_read = io::copy_buf(&mut buffer_to_read, &mut destination)
|
||||
.await
|
||||
.with_context(|| {
|
||||
let copy = io::copy_buf(&mut buffer_to_read, &mut destination);
|
||||
|
||||
let bytes_read = tokio::select! {
|
||||
biased;
|
||||
_ = cancel.cancelled() => {
|
||||
let file = destination.into_inner();
|
||||
// wait for the inflight operation(s) to complete so that there could be a next
|
||||
// attempt right away and our writes are not directed to their file.
|
||||
file.into_std().await;
|
||||
|
||||
// TODO: leave the temp or not? leaving is probably less racy. enabled truncate at
|
||||
// least.
|
||||
fs::remove_file(temp_file_path).await.context("remove temp_file_path after cancellation or timeout")?;
|
||||
return Err(TimeoutOrCancel::Cancel.into());
|
||||
}
|
||||
read = copy => read,
|
||||
};
|
||||
|
||||
let bytes_read =
|
||||
bytes_read.with_context(|| {
|
||||
format!(
|
||||
"Failed to upload file (write temp) to the local storage at '{temp_file_path}'",
|
||||
)
|
||||
@@ -299,6 +261,9 @@ impl RemoteStorage for LocalFs {
|
||||
})?;
|
||||
|
||||
if let Some(storage_metadata) = metadata {
|
||||
// FIXME: we must not be using metadata much, since this would forget the old metadata
|
||||
// for new writes? or perhaps metadata is sticky; could consider removing if it's never
|
||||
// used.
|
||||
let storage_metadata_path = storage_metadata_path(&target_file_path);
|
||||
fs::write(
|
||||
&storage_metadata_path,
|
||||
@@ -315,8 +280,131 @@ impl RemoteStorage for LocalFs {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
|
||||
impl RemoteStorage for LocalFs {
|
||||
async fn list(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
mode: ListingMode,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Listing, DownloadError> {
|
||||
let op = async {
|
||||
let mut result = Listing::default();
|
||||
|
||||
if let ListingMode::NoDelimiter = mode {
|
||||
let keys = self
|
||||
.list_recursive(prefix)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
result.keys = keys
|
||||
.into_iter()
|
||||
.filter(|k| {
|
||||
let path = k.with_base(&self.storage_root);
|
||||
!path.is_dir()
|
||||
})
|
||||
.collect();
|
||||
|
||||
if let Some(max_keys) = max_keys {
|
||||
result.keys.truncate(max_keys.get() as usize);
|
||||
}
|
||||
|
||||
return Ok(result);
|
||||
}
|
||||
|
||||
let path = match prefix {
|
||||
Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
|
||||
None => Cow::Borrowed(&self.storage_root),
|
||||
};
|
||||
|
||||
let prefixes_to_filter = get_all_files(path.as_ref(), false)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
// filter out empty directories to mirror s3 behavior.
|
||||
for prefix in prefixes_to_filter {
|
||||
if prefix.is_dir()
|
||||
&& is_directory_empty(&prefix)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
let stripped = prefix
|
||||
.strip_prefix(&self.storage_root)
|
||||
.context("Failed to strip prefix")
|
||||
.and_then(RemotePath::new)
|
||||
.expect(
|
||||
"We list files for storage root, hence should be able to remote the prefix",
|
||||
);
|
||||
|
||||
if prefix.is_dir() {
|
||||
result.prefixes.push(stripped);
|
||||
} else {
|
||||
result.keys.push(stripped);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
};
|
||||
|
||||
let timeout = async {
|
||||
tokio::time::sleep(self.timeout).await;
|
||||
Err(DownloadError::Timeout)
|
||||
};
|
||||
|
||||
let cancelled = async {
|
||||
cancel.cancelled().await;
|
||||
Err(DownloadError::Cancelled)
|
||||
};
|
||||
|
||||
tokio::select! {
|
||||
res = op => res,
|
||||
res = timeout => res,
|
||||
res = cancelled => res,
|
||||
}
|
||||
}
|
||||
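The list() wrapper above races the operation against two futures that resolve to errors, one for the timeout and one for cancellation. A generic sketch of that three-way select with placeholder error and function names:

use std::time::Duration;
use tokio_util::sync::CancellationToken;

enum OpError {
    Timeout,
    Cancelled,
}

// All three futures resolve to the same Result type, so the select arms unify.
async fn race_with_limits<T>(
    op: impl std::future::Future<Output = Result<T, OpError>>,
    timeout: Duration,
    cancel: &CancellationToken,
) -> Result<T, OpError> {
    let timed_out = async {
        tokio::time::sleep(timeout).await;
        Err(OpError::Timeout)
    };
    let cancelled = async {
        cancel.cancelled().await;
        Err(OpError::Cancelled)
    };

    tokio::select! {
        res = op => res,
        res = timed_out => res,
        res = cancelled => res,
    }
}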
|
||||
async fn upload(
|
||||
&self,
|
||||
data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync,
|
||||
data_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let cancel = cancel.child_token();
|
||||
|
||||
let op = self.upload0(data, data_size_bytes, to, metadata, &cancel);
|
||||
let mut op = std::pin::pin!(op);
|
||||
|
||||
// race the upload0 to the timeout; if it goes over, do a graceful shutdown
|
||||
let (res, timeout) = tokio::select! {
|
||||
res = &mut op => (res, false),
|
||||
_ = tokio::time::sleep(self.timeout) => {
|
||||
cancel.cancel();
|
||||
(op.await, true)
|
||||
}
|
||||
};
|
||||
|
||||
match res {
|
||||
Err(e) if timeout && TimeoutOrCancel::caused_by_cancel(&e) => {
|
||||
// we caused this cancel (or they happened simultaneously) -- swap it out to
|
||||
// Timeout
|
||||
Err(TimeoutOrCancel::Timeout.into())
|
||||
}
|
||||
res => res,
|
||||
}
|
||||
}
|
||||
|
||||
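The upload() wrapper above implements a soft timeout: instead of dropping the in-flight upload, it cancels a child token, waits for the inner future to clean up its temp file, and then reports the result as a timeout rather than a cancellation. A sketch of that shape reusing the crate's TimeoutOrCancel from the new error module; the inner operation here is a stand-in for upload0:

use std::time::Duration;
use tokio_util::sync::CancellationToken;

async fn soft_timeout_demo(
    outer_cancel: &CancellationToken,
    timeout: Duration,
) -> anyhow::Result<()> {
    // Child token lets the timeout path cancel only this operation, not the caller.
    let cancel = outer_cancel.child_token();

    // Stand-in for upload0: it observes `cancel` and reports it as a cancel root cause.
    let op = async {
        tokio::select! {
            _ = tokio::time::sleep(Duration::from_secs(3600)) => Ok(()),
            _ = cancel.cancelled() => Err(anyhow::Error::new(TimeoutOrCancel::Cancel)),
        }
    };
    let mut op = std::pin::pin!(op);

    // Race the operation against the deadline; on deadline, cancel and wait for
    // the operation to finish winding down before returning.
    let (res, timed_out) = tokio::select! {
        res = &mut op => (res, false),
        _ = tokio::time::sleep(timeout) => {
            cancel.cancel();
            (op.await, true)
        }
    };

    match res {
        Err(e) if timed_out && TimeoutOrCancel::caused_by_cancel(&e) => {
            // We caused the cancellation ourselves, so surface it as a timeout.
            Err(TimeoutOrCancel::Timeout.into())
        }
        other => other,
    }
}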
async fn download(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Download, DownloadError> {
|
||||
let target_path = from.with_base(&self.storage_root);
|
||||
if file_exists(&target_path).map_err(DownloadError::BadInput)? {
|
||||
let source = ReaderStream::new(
|
||||
@@ -334,6 +422,10 @@ impl RemoteStorage for LocalFs {
|
||||
.read_storage_metadata(&target_path)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
|
||||
let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
|
||||
|
||||
Ok(Download {
|
||||
metadata,
|
||||
last_modified: None,
|
||||
@@ -350,6 +442,7 @@ impl RemoteStorage for LocalFs {
|
||||
from: &RemotePath,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Download, DownloadError> {
|
||||
if let Some(end_exclusive) = end_exclusive {
|
||||
if end_exclusive <= start_inclusive {
|
||||
@@ -391,6 +484,9 @@ impl RemoteStorage for LocalFs {
|
||||
let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive);
|
||||
let source = ReaderStream::new(source);
|
||||
|
||||
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
|
||||
let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
|
||||
|
||||
Ok(Download {
|
||||
metadata,
|
||||
last_modified: None,
|
||||
@@ -402,7 +498,7 @@ impl RemoteStorage for LocalFs {
|
||||
}
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
|
||||
async fn delete(&self, path: &RemotePath, _cancel: &CancellationToken) -> anyhow::Result<()> {
|
||||
let file_path = path.with_base(&self.storage_root);
|
||||
match fs::remove_file(&file_path).await {
|
||||
Ok(()) => Ok(()),
|
||||
@@ -414,14 +510,23 @@ impl RemoteStorage for LocalFs {
|
||||
}
|
||||
}
|
||||
|
||||
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
|
||||
async fn delete_objects<'a>(
|
||||
&self,
|
||||
paths: &'a [RemotePath],
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
for path in paths {
|
||||
self.delete(path).await?
|
||||
self.delete(path, cancel).await?
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
|
||||
async fn copy(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
to: &RemotePath,
|
||||
_cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let from_path = from.with_base(&self.storage_root);
|
||||
let to_path = to.with_base(&self.storage_root);
|
||||
create_target_directory(&to_path).await?;
|
||||
@@ -435,7 +540,6 @@ impl RemoteStorage for LocalFs {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[allow(clippy::diverging_sub_expression)]
|
||||
async fn time_travel_recover(
|
||||
&self,
|
||||
_prefix: Option<&RemotePath>,
|
||||
@@ -529,8 +633,9 @@ mod fs_tests {
|
||||
remote_storage_path: &RemotePath,
|
||||
expected_metadata: Option<&StorageMetadata>,
|
||||
) -> anyhow::Result<String> {
|
||||
let cancel = CancellationToken::new();
|
||||
let download = storage
|
||||
.download(remote_storage_path)
|
||||
.download(remote_storage_path, &cancel)
|
||||
.await
|
||||
.map_err(|e| anyhow::anyhow!("Download failed: {e}"))?;
|
||||
ensure!(
|
||||
@@ -545,16 +650,16 @@ mod fs_tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn upload_file() -> anyhow::Result<()> {
|
||||
let storage = create_storage()?;
|
||||
let (storage, cancel) = create_storage()?;
|
||||
|
||||
let target_path_1 = upload_dummy_file(&storage, "upload_1", None).await?;
|
||||
let target_path_1 = upload_dummy_file(&storage, "upload_1", None, &cancel).await?;
|
||||
assert_eq!(
|
||||
storage.list_all().await?,
|
||||
vec![target_path_1.clone()],
|
||||
"Should list a single file after first upload"
|
||||
);
|
||||
|
||||
let target_path_2 = upload_dummy_file(&storage, "upload_2", None).await?;
|
||||
let target_path_2 = upload_dummy_file(&storage, "upload_2", None, &cancel).await?;
|
||||
assert_eq!(
|
||||
list_files_sorted(&storage).await?,
|
||||
vec![target_path_1.clone(), target_path_2.clone()],
|
||||
@@ -566,7 +671,7 @@ mod fs_tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn upload_file_negatives() -> anyhow::Result<()> {
|
||||
let storage = create_storage()?;
|
||||
let (storage, cancel) = create_storage()?;
|
||||
|
||||
let id = RemotePath::new(Utf8Path::new("dummy"))?;
|
||||
let content = Bytes::from_static(b"12345");
|
||||
@@ -575,34 +680,34 @@ mod fs_tests {
|
||||
// Check that you get an error if the size parameter doesn't match the actual
|
||||
// size of the stream.
|
||||
storage
|
||||
.upload(content(), 0, &id, None)
|
||||
.upload(content(), 0, &id, None, &cancel)
|
||||
.await
|
||||
.expect_err("upload with zero size succeeded");
|
||||
storage
|
||||
.upload(content(), 4, &id, None)
|
||||
.upload(content(), 4, &id, None, &cancel)
|
||||
.await
|
||||
.expect_err("upload with too short size succeeded");
|
||||
storage
|
||||
.upload(content(), 6, &id, None)
|
||||
.upload(content(), 6, &id, None, &cancel)
|
||||
.await
|
||||
.expect_err("upload with too large size succeeded");
|
||||
|
||||
// Correct size is 5, this should succeed.
|
||||
storage.upload(content(), 5, &id, None).await?;
|
||||
storage.upload(content(), 5, &id, None, &cancel).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn create_storage() -> anyhow::Result<LocalFs> {
fn create_storage() -> anyhow::Result<(LocalFs, CancellationToken)> {
let storage_root = tempdir()?.path().to_path_buf();
LocalFs::new(storage_root)
LocalFs::new(storage_root, Duration::from_secs(120)).map(|s| (s, CancellationToken::new()))
}

#[tokio::test]
|
||||
async fn download_file() -> anyhow::Result<()> {
|
||||
let storage = create_storage()?;
|
||||
let (storage, cancel) = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
|
||||
let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?;
|
||||
|
||||
let contents = read_and_check_metadata(&storage, &upload_target, None).await?;
|
||||
assert_eq!(
|
||||
@@ -612,7 +717,7 @@ mod fs_tests {
|
||||
);
|
||||
|
||||
let non_existing_path = "somewhere/else";
|
||||
match storage.download(&RemotePath::new(Utf8Path::new(non_existing_path))?).await {
|
||||
match storage.download(&RemotePath::new(Utf8Path::new(non_existing_path))?, &cancel).await {
|
||||
Err(DownloadError::NotFound) => {} // Should get NotFound for non existing keys
|
||||
other => panic!("Should get a NotFound error when downloading non-existing storage files, but got: {other:?}"),
|
||||
}
|
||||
@@ -621,9 +726,9 @@ mod fs_tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_file_range_positive() -> anyhow::Result<()> {
|
||||
let storage = create_storage()?;
|
||||
let (storage, cancel) = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
|
||||
let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?;
|
||||
|
||||
let full_range_download_contents =
|
||||
read_and_check_metadata(&storage, &upload_target, None).await?;
|
||||
@@ -637,7 +742,12 @@ mod fs_tests {
|
||||
let (first_part_local, second_part_local) = uploaded_bytes.split_at(3);
|
||||
|
||||
let first_part_download = storage
|
||||
.download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
|
||||
.download_byte_range(
|
||||
&upload_target,
|
||||
0,
|
||||
Some(first_part_local.len() as u64),
|
||||
&cancel,
|
||||
)
|
||||
.await?;
|
||||
assert!(
|
||||
first_part_download.metadata.is_none(),
|
||||
@@ -655,6 +765,7 @@ mod fs_tests {
|
||||
&upload_target,
|
||||
first_part_local.len() as u64,
|
||||
Some((first_part_local.len() + second_part_local.len()) as u64),
|
||||
&cancel,
|
||||
)
|
||||
.await?;
|
||||
assert!(
|
||||
@@ -669,7 +780,7 @@ mod fs_tests {
|
||||
);
|
||||
|
||||
let suffix_bytes = storage
|
||||
.download_byte_range(&upload_target, 13, None)
|
||||
.download_byte_range(&upload_target, 13, None, &cancel)
|
||||
.await?
|
||||
.download_stream;
|
||||
let suffix_bytes = aggregate(suffix_bytes).await?;
|
||||
@@ -677,7 +788,7 @@ mod fs_tests {
|
||||
assert_eq!(upload_name, suffix);
|
||||
|
||||
let all_bytes = storage
|
||||
.download_byte_range(&upload_target, 0, None)
|
||||
.download_byte_range(&upload_target, 0, None, &cancel)
|
||||
.await?
|
||||
.download_stream;
|
||||
let all_bytes = aggregate(all_bytes).await?;
|
||||
@@ -689,9 +800,9 @@ mod fs_tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_file_range_negative() -> anyhow::Result<()> {
|
||||
let storage = create_storage()?;
|
||||
let (storage, cancel) = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
|
||||
let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?;
|
||||
|
||||
let start = 1_000_000_000;
|
||||
let end = start + 1;
|
||||
@@ -700,6 +811,7 @@ mod fs_tests {
|
||||
&upload_target,
|
||||
start,
|
||||
Some(end), // exclusive end
|
||||
&cancel,
|
||||
)
|
||||
.await
|
||||
{
|
||||
@@ -716,7 +828,7 @@ mod fs_tests {
|
||||
let end = 234;
|
||||
assert!(start > end, "Should test an incorrect range");
|
||||
match storage
|
||||
.download_byte_range(&upload_target, start, Some(end))
|
||||
.download_byte_range(&upload_target, start, Some(end), &cancel)
|
||||
.await
|
||||
{
|
||||
Ok(_) => panic!("Should not allow downloading wrong ranges"),
|
||||
@@ -733,15 +845,15 @@ mod fs_tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn delete_file() -> anyhow::Result<()> {
|
||||
let storage = create_storage()?;
|
||||
let (storage, cancel) = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
|
||||
let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?;
|
||||
|
||||
storage.delete(&upload_target).await?;
|
||||
storage.delete(&upload_target, &cancel).await?;
|
||||
assert!(storage.list_all().await?.is_empty());
|
||||
|
||||
storage
|
||||
.delete(&upload_target)
|
||||
.delete(&upload_target, &cancel)
|
||||
.await
|
||||
.expect("Should allow deleting non-existing storage files");
|
||||
|
||||
@@ -750,14 +862,14 @@ mod fs_tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn file_with_metadata() -> anyhow::Result<()> {
|
||||
let storage = create_storage()?;
|
||||
let (storage, cancel) = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let metadata = StorageMetadata(HashMap::from([
|
||||
("one".to_string(), "1".to_string()),
|
||||
("two".to_string(), "2".to_string()),
|
||||
]));
|
||||
let upload_target =
|
||||
upload_dummy_file(&storage, upload_name, Some(metadata.clone())).await?;
|
||||
upload_dummy_file(&storage, upload_name, Some(metadata.clone()), &cancel).await?;
|
||||
|
||||
let full_range_download_contents =
|
||||
read_and_check_metadata(&storage, &upload_target, Some(&metadata)).await?;
|
||||
@@ -771,7 +883,12 @@ mod fs_tests {
|
||||
let (first_part_local, _) = uploaded_bytes.split_at(3);
|
||||
|
||||
let partial_download_with_metadata = storage
|
||||
.download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
|
||||
.download_byte_range(
|
||||
&upload_target,
|
||||
0,
|
||||
Some(first_part_local.len() as u64),
|
||||
&cancel,
|
||||
)
|
||||
.await?;
|
||||
let first_part_remote = aggregate(partial_download_with_metadata.download_stream).await?;
|
||||
assert_eq!(
|
||||
@@ -792,16 +909,20 @@ mod fs_tests {
|
||||
#[tokio::test]
|
||||
async fn list() -> anyhow::Result<()> {
|
||||
// No delimiter: should recursively list everything
|
||||
let storage = create_storage()?;
|
||||
let child = upload_dummy_file(&storage, "grandparent/parent/child", None).await?;
|
||||
let uncle = upload_dummy_file(&storage, "grandparent/uncle", None).await?;
|
||||
let (storage, cancel) = create_storage()?;
|
||||
let child = upload_dummy_file(&storage, "grandparent/parent/child", None, &cancel).await?;
|
||||
let uncle = upload_dummy_file(&storage, "grandparent/uncle", None, &cancel).await?;
|
||||
|
||||
let listing = storage.list(None, ListingMode::NoDelimiter, None).await?;
|
||||
let listing = storage
|
||||
.list(None, ListingMode::NoDelimiter, None, &cancel)
|
||||
.await?;
|
||||
assert!(listing.prefixes.is_empty());
|
||||
assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec());
|
||||
|
||||
// Delimiter: should only go one deep
|
||||
let listing = storage.list(None, ListingMode::WithDelimiter, None).await?;
|
||||
let listing = storage
|
||||
.list(None, ListingMode::WithDelimiter, None, &cancel)
|
||||
.await?;
|
||||
|
||||
assert_eq!(
|
||||
listing.prefixes,
|
||||
@@ -815,6 +936,7 @@ mod fs_tests {
|
||||
Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
|
||||
ListingMode::WithDelimiter,
|
||||
None,
|
||||
&cancel,
|
||||
)
|
||||
.await?;
|
||||
assert_eq!(
|
||||
@@ -827,10 +949,75 @@ mod fs_tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn overwrite_shorter_file() -> anyhow::Result<()> {
|
||||
let (storage, cancel) = create_storage()?;
|
||||
|
||||
let path = RemotePath::new("does/not/matter/file".into())?;
|
||||
|
||||
let body = Bytes::from_static(b"long file contents is long");
|
||||
{
|
||||
let len = body.len();
|
||||
let body =
|
||||
futures::stream::once(futures::future::ready(std::io::Result::Ok(body.clone())));
|
||||
storage.upload(body, len, &path, None, &cancel).await?;
|
||||
}
|
||||
|
||||
let read = aggregate(storage.download(&path, &cancel).await?.download_stream).await?;
|
||||
assert_eq!(body, read);
|
||||
|
||||
let shorter = Bytes::from_static(b"shorter body");
|
||||
{
|
||||
let len = shorter.len();
|
||||
let body =
|
||||
futures::stream::once(futures::future::ready(std::io::Result::Ok(shorter.clone())));
|
||||
storage.upload(body, len, &path, None, &cancel).await?;
|
||||
}
|
||||
|
||||
let read = aggregate(storage.download(&path, &cancel).await?.download_stream).await?;
|
||||
assert_eq!(shorter, read);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn cancelled_upload_can_later_be_retried() -> anyhow::Result<()> {
|
||||
let (storage, cancel) = create_storage()?;
|
||||
|
||||
let path = RemotePath::new("does/not/matter/file".into())?;
|
||||
|
||||
let body = Bytes::from_static(b"long file contents is long");
|
||||
{
|
||||
let len = body.len();
|
||||
let body =
|
||||
futures::stream::once(futures::future::ready(std::io::Result::Ok(body.clone())));
|
||||
let cancel = cancel.child_token();
|
||||
cancel.cancel();
|
||||
let e = storage
|
||||
.upload(body, len, &path, None, &cancel)
|
||||
.await
|
||||
.unwrap_err();
|
||||
|
||||
assert!(TimeoutOrCancel::caused_by_cancel(&e));
|
||||
}
|
||||
|
||||
{
|
||||
let len = body.len();
|
||||
let body =
|
||||
futures::stream::once(futures::future::ready(std::io::Result::Ok(body.clone())));
|
||||
storage.upload(body, len, &path, None, &cancel).await?;
|
||||
}
|
||||
|
||||
let read = aggregate(storage.download(&path, &cancel).await?.download_stream).await?;
|
||||
assert_eq!(body, read);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn upload_dummy_file(
|
||||
storage: &LocalFs,
|
||||
name: &str,
|
||||
metadata: Option<StorageMetadata>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<RemotePath> {
|
||||
let from_path = storage
|
||||
.storage_root
|
||||
@@ -852,7 +1039,9 @@ mod fs_tests {
|
||||
|
||||
let file = tokio_util::io::ReaderStream::new(file);
|
||||
|
||||
storage.upload(file, size, &relative_path, metadata).await?;
|
||||
storage
|
||||
.upload(file, size, &relative_path, metadata, cancel)
|
||||
.await?;
|
||||
Ok(relative_path)
|
||||
}
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@ use std::{
|
||||
pin::Pin,
|
||||
sync::Arc,
|
||||
task::{Context, Poll},
|
||||
time::SystemTime,
|
||||
time::{Duration, SystemTime},
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, Context as _};
|
||||
@@ -46,9 +46,9 @@ use utils::backoff;
|
||||
|
||||
use super::StorageMetadata;
|
||||
use crate::{
|
||||
support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode,
|
||||
RemotePath, RemoteStorage, S3Config, TimeTravelError, MAX_KEYS_PER_DELETE,
|
||||
REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError,
|
||||
Listing, ListingMode, RemotePath, RemoteStorage, S3Config, TimeTravelError, TimeoutOrCancel,
|
||||
MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
};
|
||||
|
||||
pub(super) mod metrics;
|
||||
@@ -63,6 +63,8 @@ pub struct S3Bucket {
|
||||
prefix_in_bucket: Option<String>,
|
||||
max_keys_per_list_response: Option<i32>,
|
||||
concurrency_limiter: ConcurrencyLimiter,
|
||||
// Per-request timeout. Accessible for tests.
|
||||
pub timeout: Duration,
|
||||
}
|
||||
|
||||
struct GetObjectRequest {
|
||||
@@ -72,7 +74,7 @@ struct GetObjectRequest {
|
||||
}
|
||||
impl S3Bucket {
|
||||
/// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
|
||||
pub fn new(aws_config: &S3Config) -> anyhow::Result<Self> {
|
||||
pub fn new(aws_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
|
||||
tracing::debug!(
|
||||
"Creating s3 remote storage for S3 bucket {}",
|
||||
aws_config.bucket_name
|
||||
@@ -152,6 +154,7 @@ impl S3Bucket {
|
||||
max_keys_per_list_response: aws_config.max_keys_per_list_response,
|
||||
prefix_in_bucket,
|
||||
concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()),
|
||||
timeout,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -185,40 +188,55 @@ impl S3Bucket {
|
||||
}
|
||||
}
|
||||
|
||||
async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
async fn permit(
&self,
kind: RequestKind,
cancel: &CancellationToken,
) -> Result<tokio::sync::SemaphorePermit<'_>, Cancelled> {
let started_at = start_counting_cancelled_wait(kind);
let permit = self
.concurrency_limiter
.acquire(kind)
.await
.expect("semaphore is never closed");
let acquire = self.concurrency_limiter.acquire(kind);

let permit = tokio::select! {
permit = acquire => permit.expect("semaphore is never closed"),
_ = cancel.cancelled() => return Err(Cancelled),
};

let started_at = ScopeGuard::into_inner(started_at);
metrics::BUCKET_METRICS
.wait_seconds
.observe_elapsed(kind, started_at);

permit
Ok(permit)
}

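The refactor above makes waiting on the concurrency limiter cancellable by racing the semaphore acquisition against the token. A minimal stand-alone sketch of the same idea follows, using a plain tokio::sync::Semaphore instead of this crate's ConcurrencyLimiter; the Cancelled type and the function name are illustrative only.

use tokio::sync::{Semaphore, SemaphorePermit};
use tokio_util::sync::CancellationToken;

#[derive(Debug)]
struct Cancelled;

async fn permit<'a>(
    semaphore: &'a Semaphore,
    cancel: &CancellationToken,
) -> Result<SemaphorePermit<'a>, Cancelled> {
    tokio::select! {
        // acquire() only errors if the semaphore was closed, which this
        // sketch never does.
        permit = semaphore.acquire() => Ok(permit.expect("semaphore is never closed")),
        _ = cancel.cancelled() => Err(Cancelled),
    }
}

#[tokio::main]
async fn main() {
    let semaphore = Semaphore::new(1);
    let cancel = CancellationToken::new();

    // First acquisition succeeds immediately and is held.
    let _held = permit(&semaphore, &cancel).await.unwrap();

    // A second caller would otherwise wait forever; cancelling the token
    // unblocks it with an error instead of hanging.
    cancel.cancel();
    assert!(permit(&semaphore, &cancel).await.is_err());
    println!("cancelled waiters bail out instead of hanging");
}
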
async fn owned_permit(&self, kind: RequestKind) -> tokio::sync::OwnedSemaphorePermit {
|
||||
async fn owned_permit(
|
||||
&self,
|
||||
kind: RequestKind,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<tokio::sync::OwnedSemaphorePermit, Cancelled> {
|
||||
let started_at = start_counting_cancelled_wait(kind);
|
||||
let permit = self
|
||||
.concurrency_limiter
|
||||
.acquire_owned(kind)
|
||||
.await
|
||||
.expect("semaphore is never closed");
|
||||
let acquire = self.concurrency_limiter.acquire_owned(kind);
|
||||
|
||||
let permit = tokio::select! {
|
||||
permit = acquire => permit.expect("semaphore is never closed"),
|
||||
_ = cancel.cancelled() => return Err(Cancelled),
|
||||
};
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
.wait_seconds
|
||||
.observe_elapsed(kind, started_at);
|
||||
permit
|
||||
Ok(permit)
|
||||
}
|
||||
|
||||
async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
|
||||
async fn download_object(
|
||||
&self,
|
||||
request: GetObjectRequest,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Download, DownloadError> {
|
||||
let kind = RequestKind::Get;
|
||||
let permit = self.owned_permit(kind).await;
|
||||
|
||||
let permit = self.owned_permit(kind, cancel).await?;
|
||||
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
@@ -228,8 +246,13 @@ impl S3Bucket {
|
||||
.bucket(request.bucket)
|
||||
.key(request.key)
|
||||
.set_range(request.range)
|
||||
.send()
|
||||
.await;
|
||||
.send();
|
||||
|
||||
let get_object = tokio::select! {
|
||||
res = get_object => res,
|
||||
_ = tokio::time::sleep(self.timeout) => return Err(DownloadError::Timeout),
|
||||
_ = cancel.cancelled() => return Err(DownloadError::Cancelled),
|
||||
};
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
|
||||
@@ -259,6 +282,10 @@ impl S3Bucket {
|
||||
}
|
||||
};
|
||||
|
||||
// even if we would have no timeout left, continue anyways. the caller can decide to ignore
|
||||
// the errors considering timeouts and cancellation.
|
||||
let remaining = self.timeout.saturating_sub(started_at.elapsed());
|
||||
|
||||
let metadata = object_output.metadata().cloned().map(StorageMetadata);
|
||||
let etag = object_output.e_tag;
|
||||
let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok());
|
||||
@@ -268,6 +295,9 @@ impl S3Bucket {
|
||||
let body = PermitCarrying::new(permit, body);
|
||||
let body = TimedDownload::new(started_at, body);
|
||||
|
||||
let cancel_or_timeout = crate::support::cancel_or_timeout(remaining, cancel.clone());
|
||||
let body = crate::support::DownloadStream::new(cancel_or_timeout, body);
|
||||
|
||||
Ok(Download {
|
||||
metadata,
|
||||
etag,
|
||||
@@ -278,33 +308,44 @@ impl S3Bucket {
|
||||
|
||||
async fn delete_oids(
|
||||
&self,
|
||||
kind: RequestKind,
|
||||
_permit: &tokio::sync::SemaphorePermit<'_>,
|
||||
delete_objects: &[ObjectIdentifier],
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let kind = RequestKind::Delete;
|
||||
let mut cancel = std::pin::pin!(cancel.cancelled());
|
||||
|
||||
for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let resp = self
|
||||
let req = self
|
||||
.client
|
||||
.delete_objects()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.delete(
|
||||
Delete::builder()
|
||||
.set_objects(Some(chunk.to_vec()))
|
||||
.build()?,
|
||||
.build()
|
||||
.context("build request")?,
|
||||
)
|
||||
.send()
|
||||
.await;
|
||||
.send();
|
||||
|
||||
let resp = tokio::select! {
|
||||
resp = req => resp,
|
||||
_ = tokio::time::sleep(self.timeout) => return Err(TimeoutOrCancel::Timeout.into()),
|
||||
_ = &mut cancel => return Err(TimeoutOrCancel::Cancel.into()),
|
||||
};
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &resp, started_at);
|
||||
|
||||
let resp = resp?;
|
||||
let resp = resp.context("request deletion")?;
|
||||
metrics::BUCKET_METRICS
|
||||
.deleted_objects_total
|
||||
.inc_by(chunk.len() as u64);
|
||||
|
||||
if let Some(errors) = resp.errors {
|
||||
// Log a bounded number of the errors within the response:
|
||||
// these requests can carry 1000 keys so logging each one
|
||||
@@ -320,9 +361,10 @@ impl S3Bucket {
|
||||
);
|
||||
}
|
||||
|
||||
return Err(anyhow::format_err!(
|
||||
"Failed to delete {} objects",
|
||||
errors.len()
|
||||
return Err(anyhow::anyhow!(
|
||||
"Failed to delete {}/{} objects",
|
||||
errors.len(),
|
||||
chunk.len(),
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -410,6 +452,7 @@ impl RemoteStorage for S3Bucket {
|
||||
prefix: Option<&RemotePath>,
|
||||
mode: ListingMode,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Listing, DownloadError> {
|
||||
let kind = RequestKind::List;
|
||||
// s3 sdk wants i32
|
||||
@@ -431,10 +474,11 @@ impl RemoteStorage for S3Bucket {
|
||||
p
|
||||
});
|
||||
|
||||
let _permit = self.permit(kind, cancel).await?;
|
||||
|
||||
let mut continuation_token = None;
|
||||
|
||||
loop {
|
||||
let _guard = self.permit(kind).await;
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
// min of two Options, returning Some if one is value and another is
|
||||
@@ -456,9 +500,15 @@ impl RemoteStorage for S3Bucket {
|
||||
request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
|
||||
}
|
||||
|
||||
let response = request
|
||||
.send()
|
||||
.await
|
||||
let request = request.send();
|
||||
|
||||
let response = tokio::select! {
|
||||
res = request => res,
|
||||
_ = tokio::time::sleep(self.timeout) => return Err(DownloadError::Timeout),
|
||||
_ = cancel.cancelled() => return Err(DownloadError::Cancelled),
|
||||
};
|
||||
|
||||
let response = response
|
||||
.context("Failed to list S3 prefixes")
|
||||
.map_err(DownloadError::Other);
|
||||
|
||||
@@ -511,16 +561,17 @@ impl RemoteStorage for S3Bucket {
|
||||
from_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let kind = RequestKind::Put;
|
||||
let _guard = self.permit(kind).await;
|
||||
let _permit = self.permit(kind, cancel).await?;
|
||||
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let body = Body::wrap_stream(from);
|
||||
let bytes_stream = ByteStream::new(SdkBody::from_body_0_4(body));
|
||||
|
||||
let res = self
|
||||
let upload = self
|
||||
.client
|
||||
.put_object()
|
||||
.bucket(self.bucket_name.clone())
|
||||
@@ -528,22 +579,40 @@ impl RemoteStorage for S3Bucket {
|
||||
.set_metadata(metadata.map(|m| m.0))
|
||||
.content_length(from_size_bytes.try_into()?)
|
||||
.body(bytes_stream)
|
||||
.send()
|
||||
.await;
|
||||
.send();
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &res, started_at);
|
||||
let upload = tokio::time::timeout(self.timeout, upload);

res?;
let res = tokio::select! {
res = upload => res,
_ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
};

Ok(())
if let Ok(inner) = &res {
// do not incl. timeouts as errors in metrics but cancellations
let started_at = ScopeGuard::into_inner(started_at);
metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, inner, started_at);
}

match res {
Ok(Ok(_put)) => Ok(()),
Ok(Err(sdk)) => Err(sdk.into()),
Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()),
}
}

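The rewritten upload above combines tokio::time::timeout with a separate cancellation branch, which leaves a nested result to unpack: the outer Err means the deadline elapsed, the inner one is the request's own failure. A stripped-down sketch of handling that shape, with made-up Error variants standing in for TimeoutOrCancel and the SDK error type:

use std::time::Duration;

use tokio_util::sync::CancellationToken;

#[derive(Debug)]
enum Error {
    Request(String),
    Timeout,
    Cancel,
}

// Stand-in for the SDK call.
async fn send_request() -> Result<(), String> {
    tokio::time::sleep(Duration::from_millis(50)).await;
    Ok(())
}

async fn upload(deadline: Duration, cancel: &CancellationToken) -> Result<(), Error> {
    let request = tokio::time::timeout(deadline, send_request());

    // Cancellation is handled separately from the deadline.
    let res = tokio::select! {
        res = request => res,
        _ = cancel.cancelled() => return Err(Error::Cancel),
    };

    match res {
        Ok(Ok(())) => Ok(()),                 // finished in time, request succeeded
        Ok(Err(e)) => Err(Error::Request(e)), // finished in time, request failed
        Err(_elapsed) => Err(Error::Timeout), // deadline hit first
    }
}

#[tokio::main]
async fn main() {
    let cancel = CancellationToken::new();
    assert!(upload(Duration::from_secs(1), &cancel).await.is_ok());
    assert!(matches!(
        upload(Duration::from_millis(1), &cancel).await,
        Err(Error::Timeout)
    ));
    println!("ok");
}
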
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
|
||||
async fn copy(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
to: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let kind = RequestKind::Copy;
|
||||
let _guard = self.permit(kind).await;
|
||||
let _permit = self.permit(kind, cancel).await?;
|
||||
|
||||
let timeout = tokio::time::sleep(self.timeout);
|
||||
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
@@ -554,14 +623,19 @@ impl RemoteStorage for S3Bucket {
|
||||
self.relative_path_to_s3_object(from)
|
||||
);
|
||||
|
||||
let res = self
|
||||
let op = self
|
||||
.client
|
||||
.copy_object()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.key(self.relative_path_to_s3_object(to))
|
||||
.copy_source(copy_source)
|
||||
.send()
|
||||
.await;
|
||||
.send();
|
||||
|
||||
let res = tokio::select! {
|
||||
res = op => res,
|
||||
_ = timeout => return Err(TimeoutOrCancel::Timeout.into()),
|
||||
_ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
|
||||
};
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
@@ -573,14 +647,21 @@ impl RemoteStorage for S3Bucket {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
|
||||
async fn download(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Download, DownloadError> {
|
||||
// if prefix is not none then download file `prefix/from`
|
||||
// if prefix is none then download file `from`
|
||||
self.download_object(GetObjectRequest {
|
||||
bucket: self.bucket_name.clone(),
|
||||
key: self.relative_path_to_s3_object(from),
|
||||
range: None,
|
||||
})
|
||||
self.download_object(
|
||||
GetObjectRequest {
|
||||
bucket: self.bucket_name.clone(),
|
||||
key: self.relative_path_to_s3_object(from),
|
||||
range: None,
|
||||
},
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -589,6 +670,7 @@ impl RemoteStorage for S3Bucket {
|
||||
from: &RemotePath,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Download, DownloadError> {
|
||||
// S3 accepts ranges as https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35
|
||||
// and needs both ends to be exclusive
|
||||
@@ -598,31 +680,39 @@ impl RemoteStorage for S3Bucket {
|
||||
None => format!("bytes={start_inclusive}-"),
|
||||
});
|
||||
|
||||
self.download_object(GetObjectRequest {
|
||||
bucket: self.bucket_name.clone(),
|
||||
key: self.relative_path_to_s3_object(from),
|
||||
range,
|
||||
})
|
||||
self.download_object(
|
||||
GetObjectRequest {
|
||||
bucket: self.bucket_name.clone(),
|
||||
key: self.relative_path_to_s3_object(from),
|
||||
range,
|
||||
},
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
}
|
||||
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
|
||||
let kind = RequestKind::Delete;
|
||||
let _guard = self.permit(kind).await;
|
||||
|
||||
async fn delete_objects<'a>(
|
||||
&self,
|
||||
paths: &'a [RemotePath],
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let kind = RequestKind::Delete;
|
||||
let permit = self.permit(kind, cancel).await?;
|
||||
let mut delete_objects = Vec::with_capacity(paths.len());
|
||||
for path in paths {
|
||||
let obj_id = ObjectIdentifier::builder()
|
||||
.set_key(Some(self.relative_path_to_s3_object(path)))
|
||||
.build()?;
|
||||
.build()
|
||||
.context("convert path to oid")?;
|
||||
delete_objects.push(obj_id);
|
||||
}
|
||||
|
||||
self.delete_oids(kind, &delete_objects).await
|
||||
self.delete_oids(&permit, &delete_objects, cancel).await
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
|
||||
async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> {
|
||||
let paths = std::array::from_ref(path);
|
||||
self.delete_objects(paths).await
|
||||
self.delete_objects(paths, cancel).await
|
||||
}
|
||||
|
||||
async fn time_travel_recover(
|
||||
@@ -633,7 +723,7 @@ impl RemoteStorage for S3Bucket {
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), TimeTravelError> {
|
||||
let kind = RequestKind::TimeTravel;
|
||||
let _guard = self.permit(kind).await;
|
||||
let permit = self.permit(kind, cancel).await?;
|
||||
|
||||
let timestamp = DateTime::from(timestamp);
|
||||
let done_if_after = DateTime::from(done_if_after);
|
||||
@@ -647,7 +737,7 @@ impl RemoteStorage for S3Bucket {
|
||||
|
||||
let warn_threshold = 3;
|
||||
let max_retries = 10;
|
||||
let is_permanent = |_e: &_| false;
|
||||
let is_permanent = |e: &_| matches!(e, TimeTravelError::Cancelled);
|
||||
|
||||
let mut key_marker = None;
|
||||
let mut version_id_marker = None;
|
||||
@@ -656,15 +746,19 @@ impl RemoteStorage for S3Bucket {
|
||||
loop {
|
||||
let response = backoff::retry(
|
||||
|| async {
|
||||
self.client
|
||||
let op = self
|
||||
.client
|
||||
.list_object_versions()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.set_prefix(prefix.clone())
|
||||
.set_key_marker(key_marker.clone())
|
||||
.set_version_id_marker(version_id_marker.clone())
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| TimeTravelError::Other(e.into()))
|
||||
.send();
|
||||
|
||||
tokio::select! {
|
||||
res = op => res.map_err(|e| TimeTravelError::Other(e.into())),
|
||||
_ = cancel.cancelled() => Err(TimeTravelError::Cancelled),
|
||||
}
|
||||
},
|
||||
is_permanent,
|
||||
warn_threshold,
|
||||
@@ -786,14 +880,18 @@ impl RemoteStorage for S3Bucket {
|
||||
|
||||
backoff::retry(
|
||||
|| async {
|
||||
self.client
|
||||
let op = self
|
||||
.client
|
||||
.copy_object()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.key(key)
|
||||
.copy_source(&source_id)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| TimeTravelError::Other(e.into()))
|
||||
.send();
|
||||
|
||||
tokio::select! {
|
||||
res = op => res.map_err(|e| TimeTravelError::Other(e.into())),
|
||||
_ = cancel.cancelled() => Err(TimeTravelError::Cancelled),
|
||||
}
|
||||
},
|
||||
is_permanent,
|
||||
warn_threshold,
|
||||
@@ -824,10 +922,18 @@ impl RemoteStorage for S3Bucket {
|
||||
let oid = ObjectIdentifier::builder()
|
||||
.key(key.to_owned())
|
||||
.build()
|
||||
.map_err(|e| TimeTravelError::Other(anyhow::Error::new(e)))?;
|
||||
self.delete_oids(kind, &[oid])
|
||||
.map_err(|e| TimeTravelError::Other(e.into()))?;
|
||||
|
||||
self.delete_oids(&permit, &[oid], cancel)
|
||||
.await
|
||||
.map_err(TimeTravelError::Other)?;
|
||||
.map_err(|e| {
|
||||
// delete_oid0 will use TimeoutOrCancel
|
||||
if TimeoutOrCancel::caused_by_cancel(&e) {
|
||||
TimeTravelError::Cancelled
|
||||
} else {
|
||||
TimeTravelError::Other(e)
|
||||
}
|
||||
})?;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -963,7 +1069,8 @@ mod tests {
|
||||
concurrency_limit: NonZeroUsize::new(100).unwrap(),
|
||||
max_keys_per_list_response: Some(5),
|
||||
};
|
||||
let storage = S3Bucket::new(&config).expect("remote storage init");
|
||||
let storage =
|
||||
S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init");
|
||||
for (test_path_idx, test_path) in all_paths.iter().enumerate() {
|
||||
let result = storage.relative_path_to_s3_object(test_path);
|
||||
let expected = expected_outputs[prefix_idx][test_path_idx];
|
||||
|
||||
@@ -90,11 +90,16 @@ impl UnreliableWrapper {
|
||||
}
|
||||
}
|
||||
|
||||
async fn delete_inner(&self, path: &RemotePath, attempt: bool) -> anyhow::Result<()> {
|
||||
async fn delete_inner(
|
||||
&self,
|
||||
path: &RemotePath,
|
||||
attempt: bool,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
if attempt {
|
||||
self.attempt(RemoteOp::Delete(path.clone()))?;
|
||||
}
|
||||
self.inner.delete(path).await
|
||||
self.inner.delete(path, cancel).await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -105,20 +110,22 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
|
||||
.map_err(DownloadError::Other)?;
|
||||
self.inner.list_prefixes(prefix).await
|
||||
self.inner.list_prefixes(prefix, cancel).await
|
||||
}
|
||||
|
||||
async fn list_files(
|
||||
&self,
|
||||
folder: Option<&RemotePath>,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
self.attempt(RemoteOp::ListPrefixes(folder.cloned()))
|
||||
.map_err(DownloadError::Other)?;
|
||||
self.inner.list_files(folder, max_keys).await
|
||||
self.inner.list_files(folder, max_keys, cancel).await
|
||||
}
|
||||
|
||||
async fn list(
|
||||
@@ -126,10 +133,11 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
prefix: Option<&RemotePath>,
|
||||
mode: ListingMode,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Listing, DownloadError> {
|
||||
self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
|
||||
.map_err(DownloadError::Other)?;
|
||||
self.inner.list(prefix, mode, max_keys).await
|
||||
self.inner.list(prefix, mode, max_keys, cancel).await
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
@@ -140,15 +148,22 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
data_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
self.attempt(RemoteOp::Upload(to.clone()))?;
|
||||
self.inner.upload(data, data_size_bytes, to, metadata).await
|
||||
self.inner
|
||||
.upload(data, data_size_bytes, to, metadata, cancel)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
|
||||
async fn download(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Download, DownloadError> {
|
||||
self.attempt(RemoteOp::Download(from.clone()))
|
||||
.map_err(DownloadError::Other)?;
|
||||
self.inner.download(from).await
|
||||
self.inner.download(from, cancel).await
|
||||
}
|
||||
|
||||
async fn download_byte_range(
|
||||
@@ -156,6 +171,7 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
from: &RemotePath,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Download, DownloadError> {
|
||||
// Note: We treat any download_byte_range as an "attempt" of the same
|
||||
// operation. We don't pay attention to the ranges. That's good enough
|
||||
@@ -163,20 +179,24 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
self.attempt(RemoteOp::Download(from.clone()))
|
||||
.map_err(DownloadError::Other)?;
|
||||
self.inner
|
||||
.download_byte_range(from, start_inclusive, end_exclusive)
|
||||
.download_byte_range(from, start_inclusive, end_exclusive, cancel)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
|
||||
self.delete_inner(path, true).await
|
||||
async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> {
|
||||
self.delete_inner(path, true, cancel).await
|
||||
}
|
||||
|
||||
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
|
||||
async fn delete_objects<'a>(
|
||||
&self,
|
||||
paths: &'a [RemotePath],
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
self.attempt(RemoteOp::DeleteObjects(paths.to_vec()))?;
|
||||
let mut error_counter = 0;
|
||||
for path in paths {
|
||||
// Dont record attempt because it was already recorded above
|
||||
if (self.delete_inner(path, false).await).is_err() {
|
||||
if (self.delete_inner(path, false, cancel).await).is_err() {
|
||||
error_counter += 1;
|
||||
}
|
||||
}
|
||||
@@ -189,11 +209,16 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
|
||||
async fn copy(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
to: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
// copy is equivalent to download + upload
|
||||
self.attempt(RemoteOp::Download(from.clone()))?;
|
||||
self.attempt(RemoteOp::Upload(to.clone()))?;
|
||||
self.inner.copy_object(from, to).await
|
||||
self.inner.copy_object(from, to, cancel).await
|
||||
}
|
||||
|
||||
async fn time_travel_recover(
|
||||
|
||||
@@ -1,9 +1,15 @@
|
||||
use std::{
|
||||
future::Future,
|
||||
pin::Pin,
|
||||
task::{Context, Poll},
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use bytes::Bytes;
|
||||
use futures_util::Stream;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
use crate::TimeoutOrCancel;
|
||||
|
||||
pin_project_lite::pin_project! {
|
||||
/// An `AsyncRead` adapter which carries a permit for the lifetime of the value.
|
||||
@@ -31,3 +37,139 @@ impl<S: Stream> Stream for PermitCarrying<S> {
|
||||
self.inner.size_hint()
|
||||
}
|
||||
}
|
||||
|
||||
pin_project_lite::pin_project! {
pub(crate) struct DownloadStream<F, S> {
hit: bool,
#[pin]
cancellation: F,
#[pin]
inner: S,
}
}

impl<F, S> DownloadStream<F, S> {
pub(crate) fn new(cancellation: F, inner: S) -> Self {
Self {
cancellation,
hit: false,
inner,
}
}
}

/// See documentation on [`crate::DownloadStream`] on rationale why `std::io::Error` is used.
impl<E, F, S> Stream for DownloadStream<F, S>
where
std::io::Error: From<E>,
F: Future<Output = E>,
S: Stream<Item = std::io::Result<Bytes>>,
{
type Item = <S as Stream>::Item;

fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
let this = self.project();

if !*this.hit {
if let Poll::Ready(e) = this.cancellation.poll(cx) {
*this.hit = true;

// most likely this will be a std::io::Error wrapping a DownloadError
let e = Err(std::io::Error::from(e));
return Poll::Ready(Some(e));
}
}

this.inner.poll_next(cx)
}

fn size_hint(&self) -> (usize, Option<usize>) {
self.inner.size_hint()
}
}

/// Fires only on the first cancel or timeout, not on both.
pub(crate) async fn cancel_or_timeout(
timeout: Duration,
cancel: CancellationToken,
) -> TimeoutOrCancel {
tokio::select! {
_ = tokio::time::sleep(timeout) => TimeoutOrCancel::Timeout,
_ = cancel.cancelled() => TimeoutOrCancel::Cancel,
}
}

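cancel_or_timeout resolves exactly once, to whichever event happens first; the tests below exercise it through DownloadStream. For reference, here is an equivalent stand-alone version with a local enum in place of this crate's TimeoutOrCancel; only tokio and tokio-util are assumed.

use std::time::Duration;

use tokio_util::sync::CancellationToken;

#[derive(Debug, PartialEq)]
enum FirstEvent {
    Timeout,
    Cancel,
}

// Resolves exactly once, to whichever event happens first.
async fn cancel_or_timeout(timeout: Duration, cancel: CancellationToken) -> FirstEvent {
    tokio::select! {
        _ = tokio::time::sleep(timeout) => FirstEvent::Timeout,
        _ = cancel.cancelled() => FirstEvent::Cancel,
    }
}

#[tokio::main]
async fn main() {
    // The token fires before the (long) deadline: Cancel wins.
    let cancel = CancellationToken::new();
    cancel.cancel();
    assert_eq!(
        cancel_or_timeout(Duration::from_secs(120), cancel).await,
        FirstEvent::Cancel
    );

    // Nothing cancels: the (short) deadline wins.
    let cancel = CancellationToken::new();
    assert_eq!(
        cancel_or_timeout(Duration::from_millis(10), cancel).await,
        FirstEvent::Timeout
    );
    println!("ok");
}
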
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::DownloadError;
|
||||
use futures::stream::StreamExt;
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn cancelled_download_stream() {
|
||||
let inner = futures::stream::pending();
|
||||
let timeout = Duration::from_secs(120);
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let stream = DownloadStream::new(cancel_or_timeout(timeout, cancel.clone()), inner);
|
||||
let mut stream = std::pin::pin!(stream);
|
||||
|
||||
let mut first = stream.next();
|
||||
|
||||
tokio::select! {
|
||||
_ = &mut first => unreachable!("we haven't yet cancelled nor is timeout passed"),
|
||||
_ = tokio::time::sleep(Duration::from_secs(1)) => {},
|
||||
}
|
||||
|
||||
cancel.cancel();
|
||||
|
||||
let e = first.await.expect("there must be some").unwrap_err();
|
||||
assert!(matches!(e.kind(), std::io::ErrorKind::Other), "{e:?}");
|
||||
let inner = e.get_ref().expect("inner should be set");
|
||||
assert!(
|
||||
inner
|
||||
.downcast_ref::<DownloadError>()
|
||||
.is_some_and(|e| matches!(e, DownloadError::Cancelled)),
|
||||
"{inner:?}"
|
||||
);
|
||||
let e = DownloadError::from(e);
|
||||
assert!(matches!(e, DownloadError::Cancelled), "{e:?}");
|
||||
|
||||
tokio::select! {
|
||||
_ = stream.next() => unreachable!("no timeout ever happens as we were already cancelled"),
|
||||
_ = tokio::time::sleep(Duration::from_secs(121)) => {},
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn timeouted_download_stream() {
|
||||
let inner = futures::stream::pending();
|
||||
let timeout = Duration::from_secs(120);
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let stream = DownloadStream::new(cancel_or_timeout(timeout, cancel.clone()), inner);
|
||||
let mut stream = std::pin::pin!(stream);
|
||||
|
||||
// because the stream uses 120s timeout and we are paused, we advance to 120s right away.
|
||||
let first = stream.next();
|
||||
|
||||
let e = first.await.expect("there must be some").unwrap_err();
|
||||
assert!(matches!(e.kind(), std::io::ErrorKind::Other), "{e:?}");
|
||||
let inner = e.get_ref().expect("inner should be set");
|
||||
assert!(
|
||||
inner
|
||||
.downcast_ref::<DownloadError>()
|
||||
.is_some_and(|e| matches!(e, DownloadError::Timeout)),
|
||||
"{inner:?}"
|
||||
);
|
||||
let e = DownloadError::from(e);
|
||||
assert!(matches!(e, DownloadError::Timeout), "{e:?}");
|
||||
|
||||
cancel.cancel();
|
||||
|
||||
tokio::select! {
|
||||
_ = stream.next() => unreachable!("no cancellation ever happens because we already timed out"),
|
||||
_ = tokio::time::sleep(Duration::from_secs(121)) => {},
|
||||
}
|
||||
}
|
||||
}
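The two tests above lean on tokio's paused test clock: with #[tokio::test(start_paused = true)], tokio::time::sleep does not wait in real time, the runtime advances its virtual clock as soon as every task is idle, so a 120-second timeout can be exercised instantly. A tiny self-contained example of that mechanism (nothing in it is specific to this crate; it assumes tokio's macros, time, rt and test-util features):

use std::time::Duration;

#[tokio::test(start_paused = true)]
async fn paused_clock_advances_instantly() {
    let real_start = std::time::Instant::now();
    let virtual_start = tokio::time::Instant::now();

    // Under the paused clock this does not block the test for two minutes:
    // once the runtime has nothing else to do, it jumps time forward.
    tokio::time::sleep(Duration::from_secs(120)).await;

    // tokio's virtual clock moved by the full 120 seconds...
    assert!(virtual_start.elapsed() >= Duration::from_secs(120));
    // ...while barely any real time passed.
    assert!(real_start.elapsed() < Duration::from_secs(1));
}
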
@@ -10,6 +10,7 @@ use futures::stream::Stream;
|
||||
use once_cell::sync::OnceCell;
|
||||
use remote_storage::{Download, GenericRemoteStorage, RemotePath};
|
||||
use tokio::task::JoinSet;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, error, info};
|
||||
|
||||
static LOGGING_DONE: OnceCell<()> = OnceCell::new();
|
||||
@@ -58,8 +59,12 @@ pub(crate) async fn upload_simple_remote_data(
|
||||
) -> ControlFlow<HashSet<RemotePath>, HashSet<RemotePath>> {
|
||||
info!("Creating {upload_tasks_count} remote files");
|
||||
let mut upload_tasks = JoinSet::new();
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
for i in 1..upload_tasks_count + 1 {
|
||||
let task_client = Arc::clone(client);
|
||||
let cancel = cancel.clone();
|
||||
|
||||
upload_tasks.spawn(async move {
|
||||
let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
|
||||
let blob_path = RemotePath::new(
|
||||
@@ -69,7 +74,9 @@ pub(crate) async fn upload_simple_remote_data(
|
||||
debug!("Creating remote item {i} at path {blob_path:?}");
|
||||
|
||||
let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
|
||||
task_client.upload(data, len, &blob_path, None).await?;
|
||||
task_client
|
||||
.upload(data, len, &blob_path, None, &cancel)
|
||||
.await?;
|
||||
|
||||
Ok::<_, anyhow::Error>(blob_path)
|
||||
});
|
||||
@@ -107,13 +114,15 @@ pub(crate) async fn cleanup(
|
||||
"Removing {} objects from the remote storage during cleanup",
|
||||
objects_to_delete.len()
|
||||
);
|
||||
let cancel = CancellationToken::new();
|
||||
let mut delete_tasks = JoinSet::new();
|
||||
for object_to_delete in objects_to_delete {
|
||||
let task_client = Arc::clone(client);
|
||||
let cancel = cancel.clone();
|
||||
delete_tasks.spawn(async move {
|
||||
debug!("Deleting remote item at path {object_to_delete:?}");
|
||||
task_client
|
||||
.delete(&object_to_delete)
|
||||
.delete(&object_to_delete, &cancel)
|
||||
.await
|
||||
.with_context(|| format!("{object_to_delete:?} removal"))
|
||||
});
|
||||
@@ -141,8 +150,12 @@ pub(crate) async fn upload_remote_data(
|
||||
) -> ControlFlow<Uploads, Uploads> {
|
||||
info!("Creating {upload_tasks_count} remote files");
|
||||
let mut upload_tasks = JoinSet::new();
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
for i in 1..upload_tasks_count + 1 {
|
||||
let task_client = Arc::clone(client);
|
||||
let cancel = cancel.clone();
|
||||
|
||||
upload_tasks.spawn(async move {
|
||||
let prefix = format!("{base_prefix_str}/sub_prefix_{i}/");
|
||||
let blob_prefix = RemotePath::new(Utf8Path::new(&prefix))
|
||||
@@ -152,7 +165,9 @@ pub(crate) async fn upload_remote_data(
|
||||
|
||||
let (data, data_len) =
|
||||
upload_stream(format!("remote blob data {i}").into_bytes().into());
|
||||
task_client.upload(data, data_len, &blob_path, None).await?;
|
||||
task_client
|
||||
.upload(data, data_len, &blob_path, None, &cancel)
|
||||
.await?;
|
||||
|
||||
Ok::<_, anyhow::Error>((blob_prefix, blob_path))
|
||||
});
|
||||
|
||||
@@ -4,6 +4,7 @@ use remote_storage::RemotePath;
|
||||
use std::sync::Arc;
|
||||
use std::{collections::HashSet, num::NonZeroU32};
|
||||
use test_context::test_context;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::debug;
|
||||
|
||||
use crate::common::{download_to_vec, upload_stream, wrap_stream};
|
||||
@@ -45,13 +46,15 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
|
||||
}
|
||||
};
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let test_client = Arc::clone(&ctx.enabled.client);
|
||||
let expected_remote_prefixes = ctx.remote_prefixes.clone();
|
||||
|
||||
let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
|
||||
.context("common_prefix construction")?;
|
||||
let root_remote_prefixes = test_client
|
||||
.list_prefixes(None)
|
||||
.list_prefixes(None, &cancel)
|
||||
.await
|
||||
.context("client list root prefixes failure")?
|
||||
.into_iter()
|
||||
@@ -62,7 +65,7 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
|
||||
);
|
||||
|
||||
let nested_remote_prefixes = test_client
|
||||
.list_prefixes(Some(&base_prefix))
|
||||
.list_prefixes(Some(&base_prefix), &cancel)
|
||||
.await
|
||||
.context("client list nested prefixes failure")?
|
||||
.into_iter()
|
||||
@@ -99,11 +102,12 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a
|
||||
anyhow::bail!("S3 init failed: {e:?}")
|
||||
}
|
||||
};
|
||||
let cancel = CancellationToken::new();
|
||||
let test_client = Arc::clone(&ctx.enabled.client);
|
||||
let base_prefix =
|
||||
RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
|
||||
let root_files = test_client
|
||||
.list_files(None, None)
|
||||
.list_files(None, None, &cancel)
|
||||
.await
|
||||
.context("client list root files failure")?
|
||||
.into_iter()
|
||||
@@ -117,13 +121,13 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a
|
||||
// Test that max_keys limit works. In total there are about 21 files (see
|
||||
// upload_simple_remote_data call in test_real_s3.rs).
|
||||
let limited_root_files = test_client
|
||||
.list_files(None, Some(NonZeroU32::new(2).unwrap()))
|
||||
.list_files(None, Some(NonZeroU32::new(2).unwrap()), &cancel)
|
||||
.await
|
||||
.context("client list root files failure")?;
|
||||
assert_eq!(limited_root_files.len(), 2);
|
||||
|
||||
let nested_remote_files = test_client
|
||||
.list_files(Some(&base_prefix), None)
|
||||
.list_files(Some(&base_prefix), None, &cancel)
|
||||
.await
|
||||
.context("client list nested files failure")?
|
||||
.into_iter()
|
||||
@@ -150,12 +154,17 @@ async fn delete_non_exising_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Resu
|
||||
MaybeEnabledStorage::Disabled => return Ok(()),
|
||||
};
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let path = RemotePath::new(Utf8Path::new(
|
||||
format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(),
|
||||
))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
ctx.client.delete(&path).await.expect("should succeed");
|
||||
ctx.client
|
||||
.delete(&path, &cancel)
|
||||
.await
|
||||
.expect("should succeed");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -168,6 +177,8 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<(
|
||||
MaybeEnabledStorage::Disabled => return Ok(()),
|
||||
};
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
@@ -178,21 +189,21 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<(
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
let (data, len) = upload_stream("remote blob data1".as_bytes().into());
|
||||
ctx.client.upload(data, len, &path1, None).await?;
|
||||
ctx.client.upload(data, len, &path1, None, &cancel).await?;
|
||||
|
||||
let (data, len) = upload_stream("remote blob data2".as_bytes().into());
|
||||
ctx.client.upload(data, len, &path2, None).await?;
|
||||
ctx.client.upload(data, len, &path2, None, &cancel).await?;
|
||||
|
||||
let (data, len) = upload_stream("remote blob data3".as_bytes().into());
|
||||
ctx.client.upload(data, len, &path3, None).await?;
|
||||
ctx.client.upload(data, len, &path3, None, &cancel).await?;
|
||||
|
||||
ctx.client.delete_objects(&[path1, path2]).await?;
|
||||
ctx.client.delete_objects(&[path1, path2], &cancel).await?;
|
||||
|
||||
let prefixes = ctx.client.list_prefixes(None).await?;
|
||||
let prefixes = ctx.client.list_prefixes(None, &cancel).await?;
|
||||
|
||||
assert_eq!(prefixes.len(), 1);
|
||||
|
||||
ctx.client.delete_objects(&[path3]).await?;
|
||||
ctx.client.delete_objects(&[path3], &cancel).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -204,6 +215,8 @@ async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
@@ -211,47 +224,56 @@ async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<
|
||||
|
||||
let (data, len) = wrap_stream(orig.clone());
|
||||
|
||||
ctx.client.upload(data, len, &path, None).await?;
|
||||
ctx.client.upload(data, len, &path, None, &cancel).await?;
|
||||
|
||||
// Normal download request
|
||||
let dl = ctx.client.download(&path).await?;
|
||||
let dl = ctx.client.download(&path, &cancel).await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
assert_eq!(&buf, &orig);
|
||||
|
||||
// Full range (end specified)
|
||||
let dl = ctx
|
||||
.client
|
||||
.download_byte_range(&path, 0, Some(len as u64))
|
||||
.download_byte_range(&path, 0, Some(len as u64), &cancel)
|
||||
.await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
assert_eq!(&buf, &orig);
|
||||
|
||||
// partial range (end specified)
|
||||
let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
|
||||
let dl = ctx
|
||||
.client
|
||||
.download_byte_range(&path, 4, Some(10), &cancel)
|
||||
.await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
assert_eq!(&buf, &orig[4..10]);
|
||||
|
||||
// partial range (end beyond real end)
|
||||
let dl = ctx
|
||||
.client
|
||||
.download_byte_range(&path, 8, Some(len as u64 * 100))
|
||||
.download_byte_range(&path, 8, Some(len as u64 * 100), &cancel)
|
||||
.await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
assert_eq!(&buf, &orig[8..]);
|
||||
|
||||
// Partial range (end unspecified)
|
||||
let dl = ctx.client.download_byte_range(&path, 4, None).await?;
|
||||
let dl = ctx
|
||||
.client
|
||||
.download_byte_range(&path, 4, None, &cancel)
|
||||
.await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
assert_eq!(&buf, &orig[4..]);
|
||||
|
||||
// Full range (end unspecified)
|
||||
let dl = ctx.client.download_byte_range(&path, 0, None).await?;
|
||||
let dl = ctx
|
||||
.client
|
||||
.download_byte_range(&path, 0, None, &cancel)
|
||||
.await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
assert_eq!(&buf, &orig);
|
||||
|
||||
debug!("Cleanup: deleting file at path {path:?}");
|
||||
ctx.client
|
||||
.delete(&path)
|
||||
.delete(&path, &cancel)
|
||||
.await
|
||||
.with_context(|| format!("{path:?} removal"))?;
|
||||
|
||||
@@ -265,6 +287,8 @@ async fn copy_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let path = RemotePath::new(Utf8Path::new(
|
||||
format!("{}/file_to_copy", ctx.base_prefix).as_str(),
|
||||
))
|
||||
@@ -278,18 +302,18 @@ async fn copy_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
|
||||
|
||||
let (data, len) = wrap_stream(orig.clone());
|
||||
|
||||
ctx.client.upload(data, len, &path, None).await?;
|
||||
ctx.client.upload(data, len, &path, None, &cancel).await?;
|
||||
|
||||
// Normal download request
|
||||
ctx.client.copy_object(&path, &path_dest).await?;
|
||||
ctx.client.copy_object(&path, &path_dest, &cancel).await?;
|
||||
|
||||
let dl = ctx.client.download(&path_dest).await?;
|
||||
let dl = ctx.client.download(&path_dest, &cancel).await?;
|
||||
let buf = download_to_vec(dl).await?;
|
||||
assert_eq!(&buf, &orig);
|
||||
|
||||
debug!("Cleanup: deleting file at path {path:?}");
|
||||
ctx.client
|
||||
.delete_objects(&[path.clone(), path_dest.clone()])
|
||||
.delete_objects(&[path.clone(), path_dest.clone()], &cancel)
|
||||
.await
|
||||
.with_context(|| format!("{path:?} removal"))?;
|
||||
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
use std::collections::HashSet;
|
||||
use std::env;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::ops::ControlFlow;
|
||||
use std::sync::Arc;
|
||||
use std::time::UNIX_EPOCH;
|
||||
use std::{collections::HashSet, time::Duration};
|
||||
|
||||
use anyhow::Context;
|
||||
use remote_storage::{
|
||||
@@ -39,6 +39,17 @@ impl EnabledAzure {
|
||||
base_prefix: BASE_PREFIX,
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(unused)] // this will be needed when moving the timeout integration tests back
|
||||
fn configure_request_timeout(&mut self, timeout: Duration) {
|
||||
match Arc::get_mut(&mut self.client).expect("outer Arc::get_mut") {
|
||||
GenericRemoteStorage::AzureBlob(azure) => {
|
||||
let azure = Arc::get_mut(azure).expect("inner Arc::get_mut");
|
||||
azure.timeout = timeout;
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
enum MaybeEnabledStorage {
|
||||
@@ -213,6 +224,7 @@ fn create_azure_client(
|
||||
concurrency_limit: NonZeroUsize::new(100).unwrap(),
|
||||
max_keys_per_list_response,
|
||||
}),
|
||||
timeout: Duration::from_secs(120),
|
||||
};
|
||||
Ok(Arc::new(
|
||||
GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use std::env;
|
||||
use std::fmt::{Debug, Display};
|
||||
use std::future::Future;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::ops::ControlFlow;
|
||||
use std::sync::Arc;
|
||||
@@ -9,9 +10,10 @@ use std::{collections::HashSet, time::SystemTime};
|
||||
use crate::common::{download_to_vec, upload_stream};
|
||||
use anyhow::Context;
|
||||
use camino::Utf8Path;
|
||||
use futures_util::Future;
|
||||
use futures_util::StreamExt;
|
||||
use remote_storage::{
|
||||
GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
|
||||
DownloadError, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
|
||||
S3Config,
|
||||
};
|
||||
use test_context::test_context;
|
||||
use test_context::AsyncTestContext;
|
||||
@@ -27,7 +29,6 @@ use common::{cleanup, ensure_logging_ready, upload_remote_data, upload_simple_re
|
||||
use utils::backoff;
|
||||
|
||||
const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
|
||||
|
||||
const BASE_PREFIX: &str = "test";
|
||||
|
||||
#[test_context(MaybeEnabledStorage)]
|
||||
@@ -69,8 +70,11 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
|
||||
ret
|
||||
}
|
||||
|
||||
async fn list_files(client: &Arc<GenericRemoteStorage>) -> anyhow::Result<HashSet<RemotePath>> {
|
||||
Ok(retry(|| client.list_files(None, None))
|
||||
async fn list_files(
|
||||
client: &Arc<GenericRemoteStorage>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<HashSet<RemotePath>> {
|
||||
Ok(retry(|| client.list_files(None, None, cancel))
|
||||
.await
|
||||
.context("list root files failure")?
|
||||
.into_iter()
|
||||
@@ -90,11 +94,11 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
|
||||
|
||||
retry(|| {
|
||||
let (data, len) = upload_stream("remote blob data1".as_bytes().into());
|
||||
ctx.client.upload(data, len, &path1, None)
|
||||
ctx.client.upload(data, len, &path1, None, &cancel)
|
||||
})
|
||||
.await?;
|
||||
|
||||
let t0_files = list_files(&ctx.client).await?;
|
||||
let t0_files = list_files(&ctx.client, &cancel).await?;
|
||||
let t0 = time_point().await;
|
||||
println!("at t0: {t0_files:?}");
|
||||
|
||||
@@ -102,17 +106,17 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
|
||||
|
||||
retry(|| {
|
||||
let (data, len) = upload_stream(old_data.as_bytes().into());
|
||||
ctx.client.upload(data, len, &path2, None)
|
||||
ctx.client.upload(data, len, &path2, None, &cancel)
|
||||
})
|
||||
.await?;
|
||||
|
||||
let t1_files = list_files(&ctx.client).await?;
|
||||
let t1_files = list_files(&ctx.client, &cancel).await?;
|
||||
let t1 = time_point().await;
|
||||
println!("at t1: {t1_files:?}");
|
||||
|
||||
// A little check to ensure that our clock is not too far off from the S3 clock
|
||||
{
|
||||
let dl = retry(|| ctx.client.download(&path2)).await?;
|
||||
let dl = retry(|| ctx.client.download(&path2, &cancel)).await?;
|
||||
let last_modified = dl.last_modified.unwrap();
|
||||
let half_wt = WAIT_TIME.mul_f32(0.5);
|
||||
let t0_hwt = t0 + half_wt;
|
||||
@@ -125,7 +129,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
|
||||
|
||||
retry(|| {
|
||||
let (data, len) = upload_stream("remote blob data3".as_bytes().into());
|
||||
ctx.client.upload(data, len, &path3, None)
|
||||
ctx.client.upload(data, len, &path3, None, &cancel)
|
||||
})
|
||||
.await?;
|
||||
|
||||
@@ -133,12 +137,12 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
|
||||
|
||||
retry(|| {
|
||||
let (data, len) = upload_stream(new_data.as_bytes().into());
|
||||
ctx.client.upload(data, len, &path2, None)
|
||||
ctx.client.upload(data, len, &path2, None, &cancel)
|
||||
})
|
||||
.await?;
|
||||
|
||||
retry(|| ctx.client.delete(&path1)).await?;
|
||||
let t2_files = list_files(&ctx.client).await?;
|
||||
retry(|| ctx.client.delete(&path1, &cancel)).await?;
|
||||
let t2_files = list_files(&ctx.client, &cancel).await?;
|
||||
let t2 = time_point().await;
|
||||
println!("at t2: {t2_files:?}");
|
||||
|
||||
@@ -147,10 +151,10 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
|
||||
ctx.client
|
||||
.time_travel_recover(None, t2, t_final, &cancel)
|
||||
.await?;
|
||||
let t2_files_recovered = list_files(&ctx.client).await?;
|
||||
let t2_files_recovered = list_files(&ctx.client, &cancel).await?;
|
||||
println!("after recovery to t2: {t2_files_recovered:?}");
|
||||
assert_eq!(t2_files, t2_files_recovered);
|
||||
let path2_recovered_t2 = download_to_vec(ctx.client.download(&path2).await?).await?;
|
||||
let path2_recovered_t2 = download_to_vec(ctx.client.download(&path2, &cancel).await?).await?;
|
||||
assert_eq!(path2_recovered_t2, new_data.as_bytes());
|
||||
|
||||
// after recovery to t1: path1 is back, path2 has the old content
|
||||
@@ -158,10 +162,10 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
|
||||
ctx.client
|
||||
.time_travel_recover(None, t1, t_final, &cancel)
|
||||
.await?;
|
||||
let t1_files_recovered = list_files(&ctx.client).await?;
|
||||
let t1_files_recovered = list_files(&ctx.client, &cancel).await?;
|
||||
println!("after recovery to t1: {t1_files_recovered:?}");
|
||||
assert_eq!(t1_files, t1_files_recovered);
|
||||
let path2_recovered_t1 = download_to_vec(ctx.client.download(&path2).await?).await?;
|
||||
let path2_recovered_t1 = download_to_vec(ctx.client.download(&path2, &cancel).await?).await?;
|
||||
assert_eq!(path2_recovered_t1, old_data.as_bytes());
|
||||
|
||||
// after recovery to t0: everything is gone except for path1
|
||||
@@ -169,14 +173,14 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
|
||||
ctx.client
|
||||
.time_travel_recover(None, t0, t_final, &cancel)
|
||||
.await?;
|
||||
let t0_files_recovered = list_files(&ctx.client).await?;
|
||||
let t0_files_recovered = list_files(&ctx.client, &cancel).await?;
|
||||
println!("after recovery to t0: {t0_files_recovered:?}");
|
||||
assert_eq!(t0_files, t0_files_recovered);
|
||||
|
||||
// cleanup
|
||||
|
||||
let paths = &[path1, path2, path3];
|
||||
retry(|| ctx.client.delete_objects(paths)).await?;
|
||||
retry(|| ctx.client.delete_objects(paths, &cancel)).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -197,6 +201,16 @@ impl EnabledS3 {
|
||||
base_prefix: BASE_PREFIX,
|
||||
}
|
||||
}
|
||||
|
||||
fn configure_request_timeout(&mut self, timeout: Duration) {
|
||||
match Arc::get_mut(&mut self.client).expect("outer Arc::get_mut") {
|
||||
GenericRemoteStorage::AwsS3(s3) => {
|
||||
let s3 = Arc::get_mut(s3).expect("inner Arc::get_mut");
|
||||
s3.timeout = timeout;
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
enum MaybeEnabledStorage {
|
||||
@@ -370,8 +384,169 @@ fn create_s3_client(
|
||||
concurrency_limit: NonZeroUsize::new(100).unwrap(),
|
||||
max_keys_per_list_response,
|
||||
}),
|
||||
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
|
||||
};
|
||||
Ok(Arc::new(
|
||||
GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
|
||||
))
|
||||
}
|
||||
|
||||
#[test_context(MaybeEnabledStorage)]
|
||||
#[tokio::test]
|
||||
async fn download_is_timeouted(ctx: &mut MaybeEnabledStorage) {
|
||||
let MaybeEnabledStorage::Enabled(ctx) = ctx else {
|
||||
return;
|
||||
};
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let path = RemotePath::new(Utf8Path::new(
|
||||
format!("{}/file_to_copy", ctx.base_prefix).as_str(),
|
||||
))
|
||||
.unwrap();
|
||||
|
||||
let len = upload_large_enough_file(&ctx.client, &path, &cancel).await;
|
||||
|
||||
let timeout = std::time::Duration::from_secs(5);
|
||||
|
||||
ctx.configure_request_timeout(timeout);
|
||||
|
||||
let started_at = std::time::Instant::now();
|
||||
let mut stream = ctx
|
||||
.client
|
||||
.download(&path, &cancel)
|
||||
.await
|
||||
.expect("download succeeds")
|
||||
.download_stream;
|
||||
|
||||
if started_at.elapsed().mul_f32(0.9) >= timeout {
|
||||
tracing::warn!(
|
||||
elapsed_ms = started_at.elapsed().as_millis(),
|
||||
"timeout might be too low, consumed most of it during headers"
|
||||
);
|
||||
}
|
||||
|
||||
let first = stream
|
||||
.next()
|
||||
.await
|
||||
.expect("should have the first blob")
|
||||
.expect("should have succeeded");
|
||||
|
||||
tracing::info!(len = first.len(), "downloaded first chunk");
|
||||
|
||||
assert!(
|
||||
first.len() < len,
|
||||
"uploaded file is too small, we downloaded all on first chunk"
|
||||
);
|
||||
|
||||
tokio::time::sleep(timeout).await;
|
||||
|
||||
{
|
||||
let started_at = std::time::Instant::now();
|
||||
let next = stream
|
||||
.next()
|
||||
.await
|
||||
.expect("stream should not have ended yet");
|
||||
|
||||
tracing::info!(
|
||||
next.is_err = next.is_err(),
|
||||
elapsed_ms = started_at.elapsed().as_millis(),
|
||||
"received item after timeout"
|
||||
);
|
||||
|
||||
let e = next.expect_err("expected an error, but got a chunk?");
|
||||
|
||||
let inner = e.get_ref().expect("std::io::Error::inner should be set");
|
||||
assert!(
|
||||
inner
|
||||
.downcast_ref::<DownloadError>()
|
||||
.is_some_and(|e| matches!(e, DownloadError::Timeout)),
|
||||
"{inner:?}"
|
||||
);
|
||||
}
|
||||
|
||||
ctx.configure_request_timeout(RemoteStorageConfig::DEFAULT_TIMEOUT);
|
||||
|
||||
ctx.client.delete_objects(&[path], &cancel).await.unwrap()
|
||||
}
|
||||
|
||||
#[test_context(MaybeEnabledStorage)]
|
||||
#[tokio::test]
|
||||
async fn download_is_cancelled(ctx: &mut MaybeEnabledStorage) {
|
||||
let MaybeEnabledStorage::Enabled(ctx) = ctx else {
|
||||
return;
|
||||
};
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let path = RemotePath::new(Utf8Path::new(
|
||||
format!("{}/file_to_copy", ctx.base_prefix).as_str(),
|
||||
))
|
||||
.unwrap();
|
||||
|
||||
let len = upload_large_enough_file(&ctx.client, &path, &cancel).await;
|
||||
|
||||
{
|
||||
let mut stream = ctx
|
||||
.client
|
||||
.download(&path, &cancel)
|
||||
.await
|
||||
.expect("download succeeds")
|
||||
.download_stream;
|
||||
|
||||
let first = stream
|
||||
.next()
|
||||
.await
|
||||
.expect("should have the first blob")
|
||||
.expect("should have succeeded");
|
||||
|
||||
tracing::info!(len = first.len(), "downloaded first chunk");
|
||||
|
||||
assert!(
|
||||
first.len() < len,
|
||||
"uploaded file is too small, we downloaded all on first chunk"
|
||||
);
|
||||
|
||||
cancel.cancel();
|
||||
|
||||
let next = stream.next().await.expect("stream should have more");
|
||||
|
||||
let e = next.expect_err("expected an error, but got a chunk?");
|
||||
|
||||
let inner = e.get_ref().expect("std::io::Error::inner should be set");
|
||||
assert!(
|
||||
inner
|
||||
.downcast_ref::<DownloadError>()
|
||||
.is_some_and(|e| matches!(e, DownloadError::Cancelled)),
|
||||
"{inner:?}"
|
||||
);
|
||||
}
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
ctx.client.delete_objects(&[path], &cancel).await.unwrap();
|
||||
}
|
||||
|
||||
/// Upload a long enough file so that we cannot download it in single chunk
|
||||
///
|
||||
/// For s3 the first chunk seems to be less than 10kB, so this has a bit of a safety margin
|
||||
async fn upload_large_enough_file(
|
||||
client: &GenericRemoteStorage,
|
||||
path: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> usize {
|
||||
let header = bytes::Bytes::from_static("remote blob data content".as_bytes());
|
||||
let body = bytes::Bytes::from(vec![0u8; 1024]);
|
||||
let contents = std::iter::once(header).chain(std::iter::repeat(body).take(128));
|
||||
|
||||
let len = contents.clone().fold(0, |acc, next| acc + next.len());
|
||||
|
||||
let contents = futures::stream::iter(contents.map(std::io::Result::Ok));
|
||||
|
||||
client
|
||||
.upload(contents, len, path, None, cancel)
|
||||
.await
|
||||
.expect("upload succeeds");
|
||||
|
||||
len
|
||||
}
|
||||
|
||||
@@ -25,6 +25,7 @@ hyper = { workspace = true, features = ["full"] }
|
||||
fail.workspace = true
|
||||
futures = { workspace = true}
|
||||
jsonwebtoken.workspace = true
|
||||
leaky-bucket.workspace = true
|
||||
nix.workspace = true
|
||||
once_cell.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
#![allow(unused)]
|
||||
|
||||
use criterion::{criterion_group, criterion_main, Criterion};
|
||||
use utils::id;
|
||||
|
||||
|
||||
@@ -29,6 +29,9 @@ pub enum Scope {
|
||||
// Should only be used e.g. for status check.
|
||||
// Currently also used for connection from any pageserver to any safekeeper.
|
||||
SafekeeperData,
|
||||
// The scope used by pageservers in upcalls to storage controller and cloud control plane
|
||||
#[serde(rename = "generations_api")]
|
||||
GenerationsApi,
|
||||
}
|
||||
|
||||
/// JWT payload. See docs/authentication.md for the format
|
||||
|
||||
@@ -12,6 +12,7 @@ testing = ["fail/failpoints"]
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
arc-swap.workspace = true
|
||||
async-compression.workspace = true
|
||||
async-stream.workspace = true
|
||||
async-trait.workspace = true
|
||||
@@ -35,6 +36,7 @@ humantime.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
hyper.workspace = true
|
||||
itertools.workspace = true
|
||||
leaky-bucket.workspace = true
|
||||
md5.workspace = true
|
||||
nix.workspace = true
|
||||
# hack to get the number of worker threads tokio uses
|
||||
@@ -82,7 +84,7 @@ workspace_hack.workspace = true
|
||||
reqwest.workspace = true
|
||||
rpds.workspace = true
|
||||
enum-map.workspace = true
|
||||
enumset.workspace = true
|
||||
enumset = { workspace = true, features = ["serde"]}
|
||||
strum.workspace = true
|
||||
strum_macros.workspace = true
|
||||
|
||||
|
||||
@@ -6,14 +6,28 @@
|
||||
//! There are two sets of inputs; `short` and `medium`. They were collected on postgres v14 by
|
||||
//! logging what happens when a sequential scan is requested on a small table, then picking out two
|
||||
//! suitable from logs.
|
||||
//!
|
||||
//!
|
||||
//! Reference data (git blame to see commit) on an i3en.3xlarge
|
||||
// ```text
|
||||
//! short/short/1 time: [39.175 µs 39.348 µs 39.536 µs]
|
||||
//! short/short/2 time: [51.227 µs 51.487 µs 51.755 µs]
|
||||
//! short/short/4 time: [76.048 µs 76.362 µs 76.674 µs]
|
||||
//! short/short/8 time: [128.94 µs 129.82 µs 130.74 µs]
|
||||
//! short/short/16 time: [227.84 µs 229.00 µs 230.28 µs]
|
||||
//! short/short/32 time: [455.97 µs 457.81 µs 459.90 µs]
|
||||
//! short/short/64 time: [902.46 µs 904.84 µs 907.32 µs]
|
||||
//! short/short/128 time: [1.7416 ms 1.7487 ms 1.7561 ms]
|
||||
//! ``
|
||||
|
||||
use std::sync::{Arc, Barrier};
|
||||
use std::sync::Arc;
|
||||
|
||||
use bytes::{Buf, Bytes};
|
||||
use pageserver::{
|
||||
config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager,
|
||||
};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use tokio::task::JoinSet;
|
||||
use utils::{id::TenantId, lsn::Lsn};
|
||||
|
||||
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
|
||||
@@ -39,11 +53,11 @@ fn redo_scenarios(c: &mut Criterion) {
|
||||
.build()
|
||||
.unwrap();
|
||||
tracing::info!("executing first");
|
||||
short().execute(rt.handle(), &manager).unwrap();
|
||||
rt.block_on(short().execute(&manager)).unwrap();
|
||||
tracing::info!("first executed");
|
||||
}
|
||||
|
||||
let thread_counts = [1, 2, 4, 8, 16];
|
||||
let thread_counts = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||
|
||||
let mut group = c.benchmark_group("short");
|
||||
group.sampling_mode(criterion::SamplingMode::Flat);
|
||||
@@ -74,114 +88,69 @@ fn redo_scenarios(c: &mut Criterion) {
|
||||
drop(group);
|
||||
}
|
||||
|
||||
/// Sets up `threads` number of requesters to `request_redo`, with the given input.
|
||||
/// Sets up a multi-threaded tokio runtime with default worker thread count,
|
||||
/// then, spawn `requesters` tasks that repeatedly:
|
||||
/// - get input from `input_factor()`
|
||||
/// - call `manager.request_redo()` with their input
|
||||
///
|
||||
/// This stress-tests the scalability of a single walredo manager at high tokio-level concurrency.
|
||||
///
|
||||
/// Using tokio's default worker thread count means the results will differ on machines
|
||||
/// with different core countrs. We don't care about that, the performance will always
|
||||
/// be different on different hardware. To compare performance of different software versions,
|
||||
/// use the same hardware.
|
||||
fn add_multithreaded_walredo_requesters(
|
||||
b: &mut criterion::Bencher,
|
||||
threads: u32,
|
||||
nrequesters: usize,
|
||||
manager: &Arc<PostgresRedoManager>,
|
||||
input_factory: fn() -> Request,
|
||||
) {
|
||||
assert_ne!(threads, 0);
|
||||
assert_ne!(nrequesters, 0);
|
||||
|
||||
if threads == 1 {
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
let handle = rt.handle();
|
||||
b.iter_batched_ref(
|
||||
|| Some(input_factory()),
|
||||
|input| execute_all(input.take(), handle, manager),
|
||||
criterion::BatchSize::PerIteration,
|
||||
);
|
||||
} else {
|
||||
let (work_tx, work_rx) = std::sync::mpsc::sync_channel(threads as usize);
|
||||
let rt = tokio::runtime::Builder::new_multi_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let work_rx = std::sync::Arc::new(std::sync::Mutex::new(work_rx));
|
||||
let barrier = Arc::new(tokio::sync::Barrier::new(nrequesters + 1));
|
||||
|
||||
let barrier = Arc::new(Barrier::new(threads as usize + 1));
|
||||
|
||||
let jhs = (0..threads)
|
||||
.map(|_| {
|
||||
std::thread::spawn({
|
||||
let manager = manager.clone();
|
||||
let barrier = barrier.clone();
|
||||
let work_rx = work_rx.clone();
|
||||
move || {
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
let handle = rt.handle();
|
||||
loop {
|
||||
// queue up and wait if we want to go another round
|
||||
if work_rx.lock().unwrap().recv().is_err() {
|
||||
break;
|
||||
}
|
||||
|
||||
let input = Some(input_factory());
|
||||
|
||||
barrier.wait();
|
||||
|
||||
execute_all(input, handle, &manager).unwrap();
|
||||
|
||||
barrier.wait();
|
||||
}
|
||||
}
|
||||
})
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let _jhs = JoinOnDrop(jhs);
|
||||
|
||||
b.iter_batched(
|
||||
|| {
|
||||
for _ in 0..threads {
|
||||
work_tx.send(()).unwrap()
|
||||
}
|
||||
},
|
||||
|()| {
|
||||
// start the work
|
||||
barrier.wait();
|
||||
|
||||
// wait for work to complete
|
||||
barrier.wait();
|
||||
},
|
||||
criterion::BatchSize::PerIteration,
|
||||
);
|
||||
|
||||
drop(work_tx);
|
||||
let mut requesters = JoinSet::new();
|
||||
for _ in 0..nrequesters {
|
||||
let _entered = rt.enter();
|
||||
let manager = manager.clone();
|
||||
let barrier = barrier.clone();
|
||||
requesters.spawn(async move {
|
||||
loop {
|
||||
let input = input_factory();
|
||||
barrier.wait().await;
|
||||
let page = input.execute(&manager).await.unwrap();
|
||||
assert_eq!(page.remaining(), 8192);
|
||||
barrier.wait().await;
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
struct JoinOnDrop(Vec<std::thread::JoinHandle<()>>);
|
||||
let do_one_iteration = || {
|
||||
rt.block_on(async {
|
||||
barrier.wait().await;
|
||||
// wait for work to complete
|
||||
barrier.wait().await;
|
||||
})
|
||||
};
|
||||
|
||||
impl Drop for JoinOnDrop {
|
||||
// it's not really needless because we want join all then check for panicks
|
||||
#[allow(clippy::needless_collect)]
|
||||
fn drop(&mut self) {
|
||||
// first join all
|
||||
let results = self.0.drain(..).map(|jh| jh.join()).collect::<Vec<_>>();
|
||||
// then check the results; panicking here is not great, but it does get the message across
|
||||
// to the user, and sets an exit value.
|
||||
results.into_iter().try_for_each(|res| res).unwrap();
|
||||
}
|
||||
}
|
||||
b.iter_batched(
|
||||
|| {
|
||||
// warmup
|
||||
do_one_iteration();
|
||||
},
|
||||
|()| {
|
||||
// work loop
|
||||
do_one_iteration();
|
||||
},
|
||||
criterion::BatchSize::PerIteration,
|
||||
);
|
||||
|
||||
fn execute_all<I>(
|
||||
input: I,
|
||||
handle: &tokio::runtime::Handle,
|
||||
manager: &PostgresRedoManager,
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
I: IntoIterator<Item = Request>,
|
||||
{
|
||||
// just fire all requests as fast as possible
|
||||
input.into_iter().try_for_each(|req| {
|
||||
let page = req.execute(handle, manager)?;
|
||||
assert_eq!(page.remaining(), 8192);
|
||||
anyhow::Ok(())
|
||||
})
|
||||
rt.block_on(requesters.shutdown());
|
||||
}
|
||||
|
||||
criterion_group!(benches, redo_scenarios);
|
||||
@@ -493,11 +462,7 @@ struct Request {
|
||||
}
|
||||
|
||||
impl Request {
|
||||
fn execute(
|
||||
self,
|
||||
rt: &tokio::runtime::Handle,
|
||||
manager: &PostgresRedoManager,
|
||||
) -> anyhow::Result<Bytes> {
|
||||
async fn execute(self, manager: &PostgresRedoManager) -> anyhow::Result<Bytes> {
|
||||
let Request {
|
||||
key,
|
||||
lsn,
|
||||
@@ -506,6 +471,8 @@ impl Request {
|
||||
pg_version,
|
||||
} = self;
|
||||
|
||||
rt.block_on(manager.request_redo(key, lsn, base_img, records, pg_version))
|
||||
manager
|
||||
.request_redo(key, lsn, base_img, records, pg_version)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -217,6 +217,20 @@ impl Client {
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn tenant_time_travel_remote_storage(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timestamp: &str,
|
||||
done_if_after: &str,
|
||||
) -> Result<()> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{tenant_shard_id}/time_travel_remote_storage?travel_to={timestamp}&done_if_after={done_if_after}",
|
||||
self.mgmt_api_endpoint
|
||||
);
|
||||
self.request(Method::PUT, &uri, ()).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
|
||||
let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
|
||||
self.request(Method::PUT, &uri, req).await?;
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
use anyhow::Context;
|
||||
use camino::Utf8PathBuf;
|
||||
use futures::future::join_all;
|
||||
use pageserver_api::key::{is_rel_block_key, key_to_rel_block, Key};
|
||||
use pageserver_api::keyspace::KeySpaceAccum;
|
||||
use pageserver_api::models::PagestreamGetPageRequest;
|
||||
@@ -10,11 +9,10 @@ use utils::id::TenantTimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use rand::prelude::*;
|
||||
use tokio::sync::Barrier;
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::{info, instrument};
|
||||
use tracing::info;
|
||||
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::collections::HashSet;
|
||||
use std::future::Future;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::pin::Pin;
|
||||
@@ -38,8 +36,12 @@ pub(crate) struct Args {
|
||||
num_clients: NonZeroUsize,
|
||||
#[clap(long)]
|
||||
runtime: Option<humantime::Duration>,
|
||||
/// Each client sends requests at the given rate.
|
||||
///
|
||||
/// If a request takes too long and we should be issuing a new request already,
|
||||
/// we skip that request and account it as `MISSED`.
|
||||
#[clap(long)]
|
||||
per_target_rate_limit: Option<usize>,
|
||||
per_client_rate: Option<usize>,
|
||||
/// Probability for sending `latest=true` in the request (uniform distribution).
|
||||
#[clap(long, default_value = "1")]
|
||||
req_latest_probability: f64,
|
||||
@@ -61,12 +63,16 @@ pub(crate) struct Args {
|
||||
#[derive(Debug, Default)]
|
||||
struct LiveStats {
|
||||
completed_requests: AtomicU64,
|
||||
missed: AtomicU64,
|
||||
}
|
||||
|
||||
impl LiveStats {
|
||||
fn inc(&self) {
|
||||
fn request_done(&self) {
|
||||
self.completed_requests.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
fn missed(&self, n: u64) {
|
||||
self.missed.fetch_add(n, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, serde::Serialize, serde::Deserialize)]
|
||||
@@ -220,13 +226,12 @@ async fn main_impl(
|
||||
|
||||
let live_stats = Arc::new(LiveStats::default());
|
||||
|
||||
let num_client_tasks = args.num_clients.get() * timelines.len();
|
||||
let num_live_stats_dump = 1;
|
||||
let num_work_sender_tasks = 1;
|
||||
let num_work_sender_tasks = args.num_clients.get() * timelines.len();
|
||||
let num_main_impl = 1;
|
||||
|
||||
let start_work_barrier = Arc::new(tokio::sync::Barrier::new(
|
||||
num_client_tasks + num_live_stats_dump + num_work_sender_tasks + num_main_impl,
|
||||
num_live_stats_dump + num_work_sender_tasks + num_main_impl,
|
||||
));
|
||||
|
||||
tokio::spawn({
|
||||
@@ -238,10 +243,12 @@ async fn main_impl(
|
||||
let start = std::time::Instant::now();
|
||||
tokio::time::sleep(std::time::Duration::from_secs(1)).await;
|
||||
let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed);
|
||||
let missed = stats.missed.swap(0, Ordering::Relaxed);
|
||||
let elapsed = start.elapsed();
|
||||
info!(
|
||||
"RPS: {:.0}",
|
||||
completed_requests as f64 / elapsed.as_secs_f64()
|
||||
"RPS: {:.0} MISSED: {:.0}",
|
||||
completed_requests as f64 / elapsed.as_secs_f64(),
|
||||
missed as f64 / elapsed.as_secs_f64()
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -249,127 +256,105 @@ async fn main_impl(
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let mut work_senders: HashMap<WorkerId, _> = HashMap::new();
|
||||
let mut tasks = Vec::new();
|
||||
let rps_period = args
|
||||
.per_client_rate
|
||||
.map(|rps_limit| Duration::from_secs_f64(1.0 / (rps_limit as f64)));
|
||||
let make_worker: &dyn Fn(WorkerId) -> Pin<Box<dyn Send + Future<Output = ()>>> = &|worker_id| {
|
||||
let live_stats = live_stats.clone();
|
||||
let start_work_barrier = start_work_barrier.clone();
|
||||
let ranges: Vec<KeyRange> = all_ranges
|
||||
.iter()
|
||||
.filter(|r| r.timeline == worker_id.timeline)
|
||||
.cloned()
|
||||
.collect();
|
||||
let weights =
|
||||
rand::distributions::weighted::WeightedIndex::new(ranges.iter().map(|v| v.len()))
|
||||
.unwrap();
|
||||
|
||||
let cancel = cancel.clone();
|
||||
Box::pin(async move {
|
||||
let client =
|
||||
pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
|
||||
.await
|
||||
.unwrap();
|
||||
let mut client = client
|
||||
.pagestream(worker_id.timeline.tenant_id, worker_id.timeline.timeline_id)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
start_work_barrier.wait().await;
|
||||
let client_start = Instant::now();
|
||||
let mut ticks_processed = 0;
|
||||
while !cancel.is_cancelled() {
|
||||
// Detect if a request took longer than the RPS rate
|
||||
if let Some(period) = &rps_period {
|
||||
let periods_passed_until_now =
|
||||
usize::try_from(client_start.elapsed().as_micros() / period.as_micros())
|
||||
.unwrap();
|
||||
|
||||
if periods_passed_until_now > ticks_processed {
|
||||
live_stats.missed((periods_passed_until_now - ticks_processed) as u64);
|
||||
}
|
||||
ticks_processed = periods_passed_until_now;
|
||||
}
|
||||
|
||||
let start = Instant::now();
|
||||
let req = {
|
||||
let mut rng = rand::thread_rng();
|
||||
let r = &ranges[weights.sample(&mut rng)];
|
||||
let key: i128 = rng.gen_range(r.start..r.end);
|
||||
let key = Key::from_i128(key);
|
||||
assert!(is_rel_block_key(&key));
|
||||
let (rel_tag, block_no) =
|
||||
key_to_rel_block(key).expect("we filter non-rel-block keys out above");
|
||||
PagestreamGetPageRequest {
|
||||
latest: rng.gen_bool(args.req_latest_probability),
|
||||
lsn: r.timeline_lsn,
|
||||
rel: rel_tag,
|
||||
blkno: block_no,
|
||||
}
|
||||
};
|
||||
client.getpage(req).await.unwrap();
|
||||
let end = Instant::now();
|
||||
live_stats.request_done();
|
||||
ticks_processed += 1;
|
||||
STATS.with(|stats| {
|
||||
stats
|
||||
.borrow()
|
||||
.lock()
|
||||
.unwrap()
|
||||
.observe(end.duration_since(start))
|
||||
.unwrap();
|
||||
});
|
||||
|
||||
if let Some(period) = &rps_period {
|
||||
let next_at = client_start
|
||||
+ Duration::from_micros(
|
||||
(ticks_processed) as u64 * u64::try_from(period.as_micros()).unwrap(),
|
||||
);
|
||||
tokio::time::sleep_until(next_at.into()).await;
|
||||
}
|
||||
}
|
||||
})
|
||||
};
|
||||
|
||||
info!("spawning workers");
|
||||
let mut workers = JoinSet::new();
|
||||
for timeline in timelines.iter().cloned() {
|
||||
for num_client in 0..args.num_clients.get() {
|
||||
let (sender, receiver) = tokio::sync::mpsc::channel(10); // TODO: not sure what the implications of this are
|
||||
let worker_id = WorkerId {
|
||||
timeline,
|
||||
num_client,
|
||||
};
|
||||
work_senders.insert(worker_id, sender);
|
||||
tasks.push(tokio::spawn(client(
|
||||
args,
|
||||
worker_id,
|
||||
Arc::clone(&start_work_barrier),
|
||||
receiver,
|
||||
Arc::clone(&live_stats),
|
||||
cancel.clone(),
|
||||
)));
|
||||
workers.spawn(make_worker(worker_id));
|
||||
}
|
||||
}
|
||||
|
||||
let work_sender: Pin<Box<dyn Send + Future<Output = ()>>> = {
|
||||
let start_work_barrier = start_work_barrier.clone();
|
||||
let cancel = cancel.clone();
|
||||
match args.per_target_rate_limit {
|
||||
None => Box::pin(async move {
|
||||
let weights = rand::distributions::weighted::WeightedIndex::new(
|
||||
all_ranges.iter().map(|v| v.len()),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
start_work_barrier.wait().await;
|
||||
|
||||
while !cancel.is_cancelled() {
|
||||
let (timeline, req) = {
|
||||
let mut rng = rand::thread_rng();
|
||||
let r = &all_ranges[weights.sample(&mut rng)];
|
||||
let key: i128 = rng.gen_range(r.start..r.end);
|
||||
let key = Key::from_i128(key);
|
||||
let (rel_tag, block_no) =
|
||||
key_to_rel_block(key).expect("we filter non-rel-block keys out above");
|
||||
(
|
||||
WorkerId {
|
||||
timeline: r.timeline,
|
||||
num_client: rng.gen_range(0..args.num_clients.get()),
|
||||
},
|
||||
PagestreamGetPageRequest {
|
||||
latest: rng.gen_bool(args.req_latest_probability),
|
||||
lsn: r.timeline_lsn,
|
||||
rel: rel_tag,
|
||||
blkno: block_no,
|
||||
},
|
||||
)
|
||||
};
|
||||
let sender = work_senders.get(&timeline).unwrap();
|
||||
// TODO: what if this blocks?
|
||||
if sender.send(req).await.is_err() {
|
||||
assert!(cancel.is_cancelled(), "client has gone away unexpectedly");
|
||||
}
|
||||
}
|
||||
}),
|
||||
Some(rps_limit) => Box::pin(async move {
|
||||
let period = Duration::from_secs_f64(1.0 / (rps_limit as f64));
|
||||
let make_task: &dyn Fn(WorkerId) -> Pin<Box<dyn Send + Future<Output = ()>>> =
|
||||
&|worker_id| {
|
||||
let sender = work_senders.get(&worker_id).unwrap();
|
||||
let ranges: Vec<KeyRange> = all_ranges
|
||||
.iter()
|
||||
.filter(|r| r.timeline == worker_id.timeline)
|
||||
.cloned()
|
||||
.collect();
|
||||
let weights = rand::distributions::weighted::WeightedIndex::new(
|
||||
ranges.iter().map(|v| v.len()),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let cancel = cancel.clone();
|
||||
Box::pin(async move {
|
||||
let mut ticker = tokio::time::interval(period);
|
||||
ticker.set_missed_tick_behavior(
|
||||
/* TODO review this choice */
|
||||
tokio::time::MissedTickBehavior::Burst,
|
||||
);
|
||||
while !cancel.is_cancelled() {
|
||||
ticker.tick().await;
|
||||
let req = {
|
||||
let mut rng = rand::thread_rng();
|
||||
let r = &ranges[weights.sample(&mut rng)];
|
||||
let key: i128 = rng.gen_range(r.start..r.end);
|
||||
let key = Key::from_i128(key);
|
||||
assert!(is_rel_block_key(&key));
|
||||
let (rel_tag, block_no) = key_to_rel_block(key)
|
||||
.expect("we filter non-rel-block keys out above");
|
||||
PagestreamGetPageRequest {
|
||||
latest: rng.gen_bool(args.req_latest_probability),
|
||||
lsn: r.timeline_lsn,
|
||||
rel: rel_tag,
|
||||
blkno: block_no,
|
||||
}
|
||||
};
|
||||
if sender.send(req).await.is_err() {
|
||||
assert!(
|
||||
cancel.is_cancelled(),
|
||||
"client has gone away unexpectedly"
|
||||
);
|
||||
}
|
||||
}
|
||||
})
|
||||
};
|
||||
|
||||
let tasks: Vec<_> = work_senders.keys().map(|tl| make_task(*tl)).collect();
|
||||
|
||||
start_work_barrier.wait().await;
|
||||
|
||||
join_all(tasks).await;
|
||||
}),
|
||||
let workers = async move {
|
||||
while let Some(res) = workers.join_next().await {
|
||||
res.unwrap();
|
||||
}
|
||||
};
|
||||
|
||||
let work_sender_task = tokio::spawn(work_sender);
|
||||
|
||||
info!("waiting for everything to become ready");
|
||||
start_work_barrier.wait().await;
|
||||
info!("work started");
|
||||
@@ -377,20 +362,13 @@ async fn main_impl(
|
||||
tokio::time::sleep(runtime.into()).await;
|
||||
info!("runtime over, signalling cancellation");
|
||||
cancel.cancel();
|
||||
work_sender_task.await.unwrap();
|
||||
workers.await;
|
||||
info!("work sender exited");
|
||||
} else {
|
||||
work_sender_task.await.unwrap();
|
||||
workers.await;
|
||||
unreachable!("work sender never terminates");
|
||||
}
|
||||
|
||||
info!("joining clients");
|
||||
for t in tasks {
|
||||
t.await.unwrap();
|
||||
}
|
||||
|
||||
info!("all clients stopped");
|
||||
|
||||
let output = Output {
|
||||
total: {
|
||||
let mut agg_stats = request_stats::Stats::new();
|
||||
@@ -407,49 +385,3 @@ async fn main_impl(
|
||||
|
||||
anyhow::Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
async fn client(
|
||||
args: &'static Args,
|
||||
id: WorkerId,
|
||||
start_work_barrier: Arc<Barrier>,
|
||||
mut work: tokio::sync::mpsc::Receiver<PagestreamGetPageRequest>,
|
||||
live_stats: Arc<LiveStats>,
|
||||
cancel: CancellationToken,
|
||||
) {
|
||||
let WorkerId {
|
||||
timeline,
|
||||
num_client: _,
|
||||
} = id;
|
||||
let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
|
||||
.await
|
||||
.unwrap();
|
||||
let mut client = client
|
||||
.pagestream(timeline.tenant_id, timeline.timeline_id)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let do_requests = async {
|
||||
start_work_barrier.wait().await;
|
||||
while let Some(req) = work.recv().await {
|
||||
let start = Instant::now();
|
||||
client
|
||||
.getpage(req)
|
||||
.await
|
||||
.with_context(|| format!("getpage for {timeline}"))
|
||||
.unwrap();
|
||||
let elapsed = start.elapsed();
|
||||
live_stats.inc();
|
||||
STATS.with(|stats| {
|
||||
stats.borrow().lock().unwrap().observe(elapsed).unwrap();
|
||||
});
|
||||
}
|
||||
};
|
||||
tokio::select! {
|
||||
res = do_requests => { res },
|
||||
_ = cancel.cancelled() => {
|
||||
// fallthrough to shutdown
|
||||
}
|
||||
}
|
||||
client.shutdown().await;
|
||||
}
|
||||
|
||||
@@ -14,8 +14,12 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
|
||||
}
|
||||
(Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope
|
||||
(Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope
|
||||
(Scope::SafekeeperData, _) => Err(AuthError(
|
||||
"SafekeeperData scope makes no sense for Pageserver".into(),
|
||||
(Scope::SafekeeperData | Scope::GenerationsApi, _) => Err(AuthError(
|
||||
format!(
|
||||
"JWT scope '{:?}' is ineligible for Pageserver auth",
|
||||
claims.scope
|
||||
)
|
||||
.into(),
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -261,10 +261,7 @@ where
|
||||
let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);
|
||||
|
||||
for part in slru_partitions.parts {
|
||||
let blocks = self
|
||||
.timeline
|
||||
.get_vectored(&part.ranges, self.lsn, self.ctx)
|
||||
.await?;
|
||||
let blocks = self.timeline.get_vectored(part, self.lsn, self.ctx).await?;
|
||||
|
||||
for (key, block) in blocks {
|
||||
slru_builder.add_block(&key, block?).await?;
|
||||
|
||||
@@ -33,12 +33,13 @@ use utils::{
|
||||
use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
|
||||
use crate::tenant::config::TenantConf;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::timeline::GetVectoredImpl;
|
||||
use crate::tenant::{
|
||||
TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
|
||||
};
|
||||
use crate::virtual_file;
|
||||
use crate::{
|
||||
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
|
||||
IGNORED_TENANT_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
|
||||
TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
|
||||
};
|
||||
|
||||
@@ -84,6 +85,8 @@ pub mod defaults {
|
||||
|
||||
pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs";
|
||||
|
||||
pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential";
|
||||
|
||||
///
|
||||
/// Default built-in configuration file.
|
||||
///
|
||||
@@ -121,6 +124,8 @@ pub mod defaults {
|
||||
|
||||
#virtual_file_io_engine = '{DEFAULT_VIRTUAL_FILE_IO_ENGINE}'
|
||||
|
||||
#get_vectored_impl = '{DEFAULT_GET_VECTORED_IMPL}'
|
||||
|
||||
[tenant_config]
|
||||
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
|
||||
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
|
||||
@@ -256,6 +261,8 @@ pub struct PageServerConf {
|
||||
pub ingest_batch_size: u64,
|
||||
|
||||
pub virtual_file_io_engine: virtual_file::IoEngineKind,
|
||||
|
||||
pub get_vectored_impl: GetVectoredImpl,
|
||||
}
|
||||
|
||||
/// We do not want to store this in a PageServerConf because the latter may be logged
|
||||
@@ -342,6 +349,8 @@ struct PageServerConfigBuilder {
|
||||
ingest_batch_size: BuilderValue<u64>,
|
||||
|
||||
virtual_file_io_engine: BuilderValue<virtual_file::IoEngineKind>,
|
||||
|
||||
get_vectored_impl: BuilderValue<GetVectoredImpl>,
|
||||
}
|
||||
|
||||
impl Default for PageServerConfigBuilder {
|
||||
@@ -419,6 +428,8 @@ impl Default for PageServerConfigBuilder {
|
||||
ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE),
|
||||
|
||||
virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()),
|
||||
|
||||
get_vectored_impl: Set(DEFAULT_GET_VECTORED_IMPL.parse().unwrap()),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -579,6 +590,10 @@ impl PageServerConfigBuilder {
|
||||
self.virtual_file_io_engine = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn get_vectored_impl(&mut self, value: GetVectoredImpl) {
|
||||
self.get_vectored_impl = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
let concurrent_tenant_warmup = self
|
||||
.concurrent_tenant_warmup
|
||||
@@ -689,6 +704,9 @@ impl PageServerConfigBuilder {
|
||||
virtual_file_io_engine: self
|
||||
.virtual_file_io_engine
|
||||
.ok_or(anyhow!("missing virtual_file_io_engine"))?,
|
||||
get_vectored_impl: self
|
||||
.get_vectored_impl
|
||||
.ok_or(anyhow!("missing get_vectored_impl"))?,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -808,17 +826,6 @@ impl PageServerConf {
|
||||
.join(connection_id.to_string())
|
||||
}
|
||||
|
||||
/// Points to a place in pageserver's local directory,
|
||||
/// where certain timeline's metadata file should be located.
|
||||
pub fn metadata_path(
|
||||
&self,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline_id: &TimelineId,
|
||||
) -> Utf8PathBuf {
|
||||
self.timeline_path(tenant_shard_id, timeline_id)
|
||||
.join(METADATA_FILE_NAME)
|
||||
}
|
||||
|
||||
/// Turns storage remote path of a file into its local path.
|
||||
pub fn local_path(&self, remote_path: &RemotePath) -> Utf8PathBuf {
|
||||
remote_path.with_base(&self.workdir)
|
||||
@@ -943,6 +950,9 @@ impl PageServerConf {
|
||||
"virtual_file_io_engine" => {
|
||||
builder.virtual_file_io_engine(parse_toml_from_str("virtual_file_io_engine", item)?)
|
||||
}
|
||||
"get_vectored_impl" => {
|
||||
builder.get_vectored_impl(parse_toml_from_str("get_vectored_impl", item)?)
|
||||
}
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
}
|
||||
}
|
||||
@@ -1017,6 +1027,7 @@ impl PageServerConf {
|
||||
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
|
||||
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
|
||||
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
|
||||
get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1250,6 +1261,7 @@ background_task_maximum_delay = '334 s'
|
||||
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
|
||||
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
|
||||
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
|
||||
get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
|
||||
},
|
||||
"Correct defaults should be used when no config values are provided"
|
||||
);
|
||||
@@ -1314,6 +1326,7 @@ background_task_maximum_delay = '334 s'
|
||||
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
|
||||
ingest_batch_size: 100,
|
||||
virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
|
||||
get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
|
||||
},
|
||||
"Should be able to parse all basic config values correctly"
|
||||
);
|
||||
@@ -1359,6 +1372,7 @@ broker_endpoint = '{broker_endpoint}'
|
||||
parsed_remote_storage_config,
|
||||
RemoteStorageConfig {
|
||||
storage: RemoteStorageKind::LocalFs(local_storage_path.clone()),
|
||||
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
|
||||
},
|
||||
"Remote storage config should correctly parse the local FS config and fill other storage defaults"
|
||||
);
|
||||
@@ -1426,6 +1440,7 @@ broker_endpoint = '{broker_endpoint}'
|
||||
concurrency_limit: s3_concurrency_limit,
|
||||
max_keys_per_list_response: None,
|
||||
}),
|
||||
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
|
||||
},
|
||||
"Remote storage config should correctly parse the S3 config"
|
||||
);
|
||||
@@ -1546,17 +1561,50 @@ threshold = "20m"
|
||||
eviction_order: crate::disk_usage_eviction_task::EvictionOrder::AbsoluteAccessed,
|
||||
})
|
||||
);
|
||||
|
||||
match &conf.default_tenant_conf.eviction_policy {
|
||||
EvictionPolicy::NoEviction => panic!("Unexpected eviction opolicy tenant settings"),
|
||||
EvictionPolicy::LayerAccessThreshold(eviction_thresold) => {
|
||||
assert_eq!(eviction_thresold.period, Duration::from_secs(20 * 60));
|
||||
assert_eq!(eviction_thresold.threshold, Duration::from_secs(20 * 60));
|
||||
EvictionPolicy::LayerAccessThreshold(eviction_threshold) => {
|
||||
assert_eq!(eviction_threshold.period, Duration::from_secs(20 * 60));
|
||||
assert_eq!(eviction_threshold.threshold, Duration::from_secs(20 * 60));
|
||||
}
|
||||
other => unreachable!("Unexpected eviction policy tenant settings: {other:?}"),
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_imitation_only_pageserver_config() {
|
||||
let tempdir = tempdir().unwrap();
|
||||
let (workdir, pg_distrib_dir) = prepare_fs(&tempdir).unwrap();
|
||||
|
||||
let pageserver_conf_toml = format!(
|
||||
r#"pg_distrib_dir = "{pg_distrib_dir}"
|
||||
metric_collection_endpoint = "http://sample.url"
|
||||
metric_collection_interval = "10min"
|
||||
id = 222
|
||||
|
||||
[tenant_config]
|
||||
evictions_low_residence_duration_metric_threshold = "20m"
|
||||
|
||||
[tenant_config.eviction_policy]
|
||||
kind = "OnlyImitiate"
|
||||
period = "20m"
|
||||
threshold = "20m"
|
||||
"#,
|
||||
);
|
||||
let toml: Document = pageserver_conf_toml.parse().unwrap();
|
||||
let conf = PageServerConf::parse_and_validate(&toml, &workdir).unwrap();
|
||||
|
||||
match &conf.default_tenant_conf.eviction_policy {
|
||||
EvictionPolicy::OnlyImitiate(t) => {
|
||||
assert_eq!(t.period, Duration::from_secs(20 * 60));
|
||||
assert_eq!(t.threshold, Duration::from_secs(20 * 60));
|
||||
}
|
||||
other => unreachable!("Unexpected eviction policy tenant settings: {other:?}"),
|
||||
}
|
||||
}
|
||||
|
||||
fn prepare_fs(tempdir: &Utf8TempDir) -> anyhow::Result<(Utf8PathBuf, Utf8PathBuf)> {
|
||||
let tempdir_path = tempdir.path();
|
||||
|
||||
|
||||
@@ -834,7 +834,6 @@ mod test {
|
||||
}
|
||||
|
||||
impl ControlPlaneGenerationsApi for MockControlPlane {
|
||||
#[allow(clippy::diverging_sub_expression)] // False positive via async_trait
|
||||
async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
|
||||
unimplemented!()
|
||||
}
|
||||
@@ -868,6 +867,7 @@ mod test {
|
||||
let remote_fs_dir = harness.conf.workdir.join("remote_fs").canonicalize_utf8()?;
|
||||
let storage_config = RemoteStorageConfig {
|
||||
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
|
||||
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
|
||||
};
|
||||
let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
|
||||
|
||||
@@ -1171,6 +1171,7 @@ pub(crate) mod mock {
|
||||
pub struct ConsumerState {
|
||||
rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,
|
||||
executor_rx: tokio::sync::mpsc::Receiver<DeleterMessage>,
|
||||
cancel: CancellationToken,
|
||||
}
|
||||
|
||||
impl ConsumerState {
|
||||
@@ -1184,7 +1185,7 @@ pub(crate) mod mock {
|
||||
match msg {
|
||||
DeleterMessage::Delete(objects) => {
|
||||
for path in objects {
|
||||
match remote_storage.delete(&path).await {
|
||||
match remote_storage.delete(&path, &self.cancel).await {
|
||||
Ok(_) => {
|
||||
debug!("Deleted {path}");
|
||||
}
|
||||
@@ -1217,7 +1218,7 @@ pub(crate) mod mock {
|
||||
|
||||
for path in objects {
|
||||
info!("Executing deletion {path}");
|
||||
match remote_storage.delete(&path).await {
|
||||
match remote_storage.delete(&path, &self.cancel).await {
|
||||
Ok(_) => {
|
||||
debug!("Deleted {path}");
|
||||
}
|
||||
@@ -1267,7 +1268,11 @@ pub(crate) mod mock {
|
||||
executor_tx,
|
||||
executed,
|
||||
remote_storage,
|
||||
consumer: std::sync::Mutex::new(ConsumerState { rx, executor_rx }),
|
||||
consumer: std::sync::Mutex::new(ConsumerState {
|
||||
rx,
|
||||
executor_rx,
|
||||
cancel: CancellationToken::new(),
|
||||
}),
|
||||
lsn_table: Arc::new(std::sync::RwLock::new(VisibleLsnUpdates::new())),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use remote_storage::RemotePath;
|
||||
use remote_storage::TimeoutOrCancel;
|
||||
use remote_storage::MAX_KEYS_PER_DELETE;
|
||||
use std::time::Duration;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -71,9 +72,11 @@ impl Deleter {
|
||||
Err(anyhow::anyhow!("failpoint: deletion-queue-before-execute"))
|
||||
});
|
||||
|
||||
self.remote_storage.delete_objects(&self.accumulator).await
|
||||
self.remote_storage
|
||||
.delete_objects(&self.accumulator, &self.cancel)
|
||||
.await
|
||||
},
|
||||
|_| false,
|
||||
TimeoutOrCancel::caused_by_cancel,
|
||||
3,
|
||||
10,
|
||||
"executing deletion batch",
|
||||
|
||||
@@ -351,7 +351,6 @@ pub enum IterationOutcome<U> {
|
||||
Finished(IterationOutcomeFinished<U>),
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct IterationOutcomeFinished<U> {
|
||||
/// The actual usage observed before we started the iteration.
|
||||
@@ -366,7 +365,6 @@ pub struct IterationOutcomeFinished<U> {
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
#[allow(dead_code)]
|
||||
struct AssumedUsage<U> {
|
||||
/// The expected value for `after`, after phase 2.
|
||||
projected_after: U,
|
||||
@@ -374,14 +372,12 @@ struct AssumedUsage<U> {
|
||||
failed: LayerCount,
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[derive(Debug, Serialize)]
|
||||
struct PlannedUsage<U> {
|
||||
respecting_tenant_min_resident_size: U,
|
||||
fallback_to_global_lru: Option<U>,
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[derive(Debug, Default, Serialize)]
|
||||
struct LayerCount {
|
||||
file_sizes: u64,
|
||||
@@ -565,7 +561,6 @@ pub(crate) struct EvictionSecondaryLayer {
|
||||
#[derive(Clone)]
|
||||
pub(crate) enum EvictionLayer {
|
||||
Attached(Layer),
|
||||
#[allow(dead_code)]
|
||||
Secondary(EvictionSecondaryLayer),
|
||||
}
|
||||
|
||||
@@ -1105,7 +1100,6 @@ mod filesystem_level_usage {
|
||||
use super::DiskUsageEvictionTaskConfig;
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
#[allow(dead_code)]
|
||||
pub struct Usage<'a> {
|
||||
config: &'a DiskUsageEvictionTaskConfig,
|
||||
|
||||
|
||||
@@ -1379,6 +1379,25 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/utilization:
|
||||
get:
|
||||
description: |
|
||||
Returns the pageservers current utilization and fitness score for new tenants.
|
||||
|
||||
responses:
|
||||
"200":
|
||||
description: Pageserver utilization and fitness score
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/PageserverUtilization"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
components:
|
||||
securitySchemes:
|
||||
JWT:
|
||||
@@ -1691,6 +1710,33 @@ components:
|
||||
type: string
|
||||
enum: [past, present, future, nodata]
|
||||
|
||||
PageserverUtilization:
|
||||
type: object
|
||||
required:
|
||||
- disk_usage_bytes
|
||||
- free_space_bytes
|
||||
- utilization_score
|
||||
properties:
|
||||
disk_usage_bytes:
|
||||
type: integer
|
||||
format: int64
|
||||
minimum: 0
|
||||
description: The amount of disk space currently utilized by layer files.
|
||||
free_space_bytes:
|
||||
type: integer
|
||||
format: int64
|
||||
minimum: 0
|
||||
description: The amount of usable disk space left.
|
||||
utilization_score:
|
||||
type: integer
|
||||
format: int64
|
||||
minimum: 0
|
||||
maximum: 9223372036854775807
|
||||
default: 9223372036854775807
|
||||
description: |
|
||||
Lower is better score for how good this pageserver would be for the next tenant.
|
||||
The default or maximum value can be returned in situations when a proper score cannot (yet) be calculated.
|
||||
|
||||
Error:
|
||||
type: object
|
||||
required:
|
||||
|
||||
@@ -83,12 +83,12 @@ use utils::{
|
||||
// This is not functionally necessary (clients will retry), but avoids generating a lot of
|
||||
// failed API calls while tenants are activating.
|
||||
#[cfg(not(feature = "testing"))]
|
||||
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
|
||||
pub(crate) const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
|
||||
|
||||
// Tests run on slow/oversubscribed nodes, and may need to wait much longer for tenants to
|
||||
// finish attaching, if calls to remote storage are slow.
|
||||
#[cfg(feature = "testing")]
|
||||
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
|
||||
pub(crate) const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
|
||||
|
||||
pub struct State {
|
||||
conf: &'static PageServerConf,
|
||||
@@ -100,6 +100,7 @@ pub struct State {
|
||||
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
secondary_controller: SecondaryController,
|
||||
latest_utilization: tokio::sync::Mutex<Option<(std::time::Instant, bytes::Bytes)>>,
|
||||
}
|
||||
|
||||
impl State {
|
||||
@@ -128,6 +129,7 @@ impl State {
|
||||
disk_usage_eviction_state,
|
||||
deletion_queue_client,
|
||||
secondary_controller,
|
||||
latest_utilization: Default::default(),
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -571,10 +573,16 @@ async fn timeline_list_handler(
|
||||
parse_query_param(&request, "force-await-initial-logical-size")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
|
||||
let response_data = async {
|
||||
let tenant = mgr::get_tenant(tenant_shard_id, true)?;
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id, false)?;
|
||||
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
let timelines = tenant.list_timelines();
|
||||
|
||||
let mut response_data = Vec::with_capacity(timelines.len());
|
||||
@@ -616,7 +624,7 @@ async fn timeline_preserve_initdb_handler(
|
||||
// location where timeline recreation cand find it.
|
||||
|
||||
async {
|
||||
let tenant = mgr::get_tenant(tenant_shard_id, true)?;
|
||||
let tenant = mgr::get_tenant(tenant_shard_id, false)?;
|
||||
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, false)
|
||||
@@ -1136,7 +1144,7 @@ async fn tenant_shard_split_handler(
|
||||
|
||||
let new_shards = state
|
||||
.tenant_manager
|
||||
.shard_split(tenant_shard_id, ShardCount(req.new_shard_count), &ctx)
|
||||
.shard_split(tenant_shard_id, ShardCount::new(req.new_shard_count), &ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
@@ -1951,11 +1959,60 @@ async fn put_io_engine_handler(
|
||||
mut r: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&r, None)?;
|
||||
let kind: crate::virtual_file::IoEngineKind = json_request(&mut r).await?;
|
||||
crate::virtual_file::io_engine::set(kind);
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
/// Polled by control plane.
|
||||
///
|
||||
/// See [`crate::utilization`].
|
||||
async fn get_utilization(
|
||||
r: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
// this probably could be completely public, but lets make that change later.
|
||||
check_permission(&r, None)?;
|
||||
|
||||
let state = get_state(&r);
|
||||
let mut g = state.latest_utilization.lock().await;
|
||||
|
||||
let regenerate_every = Duration::from_secs(1);
|
||||
let still_valid = g
|
||||
.as_ref()
|
||||
.is_some_and(|(captured_at, _)| captured_at.elapsed() < regenerate_every);
|
||||
|
||||
// avoid needless statvfs calls even though those should be non-blocking fast.
|
||||
// regenerate at most 1Hz to allow polling at any rate.
|
||||
if !still_valid {
|
||||
let path = state.conf.tenants_path();
|
||||
let doc = crate::utilization::regenerate(path.as_std_path())
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
let mut buf = Vec::new();
|
||||
serde_json::to_writer(&mut buf, &doc)
|
||||
.context("serialize")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
let body = bytes::Bytes::from(buf);
|
||||
|
||||
*g = Some((std::time::Instant::now(), body));
|
||||
}
|
||||
|
||||
// hyper 0.14 doesn't yet have Response::clone so this is a bit of extra legwork
|
||||
let cached = g.as_ref().expect("just set").1.clone();
|
||||
|
||||
Response::builder()
|
||||
.header(hyper::http::header::CONTENT_TYPE, "application/json")
|
||||
// thought of using http date header, but that is second precision which does not give any
|
||||
// debugging aid
|
||||
.status(StatusCode::OK)
|
||||
.body(hyper::Body::from(cached))
|
||||
.context("build response")
|
||||
.map_err(ApiError::InternalServerError)
|
||||
}
|
||||
|
||||
/// Common functionality of all the HTTP API handlers.
|
||||
///
|
||||
/// - Adds a tracing span to each request (by `request_span`)
|
||||
@@ -2214,8 +2271,9 @@ pub fn make_router(
|
||||
)
|
||||
.get(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/keyspace",
|
||||
|r| testing_api_handler("read out the keyspace", r, timeline_collect_keyspace),
|
||||
|r| api_handler(r, timeline_collect_keyspace),
|
||||
)
|
||||
.put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
|
||||
.get("/v1/utilization", |r| api_handler(r, get_utilization))
|
||||
.any(handler_404))
|
||||
}
|
||||
|
||||
@@ -22,6 +22,7 @@ pub(crate) mod statvfs;
|
||||
pub mod task_mgr;
|
||||
pub mod tenant;
|
||||
pub mod trace;
|
||||
pub mod utilization;
|
||||
pub mod virtual_file;
|
||||
pub mod walingest;
|
||||
pub mod walrecord;
|
||||
@@ -168,15 +169,6 @@ pub fn is_delete_mark(path: &Utf8Path) -> bool {
|
||||
ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX)
|
||||
}
|
||||
|
||||
fn is_walkdir_io_not_found(e: &walkdir::Error) -> bool {
|
||||
if let Some(e) = e.io_error() {
|
||||
if e.kind() == std::io::ErrorKind::NotFound {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
/// During pageserver startup, we need to order operations not to exhaust tokio worker threads by
|
||||
/// blocking.
|
||||
///
|
||||
|
||||
@@ -4,8 +4,8 @@ use metrics::{
|
||||
register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
|
||||
register_int_counter, register_int_counter_pair_vec, register_int_counter_vec,
|
||||
register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec,
|
||||
Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPairVec,
|
||||
IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
|
||||
Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair,
|
||||
IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
@@ -1266,13 +1266,12 @@ pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
|
||||
// remote storage metrics
|
||||
|
||||
/// NB: increment _after_ recording the current value into [`REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST`].
|
||||
static REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
"pageserver_remote_timeline_client_calls_unfinished",
|
||||
"Number of ongoing calls to remote timeline client. \
|
||||
Used to populate pageserver_remote_timeline_client_calls_started. \
|
||||
This metric is not useful for sampling from Prometheus, but useful in tests.",
|
||||
static REMOTE_TIMELINE_CLIENT_CALLS: Lazy<IntCounterPairVec> = Lazy::new(|| {
|
||||
register_int_counter_pair_vec!(
|
||||
"pageserver_remote_timeline_client_calls_started",
|
||||
"Number of started calls to remote timeline client.",
|
||||
"pageserver_remote_timeline_client_calls_finished",
|
||||
"Number of finshed calls to remote timeline client.",
|
||||
&[
|
||||
"tenant_id",
|
||||
"shard_id",
|
||||
@@ -1281,23 +1280,7 @@ static REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE: Lazy<IntGaugeVec> = Lazy::
|
||||
"op_kind"
|
||||
],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_remote_timeline_client_calls_started",
|
||||
"When calling a remote timeline client method, we record the current value \
|
||||
of the calls_unfinished gauge in this histogram. Plot the histogram \
|
||||
over time in a heatmap to visualize how many operations were ongoing \
|
||||
at a given instant. It gives you a better idea of the queue depth \
|
||||
than plotting the gauge directly, since operations may complete faster \
|
||||
than the sampling interval.",
|
||||
&["file_kind", "op_kind"],
|
||||
// The calls_unfinished gauge is an integer gauge, hence we have integer buckets.
|
||||
vec![0.0, 1.0, 2.0, 4.0, 6.0, 8.0, 10.0, 15.0, 20.0, 40.0, 60.0, 80.0, 100.0, 500.0],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
.unwrap()
|
||||
});
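This hunk replaces the `calls_unfinished` gauge and the `calls_started` histogram with a single started/finished counter pair; the in-flight count can still be derived at query time as `started - finished`. A hedged sketch of the pattern, assuming the crate-local `metrics` helpers used above (`register_int_counter_pair_vec!`, `IntCounterPairVec`) and an illustrative metric name:

```rust
use metrics::{register_int_counter_pair_vec, IntCounterPairVec};
use once_cell::sync::Lazy;

static EXAMPLE_CALLS: Lazy<IntCounterPairVec> = Lazy::new(|| {
    register_int_counter_pair_vec!(
        "example_calls_started",
        "Number of started example calls.",
        "example_calls_finished",
        "Number of finished example calls.",
        &["op_kind"]
    )
    .expect("failed to define a metric")
});

fn track_example_call() {
    let pair = EXAMPLE_CALLS
        .get_metric_with_label_values(&["upload"])
        .expect("label values match the declared labels");
    pair.inc(); // bumps the "started" counter
    // ... perform the operation ...
    pair.dec(); // bumps the "finished" counter
}
```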
|
||||
|
||||
static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy<IntCounterVec> =
|
||||
@@ -2078,7 +2061,7 @@ pub(crate) struct RemoteTimelineClientMetrics {
|
||||
shard_id: String,
|
||||
timeline_id: String,
|
||||
remote_physical_size_gauge: Mutex<Option<PerTimelineRemotePhysicalSizeGauge>>,
|
||||
calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
|
||||
calls: Mutex<HashMap<(&'static str, &'static str), IntCounterPair>>,
|
||||
bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
|
||||
bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
|
||||
}
|
||||
@@ -2089,7 +2072,7 @@ impl RemoteTimelineClientMetrics {
|
||||
tenant_id: tenant_shard_id.tenant_id.to_string(),
|
||||
shard_id: format!("{}", tenant_shard_id.shard_slug()),
|
||||
timeline_id: timeline_id.to_string(),
|
||||
calls_unfinished_gauge: Mutex::new(HashMap::default()),
|
||||
calls: Mutex::new(HashMap::default()),
|
||||
bytes_started_counter: Mutex::new(HashMap::default()),
|
||||
bytes_finished_counter: Mutex::new(HashMap::default()),
|
||||
remote_physical_size_gauge: Mutex::new(None),
|
||||
@@ -2129,15 +2112,15 @@ impl RemoteTimelineClientMetrics {
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn calls_unfinished_gauge(
|
||||
fn calls_counter_pair(
|
||||
&self,
|
||||
file_kind: &RemoteOpFileKind,
|
||||
op_kind: &RemoteOpKind,
|
||||
) -> IntGauge {
|
||||
let mut guard = self.calls_unfinished_gauge.lock().unwrap();
|
||||
) -> IntCounterPair {
|
||||
let mut guard = self.calls.lock().unwrap();
|
||||
let key = (file_kind.as_str(), op_kind.as_str());
|
||||
let metric = guard.entry(key).or_insert_with(move || {
|
||||
REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE
|
||||
REMOTE_TIMELINE_CLIENT_CALLS
|
||||
.get_metric_with_label_values(&[
|
||||
&self.tenant_id,
|
||||
&self.shard_id,
|
||||
@@ -2150,17 +2133,6 @@ impl RemoteTimelineClientMetrics {
|
||||
metric.clone()
|
||||
}
|
||||
|
||||
fn calls_started_hist(
|
||||
&self,
|
||||
file_kind: &RemoteOpFileKind,
|
||||
op_kind: &RemoteOpKind,
|
||||
) -> Histogram {
|
||||
let key = (file_kind.as_str(), op_kind.as_str());
|
||||
REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST
|
||||
.get_metric_with_label_values(&[key.0, key.1])
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn bytes_started_counter(
|
||||
&self,
|
||||
file_kind: &RemoteOpFileKind,
|
||||
@@ -2231,7 +2203,7 @@ impl RemoteTimelineClientMetrics {
|
||||
#[must_use]
|
||||
pub(crate) struct RemoteTimelineClientCallMetricGuard {
|
||||
/// Decremented on drop.
|
||||
calls_unfinished_metric: Option<IntGauge>,
|
||||
calls_counter_pair: Option<IntCounterPair>,
|
||||
/// If Some(), this references the bytes_finished metric, and we increment it by the given `u64` on drop.
|
||||
bytes_finished: Option<(IntCounter, u64)>,
|
||||
}
|
||||
@@ -2241,10 +2213,10 @@ impl RemoteTimelineClientCallMetricGuard {
|
||||
/// The caller vouches to do the metric updates manually.
|
||||
pub fn will_decrement_manually(mut self) {
|
||||
let RemoteTimelineClientCallMetricGuard {
|
||||
calls_unfinished_metric,
|
||||
calls_counter_pair,
|
||||
bytes_finished,
|
||||
} = &mut self;
|
||||
calls_unfinished_metric.take();
|
||||
calls_counter_pair.take();
|
||||
bytes_finished.take();
|
||||
}
|
||||
}
|
||||
@@ -2252,10 +2224,10 @@ impl RemoteTimelineClientCallMetricGuard {
|
||||
impl Drop for RemoteTimelineClientCallMetricGuard {
|
||||
fn drop(&mut self) {
|
||||
let RemoteTimelineClientCallMetricGuard {
|
||||
calls_unfinished_metric,
|
||||
calls_counter_pair,
|
||||
bytes_finished,
|
||||
} = self;
|
||||
if let Some(guard) = calls_unfinished_metric.take() {
|
||||
if let Some(guard) = calls_counter_pair.take() {
|
||||
guard.dec();
|
||||
}
|
||||
if let Some((bytes_finished_metric, value)) = bytes_finished {
|
||||
@@ -2288,10 +2260,8 @@ impl RemoteTimelineClientMetrics {
|
||||
op_kind: &RemoteOpKind,
|
||||
size: RemoteTimelineClientMetricsCallTrackSize,
|
||||
) -> RemoteTimelineClientCallMetricGuard {
|
||||
let calls_unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
|
||||
self.calls_started_hist(file_kind, op_kind)
|
||||
.observe(calls_unfinished_metric.get() as f64);
|
||||
calls_unfinished_metric.inc(); // NB: inc after the histogram, see comment on underlying metric
|
||||
let calls_counter_pair = self.calls_counter_pair(file_kind, op_kind);
|
||||
calls_counter_pair.inc();
|
||||
|
||||
let bytes_finished = match size {
|
||||
RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => {
|
||||
@@ -2305,7 +2275,7 @@ impl RemoteTimelineClientMetrics {
|
||||
}
|
||||
};
|
||||
RemoteTimelineClientCallMetricGuard {
|
||||
calls_unfinished_metric: Some(calls_unfinished_metric),
|
||||
calls_counter_pair: Some(calls_counter_pair),
|
||||
bytes_finished,
|
||||
}
|
||||
}
|
||||
@@ -2319,12 +2289,8 @@ impl RemoteTimelineClientMetrics {
|
||||
op_kind: &RemoteOpKind,
|
||||
size: RemoteTimelineClientMetricsCallTrackSize,
|
||||
) {
|
||||
let calls_unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
|
||||
debug_assert!(
|
||||
calls_unfinished_metric.get() > 0,
|
||||
"begin and end should cancel out"
|
||||
);
|
||||
calls_unfinished_metric.dec();
|
||||
let calls_counter_pair = self.calls_counter_pair(file_kind, op_kind);
|
||||
calls_counter_pair.dec();
|
||||
match size {
|
||||
RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => {}
|
||||
RemoteTimelineClientMetricsCallTrackSize::Bytes(size) => {
|
||||
@@ -2341,18 +2307,15 @@ impl Drop for RemoteTimelineClientMetrics {
|
||||
shard_id,
|
||||
timeline_id,
|
||||
remote_physical_size_gauge,
|
||||
calls_unfinished_gauge,
|
||||
calls,
|
||||
bytes_started_counter,
|
||||
bytes_finished_counter,
|
||||
} = self;
|
||||
for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() {
|
||||
let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
a,
|
||||
b,
|
||||
]);
|
||||
for ((a, b), _) in calls.get_mut().unwrap().drain() {
|
||||
let mut res = [Ok(()), Ok(())];
|
||||
REMOTE_TIMELINE_CLIENT_CALLS
|
||||
.remove_label_values(&mut res, &[tenant_id, shard_id, timeline_id, a, b]);
|
||||
// don't care about results
|
||||
}
|
||||
for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() {
|
||||
let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[
|
||||
@@ -2496,6 +2459,56 @@ pub mod tokio_epoll_uring {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) mod tenant_throttling {
|
||||
use metrics::{register_int_counter_vec, IntCounter};
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
use crate::tenant::{self, throttle::Metric};
|
||||
|
||||
pub(crate) struct TimelineGet {
|
||||
wait_time: IntCounter,
|
||||
count: IntCounter,
|
||||
}
|
||||
|
||||
pub(crate) static TIMELINE_GET: Lazy<TimelineGet> = Lazy::new(|| {
|
||||
static WAIT_USECS: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_tenant_throttling_wait_usecs_sum_global",
|
||||
"Sum of microseconds that tenants spent waiting for a tenant throttle of a given kind.",
|
||||
&["kind"]
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
static WAIT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_tenant_throttling_count_global",
|
||||
"Count of tenant throttlings, by kind of throttle.",
|
||||
&["kind"]
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
let kind = "timeline_get";
|
||||
TimelineGet {
|
||||
wait_time: WAIT_USECS.with_label_values(&[kind]),
|
||||
count: WAIT_COUNT.with_label_values(&[kind]),
|
||||
}
|
||||
});
|
||||
|
||||
impl Metric for &'static TimelineGet {
|
||||
#[inline(always)]
|
||||
fn observe_throttling(
|
||||
&self,
|
||||
tenant::throttle::Observation { wait_time }: &tenant::throttle::Observation,
|
||||
) {
|
||||
let val = u64::try_from(wait_time.as_micros()).unwrap();
|
||||
self.wait_time.inc_by(val);
|
||||
self.count.inc();
|
||||
}
|
||||
}
|
||||
}
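A small illustration of what `observe_throttling` accumulates: total wait in microseconds plus an event count, so the average wait per throttled request can be derived at query time as `wait_usecs_sum / count`. The function below is a plain-Rust sketch with illustrative names, not the pageserver's code:

```rust
use std::time::Duration;

fn record_throttle(wait_time: Duration, wait_usecs_total: &mut u64, count_total: &mut u64) {
    // Duration::as_micros() returns u128; the counter is u64, hence the checked conversion.
    *wait_usecs_total += u64::try_from(wait_time.as_micros()).expect("wait fits into u64 microseconds");
    *count_total += 1;
}
```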
|
||||
|
||||
pub fn preinitialize_metrics() {
|
||||
// Python tests need these and on some we do alerting.
|
||||
//
|
||||
@@ -2557,4 +2570,5 @@ pub fn preinitialize_metrics() {
|
||||
|
||||
// Custom
|
||||
Lazy::force(&RECONSTRUCT_TIME);
|
||||
Lazy::force(&tenant_throttling::TIMELINE_GET);
|
||||
}
|
||||
|
||||
@@ -26,7 +26,7 @@ use pageserver_api::models::{
|
||||
PagestreamNblocksResponse,
|
||||
};
|
||||
use pageserver_api::shard::ShardIndex;
|
||||
use pageserver_api::shard::{ShardCount, ShardNumber};
|
||||
use pageserver_api::shard::ShardNumber;
|
||||
use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, QueryError};
|
||||
use pq_proto::framed::ConnectionError;
|
||||
use pq_proto::FeStartupPacket;
|
||||
@@ -998,7 +998,7 @@ impl PageServerHandler {
|
||||
) -> Result<&Arc<Timeline>, Key> {
|
||||
let key = if let Some((first_idx, first_timeline)) = self.shard_timelines.iter().next() {
|
||||
// Fastest path: single sharded case
|
||||
if first_idx.shard_count < ShardCount(2) {
|
||||
if first_idx.shard_count.count() == 1 {
|
||||
return Ok(&first_timeline.timeline);
|
||||
}
|
||||
|
||||
|
||||
@@ -15,6 +15,7 @@ use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::{ensure, Context};
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
use enum_map::Enum;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::{
|
||||
dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key,
|
||||
rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
|
||||
@@ -156,6 +157,7 @@ impl Timeline {
|
||||
pending_updates: HashMap::new(),
|
||||
pending_deletions: Vec::new(),
|
||||
pending_nblocks: 0,
|
||||
pending_aux_files: None,
|
||||
pending_directory_entries: Vec::new(),
|
||||
lsn,
|
||||
}
|
||||
@@ -870,6 +872,14 @@ pub struct DatadirModification<'a> {
|
||||
pending_updates: HashMap<Key, Vec<(Lsn, Value)>>,
|
||||
pending_deletions: Vec<(Range<Key>, Lsn)>,
|
||||
pending_nblocks: i64,
|
||||
|
||||
// If we already wrote any aux file changes in this modification, stash the latest dir. If set,
|
||||
// [`Self::put_file`] may assume that it is safe to emit a delta rather than checking
|
||||
// if AUX_FILES_KEY is already set.
|
||||
pending_aux_files: Option<AuxFilesDirectory>,
|
||||
|
||||
/// For special "directory" keys that store key-value maps, track the size of the map
|
||||
/// if it was updated in this modification.
|
||||
pending_directory_entries: Vec<(DirectoryKind, usize)>,
|
||||
}
|
||||
|
||||
@@ -1384,31 +1394,76 @@ impl<'a> DatadirModification<'a> {
|
||||
content: &[u8],
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut dir = match self.get(AUX_FILES_KEY, ctx).await {
|
||||
Ok(buf) => AuxFilesDirectory::des(&buf)?,
|
||||
Err(e) => {
|
||||
// This is expected: historical databases do not have the key.
|
||||
debug!("Failed to get info about AUX files: {}", e);
|
||||
AuxFilesDirectory {
|
||||
files: HashMap::new(),
|
||||
let file_path = path.to_string();
|
||||
let content = if content.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(Bytes::copy_from_slice(content))
|
||||
};
|
||||
|
||||
let dir = if let Some(mut dir) = self.pending_aux_files.take() {
|
||||
// We already updated aux files in `self`: emit a delta and update our latest value
|
||||
|
||||
self.put(
|
||||
AUX_FILES_KEY,
|
||||
Value::WalRecord(NeonWalRecord::AuxFile {
|
||||
file_path: file_path.clone(),
|
||||
content: content.clone(),
|
||||
}),
|
||||
);
|
||||
|
||||
dir.upsert(file_path, content);
|
||||
dir
|
||||
} else {
|
||||
// Check if the AUX_FILES_KEY is initialized
|
||||
match self.get(AUX_FILES_KEY, ctx).await {
|
||||
Ok(dir_bytes) => {
|
||||
let mut dir = AuxFilesDirectory::des(&dir_bytes)?;
|
||||
// Key is already set, we may append a delta
|
||||
self.put(
|
||||
AUX_FILES_KEY,
|
||||
Value::WalRecord(NeonWalRecord::AuxFile {
|
||||
file_path: file_path.clone(),
|
||||
content: content.clone(),
|
||||
}),
|
||||
);
|
||||
dir.upsert(file_path, content);
|
||||
dir
|
||||
}
|
||||
Err(
|
||||
e @ (PageReconstructError::AncestorStopping(_)
|
||||
| PageReconstructError::Cancelled
|
||||
| PageReconstructError::AncestorLsnTimeout(_)),
|
||||
) => {
|
||||
// Important that we do not interpret a shutdown error as "not found" and thereby
|
||||
// reset the map.
|
||||
return Err(e.into());
|
||||
}
|
||||
// FIXME: PageReconstructError doesn't have an explicit variant for key-not-found, so
|
||||
// we are assuming that all _other_ possible errors represents a missing key. If some
|
||||
// other error occurs, we may incorrectly reset the map of aux files.
|
||||
Err(PageReconstructError::Other(_) | PageReconstructError::WalRedo(_)) => {
|
||||
// Key is missing, we must insert an image as the basis for subsequent deltas.
|
||||
|
||||
let mut dir = AuxFilesDirectory {
|
||||
files: HashMap::new(),
|
||||
};
|
||||
dir.upsert(file_path, content);
|
||||
self.put(
|
||||
AUX_FILES_KEY,
|
||||
Value::Image(Bytes::from(
|
||||
AuxFilesDirectory::ser(&dir).context("serialize")?,
|
||||
)),
|
||||
);
|
||||
dir
|
||||
}
|
||||
}
|
||||
};
|
||||
let path = path.to_string();
|
||||
if content.is_empty() {
|
||||
dir.files.remove(&path);
|
||||
} else {
|
||||
dir.files.insert(path, Bytes::copy_from_slice(content));
|
||||
}
|
||||
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::AuxFiles, dir.files.len()));
|
||||
self.pending_aux_files = Some(dir);
|
||||
|
||||
self.put(
|
||||
AUX_FILES_KEY,
|
||||
Value::Image(Bytes::from(
|
||||
AuxFilesDirectory::ser(&dir).context("serialize")?,
|
||||
)),
|
||||
);
|
||||
Ok(())
|
||||
}
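The branching above decides between emitting an incremental `NeonWalRecord::AuxFile` delta and writing a full `AuxFilesDirectory` image, while shutdown-style reconstruct errors must propagate rather than be mistaken for "key missing". A compact, simplified sketch of that decision (not the actual code):

```rust
/// Sketch of put_file's write choice. Shutdown-like errors (Cancelled,
/// AncestorStopping, AncestorLsnTimeout) are returned to the caller before
/// this decision is ever reached.
fn aux_write_kind(stashed_in_this_modification: bool, aux_key_exists: bool) -> &'static str {
    match (stashed_in_this_modification, aux_key_exists) {
        // The directory was already written earlier in this modification: a delta is safe.
        (true, _) => "delta",
        // AUX_FILES_KEY already has an image or delta chain: append another delta.
        (false, true) => "delta",
        // Key missing: write a full image as the basis for later deltas.
        (false, false) => "full image",
    }
}
```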
|
||||
|
||||
@@ -1438,7 +1493,7 @@ impl<'a> DatadirModification<'a> {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let writer = self.tline.writer().await;
|
||||
let mut writer = self.tline.writer().await;
|
||||
|
||||
// Flush relation and SLRU data blocks, keep metadata.
|
||||
let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
|
||||
@@ -1477,13 +1532,23 @@ impl<'a> DatadirModification<'a> {
|
||||
/// All the modifications in this atomic update are stamped by the specified LSN.
|
||||
///
|
||||
pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
let writer = self.tline.writer().await;
|
||||
let mut writer = self.tline.writer().await;
|
||||
|
||||
let pending_nblocks = self.pending_nblocks;
|
||||
self.pending_nblocks = 0;
|
||||
|
||||
if !self.pending_updates.is_empty() {
|
||||
writer.put_batch(&self.pending_updates, ctx).await?;
|
||||
let prev_pending_updates = std::mem::take(&mut self.pending_updates);
|
||||
|
||||
// The put_batch call below expects the inputs to be sorted by Lsn,
|
||||
// so we do that first.
|
||||
let lsn_ordered_batch: Vec<(Key, Lsn, Value)> = prev_pending_updates
|
||||
.into_iter()
|
||||
.map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (key, lsn, val)))
|
||||
.kmerge_by(|lhs, rhs| lhs.1 .0 < rhs.1 .0)
|
||||
.collect();
|
||||
|
||||
writer.put_batch(lsn_ordered_batch, ctx).await?;
|
||||
self.pending_updates.clear();
|
||||
}
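The k-way merge above relies on each per-key `Vec<(Lsn, Value)>` already being in Lsn order, so merging the per-key iterators yields a globally Lsn-ordered batch. A self-contained sketch with simplified types (`u64` standing in for `Lsn`, `String` for `Value`):

```rust
use itertools::Itertools;
use std::collections::HashMap;

fn lsn_ordered(pending: HashMap<&'static str, Vec<(u64, String)>>) -> Vec<(&'static str, u64, String)> {
    pending
        .into_iter()
        .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (key, lsn, val)))
        // Each per-key vec is already Lsn-ordered, so a k-way merge gives a global Lsn order.
        .kmerge_by(|lhs, rhs| lhs.1 < rhs.1)
        .collect()
}
```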
|
||||
|
||||
@@ -1618,8 +1683,18 @@ struct RelDirectory {
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Default)]
|
||||
struct AuxFilesDirectory {
|
||||
files: HashMap<String, Bytes>,
|
||||
pub(crate) struct AuxFilesDirectory {
|
||||
pub(crate) files: HashMap<String, Bytes>,
|
||||
}
|
||||
|
||||
impl AuxFilesDirectory {
|
||||
pub(crate) fn upsert(&mut self, key: String, value: Option<Bytes>) {
|
||||
if let Some(value) = value {
|
||||
self.files.insert(key, value);
|
||||
} else {
|
||||
self.files.remove(&key);
|
||||
}
|
||||
}
|
||||
}
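A short usage example of the `upsert` helper defined above, assuming the `AuxFilesDirectory` type from this diff: `Some(_)` inserts or overwrites, `None` removes, which is how an empty-content `put_file` acts as a delete.

```rust
use bytes::Bytes;
use std::collections::HashMap;

fn upsert_example() {
    let mut dir = AuxFilesDirectory { files: HashMap::new() };
    dir.upsert("pg_stat/foo".to_string(), Some(Bytes::from_static(b"v1")));
    assert_eq!(dir.files.len(), 1);
    // Passing None removes the entry again.
    dir.upsert("pg_stat/foo".to_string(), None);
    assert!(dir.files.is_empty());
}
```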
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
@@ -1655,8 +1730,60 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
|
||||
#[allow(clippy::bool_assert_comparison)]
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
//use super::repo_harness::*;
|
||||
//use super::*;
|
||||
use hex_literal::hex;
|
||||
use utils::id::TimelineId;
|
||||
|
||||
use super::*;
|
||||
|
||||
use crate::{tenant::harness::TenantHarness, DEFAULT_PG_VERSION};
|
||||
|
||||
/// Test a round trip of aux file updates, from DatadirModification to reading back from the Timeline
|
||||
#[tokio::test]
|
||||
async fn aux_files_round_trip() -> anyhow::Result<()> {
|
||||
let name = "aux_files_round_trip";
|
||||
let harness = TenantHarness::create(name)?;
|
||||
|
||||
pub const TIMELINE_ID: TimelineId =
|
||||
TimelineId::from_array(hex!("11223344556677881122334455667788"));
|
||||
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
let tline = tline.raw_timeline().unwrap();
|
||||
|
||||
// First modification: insert two keys
|
||||
let mut modification = tline.begin_modification(Lsn(0x1000));
|
||||
modification.put_file("foo/bar1", b"content1", &ctx).await?;
|
||||
modification.set_lsn(Lsn(0x1008))?;
|
||||
modification.put_file("foo/bar2", b"content2", &ctx).await?;
|
||||
modification.commit(&ctx).await?;
|
||||
let expect_1008 = HashMap::from([
|
||||
("foo/bar1".to_string(), Bytes::from_static(b"content1")),
|
||||
("foo/bar2".to_string(), Bytes::from_static(b"content2")),
|
||||
]);
|
||||
|
||||
let readback = tline.list_aux_files(Lsn(0x1008), &ctx).await?;
|
||||
assert_eq!(readback, expect_1008);
|
||||
|
||||
// Second modification: update one key, remove the other
|
||||
let mut modification = tline.begin_modification(Lsn(0x2000));
|
||||
modification.put_file("foo/bar1", b"content3", &ctx).await?;
|
||||
modification.set_lsn(Lsn(0x2008))?;
|
||||
modification.put_file("foo/bar2", b"", &ctx).await?;
|
||||
modification.commit(&ctx).await?;
|
||||
let expect_2008 =
|
||||
HashMap::from([("foo/bar1".to_string(), Bytes::from_static(b"content3"))]);
|
||||
|
||||
let readback = tline.list_aux_files(Lsn(0x2008), &ctx).await?;
|
||||
assert_eq!(readback, expect_2008);
|
||||
|
||||
// Reading back in time works
|
||||
let readback = tline.list_aux_files(Lsn(0x1008), &ctx).await?;
|
||||
assert_eq!(readback, expect_1008);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/*
|
||||
fn assert_current_logical_size<R: Repository>(timeline: &DatadirTimeline<R>, lsn: Lsn) {
|
||||
|
||||
@@ -30,10 +30,6 @@
|
||||
//! only a single tenant or timeline.
|
||||
//!
|
||||
|
||||
// Clippy 1.60 incorrectly complains about the tokio::task_local!() macro.
|
||||
// Silence it. See https://github.com/rust-lang/rust-clippy/issues/9224.
|
||||
#![allow(clippy::declare_interior_mutable_const)]
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::future::Future;
|
||||
@@ -192,6 +188,7 @@ task_local! {
|
||||
serde::Serialize,
|
||||
serde::Deserialize,
|
||||
strum_macros::IntoStaticStr,
|
||||
strum_macros::EnumString,
|
||||
)]
|
||||
pub enum TaskKind {
|
||||
// Pageserver startup, i.e., `main`
|
||||
@@ -312,7 +309,6 @@ struct MutableTaskState {
|
||||
}
|
||||
|
||||
struct PageServerTask {
|
||||
#[allow(dead_code)] // unused currently
|
||||
task_id: PageserverTaskId,
|
||||
|
||||
kind: TaskKind,
|
||||
|
||||
File diff suppressed because it is too large
@@ -9,8 +9,8 @@
|
||||
//! may lead to a data loss.
|
||||
//!
|
||||
use anyhow::bail;
|
||||
use pageserver_api::models;
|
||||
use pageserver_api::models::EvictionPolicy;
|
||||
use pageserver_api::models::{self, ThrottleConfig};
|
||||
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
|
||||
use serde::de::IntoDeserializer;
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -251,7 +251,7 @@ impl LocationConf {
|
||||
} else {
|
||||
ShardIdentity::new(
|
||||
ShardNumber(conf.shard_number),
|
||||
ShardCount(conf.shard_count),
|
||||
ShardCount::new(conf.shard_count),
|
||||
ShardStripeSize(conf.shard_stripe_size),
|
||||
)?
|
||||
};
|
||||
@@ -285,7 +285,7 @@ impl Default for LocationConf {
|
||||
///
|
||||
/// For storing and transmitting individual tenant's configuration, see
|
||||
/// TenantConfOpt.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct TenantConf {
|
||||
// Flush out an inmemory layer, if it's holding WAL older than this
|
||||
// This puts a backstop on how much WAL needs to be re-digested if the
|
||||
@@ -348,11 +348,13 @@ pub struct TenantConf {
|
||||
|
||||
/// If true then SLRU segments are downloaded on demand, if false SLRU segments are included in basebackup
|
||||
pub lazy_slru_download: bool,
|
||||
|
||||
pub timeline_get_throttle: pageserver_api::models::ThrottleConfig,
|
||||
}
|
||||
|
||||
/// Same as TenantConf, but this struct preserves the information about
|
||||
/// which parameters are set and which are not.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
|
||||
pub struct TenantConfOpt {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(default)]
|
||||
@@ -437,6 +439,9 @@ pub struct TenantConfOpt {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(default)]
|
||||
pub lazy_slru_download: Option<bool>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub timeline_get_throttle: Option<pageserver_api::models::ThrottleConfig>,
|
||||
}
|
||||
|
||||
impl TenantConfOpt {
|
||||
@@ -485,6 +490,10 @@ impl TenantConfOpt {
|
||||
lazy_slru_download: self
|
||||
.lazy_slru_download
|
||||
.unwrap_or(global_conf.lazy_slru_download),
|
||||
timeline_get_throttle: self
|
||||
.timeline_get_throttle
|
||||
.clone()
|
||||
.unwrap_or(global_conf.timeline_get_throttle),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -524,6 +533,7 @@ impl Default for TenantConf {
|
||||
gc_feedback: false,
|
||||
heatmap_period: Duration::ZERO,
|
||||
lazy_slru_download: false,
|
||||
timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -596,6 +606,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
|
||||
gc_feedback: value.gc_feedback,
|
||||
heatmap_period: value.heatmap_period.map(humantime),
|
||||
lazy_slru_download: value.lazy_slru_download,
|
||||
timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::sync::Arc;
|
||||
use anyhow::Context;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use pageserver_api::{models::TenantState, shard::TenantShardId};
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath};
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel};
|
||||
use tokio::sync::OwnedMutexGuard;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{error, instrument, Instrument};
|
||||
@@ -84,17 +84,17 @@ async fn create_remote_delete_mark(
|
||||
let data = bytes::Bytes::from_static(data);
|
||||
let stream = futures::stream::once(futures::future::ready(Ok(data)));
|
||||
remote_storage
|
||||
.upload(stream, 0, &remote_mark_path, None)
|
||||
.upload(stream, 0, &remote_mark_path, None, cancel)
|
||||
.await
|
||||
},
|
||||
|_e| false,
|
||||
TimeoutOrCancel::caused_by_cancel,
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"mark_upload",
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
.ok_or_else(|| anyhow::anyhow!("Cancelled"))
|
||||
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
|
||||
.and_then(|x| x)
|
||||
.context("mark_upload")?;
|
||||
|
||||
@@ -184,15 +184,15 @@ async fn remove_tenant_remote_delete_mark(
|
||||
if let Some(remote_storage) = remote_storage {
|
||||
let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
|
||||
backoff::retry(
|
||||
|| async { remote_storage.delete(&path).await },
|
||||
|_e| false,
|
||||
|| async { remote_storage.delete(&path, cancel).await },
|
||||
TimeoutOrCancel::caused_by_cancel,
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"remove_tenant_remote_delete_mark",
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
.ok_or_else(|| anyhow::anyhow!("Cancelled"))
|
||||
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
|
||||
.and_then(|x| x)
|
||||
.context("remove_tenant_remote_delete_mark")?;
|
||||
}
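Both retry blocks above end with the same flattening step: `backoff::retry` yields `Option<Result<T>>`, where `None` means the cancellation token fired before the operation succeeded. A minimal sketch of that flattening (the error value is illustrative; the real code wraps `TimeoutOrCancel::Cancel`):

```rust
fn flatten_retry_outcome<T>(outcome: Option<anyhow::Result<T>>) -> anyhow::Result<T> {
    outcome
        // None: we gave up because of cancellation, not because the operation failed.
        .ok_or_else(|| anyhow::anyhow!("cancelled"))
        // Some(Err(_)): a real failure from the last attempt.
        .and_then(|x| x)
}
```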
|
||||
@@ -246,6 +246,8 @@ async fn cleanup_remaining_fs_traces(
|
||||
|
||||
rm(conf.tenant_deleted_mark_file_path(tenant_shard_id), false).await?;
|
||||
|
||||
rm(conf.tenant_heatmap_path(tenant_shard_id), false).await?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-remove-tenant-dir"
|
||||
|
||||
@@ -36,7 +36,6 @@ use crate::{
|
||||
pub const VALUE_SZ: usize = 5;
|
||||
pub const MAX_VALUE: u64 = 0x007f_ffff_ffff;
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub const PAGE_SZ: usize = 8192;
|
||||
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
|
||||
@@ -6,6 +6,7 @@ use crate::context::RequestContext;
|
||||
use crate::page_cache::{self, PAGE_SZ};
|
||||
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
use bytes::BytesMut;
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::cmp::min;
|
||||
@@ -26,7 +27,10 @@ pub struct EphemeralFile {
|
||||
/// An ephemeral file is append-only.
|
||||
/// We keep the last page, which can still be modified, in [`Self::mutable_tail`].
|
||||
/// The other pages, which can no longer be modified, are accessed through the page cache.
|
||||
mutable_tail: [u8; PAGE_SZ],
|
||||
///
|
||||
/// None <=> IO is ongoing.
|
||||
/// Size is fixed to PAGE_SZ at creation time and must not be changed.
|
||||
mutable_tail: Option<BytesMut>,
|
||||
}
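Switching `mutable_tail` to `Option<BytesMut>` enables the take/IO/put-back dance used further down in this diff: the buffer is moved out (leaving `None` to mark an in-flight write), handed to the owned-buffer write call, and restored afterwards even on error. A simplified synchronous sketch; `write_owned` stands in for `VirtualFile::write_all_at`, which takes the buffer by value:

```rust
use bytes::BytesMut;

fn flush_tail(
    slot: &mut Option<BytesMut>,
    write_owned: impl FnOnce(BytesMut) -> (BytesMut, std::io::Result<()>),
) -> std::io::Result<()> {
    // Take the buffer out; `None` marks "IO in flight".
    let taken = slot.take().expect("no IO already in flight");
    let (buf, res) = write_owned(taken);
    // Put the buffer back even on error so subsequent writes can proceed.
    *slot = Some(buf);
    res
}
```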
|
||||
|
||||
impl EphemeralFile {
|
||||
@@ -60,7 +64,7 @@ impl EphemeralFile {
|
||||
_timeline_id: timeline_id,
|
||||
file,
|
||||
len: 0,
|
||||
mutable_tail: [0u8; PAGE_SZ],
|
||||
mutable_tail: Some(BytesMut::zeroed(PAGE_SZ)),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -103,7 +107,13 @@ impl EphemeralFile {
|
||||
};
|
||||
} else {
|
||||
debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64);
|
||||
Ok(BlockLease::EphemeralFileMutableTail(&self.mutable_tail))
|
||||
Ok(BlockLease::EphemeralFileMutableTail(
|
||||
self.mutable_tail
|
||||
.as_deref()
|
||||
.expect("we're not doing IO, it must be Some()")
|
||||
.try_into()
|
||||
.expect("we ensure that it's always PAGE_SZ"),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -135,21 +145,27 @@ impl EphemeralFile {
|
||||
) -> Result<(), io::Error> {
|
||||
let mut src_remaining = src;
|
||||
while !src_remaining.is_empty() {
|
||||
let dst_remaining = &mut self.ephemeral_file.mutable_tail[self.off..];
|
||||
let dst_remaining = &mut self
|
||||
.ephemeral_file
|
||||
.mutable_tail
|
||||
.as_deref_mut()
|
||||
.expect("IO is not yet ongoing")[self.off..];
|
||||
let n = min(dst_remaining.len(), src_remaining.len());
|
||||
dst_remaining[..n].copy_from_slice(&src_remaining[..n]);
|
||||
self.off += n;
|
||||
src_remaining = &src_remaining[n..];
|
||||
if self.off == PAGE_SZ {
|
||||
match self
|
||||
let mutable_tail = std::mem::take(&mut self.ephemeral_file.mutable_tail)
|
||||
.expect("IO is not yet ongoing");
|
||||
let (mutable_tail, res) = self
|
||||
.ephemeral_file
|
||||
.file
|
||||
.write_all_at(
|
||||
&self.ephemeral_file.mutable_tail,
|
||||
self.blknum as u64 * PAGE_SZ as u64,
|
||||
)
|
||||
.await
|
||||
{
|
||||
.write_all_at(mutable_tail, self.blknum as u64 * PAGE_SZ as u64)
|
||||
.await;
|
||||
// TODO: If we panic before we can put the mutable_tail back, subsequent calls will fail.
|
||||
// I.e., the IO isn't retryable if we panic.
|
||||
self.ephemeral_file.mutable_tail = Some(mutable_tail);
|
||||
match res {
|
||||
Ok(_) => {
|
||||
// Pre-warm the page cache with what we just wrote.
|
||||
// This isn't necessary for coherency/correctness, but it's how we've always done it.
|
||||
@@ -169,7 +185,12 @@ impl EphemeralFile {
|
||||
Ok(page_cache::ReadBufResult::NotFound(mut write_guard)) => {
|
||||
let buf: &mut [u8] = write_guard.deref_mut();
|
||||
debug_assert_eq!(buf.len(), PAGE_SZ);
|
||||
buf.copy_from_slice(&self.ephemeral_file.mutable_tail);
|
||||
buf.copy_from_slice(
|
||||
self.ephemeral_file
|
||||
.mutable_tail
|
||||
.as_deref()
|
||||
.expect("IO is not ongoing"),
|
||||
);
|
||||
let _ = write_guard.mark_valid();
|
||||
// pre-warm successful
|
||||
}
|
||||
@@ -181,7 +202,11 @@ impl EphemeralFile {
|
||||
// Zero the buffer for re-use.
|
||||
// Zeroing is critical for correctness because the write_blob code below
|
||||
// and similarly read_blk expect zeroed pages.
|
||||
self.ephemeral_file.mutable_tail.fill(0);
|
||||
self.ephemeral_file
|
||||
.mutable_tail
|
||||
.as_deref_mut()
|
||||
.expect("IO is not ongoing")
|
||||
.fill(0);
|
||||
// This block is done, move to next one.
|
||||
self.blknum += 1;
|
||||
self.off = 0;
|
||||
|
||||
@@ -52,8 +52,7 @@ use crate::repository::Key;
|
||||
use crate::tenant::storage_layer::InMemoryLayer;
|
||||
use anyhow::Result;
|
||||
use pageserver_api::keyspace::KeySpaceAccum;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::{BTreeMap, VecDeque};
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::iter::Peekable;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
@@ -147,43 +146,28 @@ impl Drop for BatchedUpdates<'_> {
|
||||
}
|
||||
|
||||
/// Return value of LayerMap::search
|
||||
#[derive(Eq, PartialEq, Debug)]
|
||||
#[derive(Eq, PartialEq, Debug, Hash)]
|
||||
pub struct SearchResult {
|
||||
pub layer: Arc<PersistentLayerDesc>,
|
||||
pub lsn_floor: Lsn,
|
||||
}
|
||||
|
||||
pub struct OrderedSearchResult(SearchResult);
|
||||
|
||||
impl Ord for OrderedSearchResult {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
self.0.lsn_floor.cmp(&other.0.lsn_floor)
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialOrd for OrderedSearchResult {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq for OrderedSearchResult {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.0.lsn_floor == other.0.lsn_floor
|
||||
}
|
||||
}
|
||||
|
||||
impl Eq for OrderedSearchResult {}
|
||||
|
||||
/// Return value of [`LayerMap::range_search`]
|
||||
///
|
||||
/// Contains a mapping from a layer description to a keyspace
|
||||
/// accumulator that contains all the keys which intersect the layer
|
||||
/// from the original search space. Keys that were not found are accumulated
|
||||
/// in a separate key space accumulator.
|
||||
#[derive(Debug)]
|
||||
pub struct RangeSearchResult {
|
||||
pub found: BTreeMap<OrderedSearchResult, KeySpaceAccum>,
|
||||
pub found: HashMap<SearchResult, KeySpaceAccum>,
|
||||
pub not_found: KeySpaceAccum,
|
||||
}
|
||||
|
||||
impl RangeSearchResult {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
found: BTreeMap::new(),
|
||||
found: HashMap::new(),
|
||||
not_found: KeySpaceAccum::new(),
|
||||
}
|
||||
}
|
||||
@@ -314,7 +298,7 @@ where
|
||||
Some(search_result) => self
|
||||
.result
|
||||
.found
|
||||
.entry(OrderedSearchResult(search_result))
|
||||
.entry(search_result)
|
||||
.or_default()
|
||||
.add_range(covered_range),
|
||||
None => self.pad_range(covered_range),
|
||||
@@ -362,6 +346,35 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Eq, Hash, Debug, Clone)]
|
||||
pub enum InMemoryLayerHandle {
|
||||
Open {
|
||||
lsn_floor: Lsn,
|
||||
end_lsn: Lsn,
|
||||
},
|
||||
Frozen {
|
||||
idx: usize,
|
||||
lsn_floor: Lsn,
|
||||
end_lsn: Lsn,
|
||||
},
|
||||
}
|
||||
|
||||
impl InMemoryLayerHandle {
|
||||
pub fn get_lsn_floor(&self) -> Lsn {
|
||||
match self {
|
||||
InMemoryLayerHandle::Open { lsn_floor, .. } => *lsn_floor,
|
||||
InMemoryLayerHandle::Frozen { lsn_floor, .. } => *lsn_floor,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_end_lsn(&self) -> Lsn {
|
||||
match self {
|
||||
InMemoryLayerHandle::Open { end_lsn, .. } => *end_lsn,
|
||||
InMemoryLayerHandle::Frozen { end_lsn, .. } => *end_lsn,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl LayerMap {
|
||||
///
|
||||
/// Find the latest layer (by lsn.end) that covers the given
|
||||
@@ -556,6 +569,43 @@ impl LayerMap {
|
||||
self.historic.iter()
|
||||
}
|
||||
|
||||
/// Get a handle for the first in memory layer that matches the provided predicate.
|
||||
/// The handle should be used with [`Self::get_in_memory_layer`] to retrieve the actual layer.
|
||||
///
|
||||
/// Note: [`Self::find_in_memory_layer`] and [`Self::get_in_memory_layer`] should be called during
|
||||
/// the same exclusive region established by holding the layer manager lock.
|
||||
pub fn find_in_memory_layer<Pred>(&self, mut pred: Pred) -> Option<InMemoryLayerHandle>
|
||||
where
|
||||
Pred: FnMut(&Arc<InMemoryLayer>) -> bool,
|
||||
{
|
||||
if let Some(open) = &self.open_layer {
|
||||
if pred(open) {
|
||||
return Some(InMemoryLayerHandle::Open {
|
||||
lsn_floor: open.get_lsn_range().start,
|
||||
end_lsn: open.get_lsn_range().end,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
let pos = self.frozen_layers.iter().rev().position(pred);
|
||||
pos.map(|rev_idx| {
|
||||
let idx = self.frozen_layers.len() - 1 - rev_idx;
|
||||
InMemoryLayerHandle::Frozen {
|
||||
idx,
|
||||
lsn_floor: self.frozen_layers[idx].get_lsn_range().start,
|
||||
end_lsn: self.frozen_layers[idx].get_lsn_range().end,
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
/// Get the layer pointed to by the provided handle.
|
||||
pub fn get_in_memory_layer(&self, handle: &InMemoryLayerHandle) -> Option<Arc<InMemoryLayer>> {
|
||||
match handle {
|
||||
InMemoryLayerHandle::Open { .. } => self.open_layer.clone(),
|
||||
InMemoryLayerHandle::Frozen { idx, .. } => self.frozen_layers.get(*idx).cloned(),
|
||||
}
|
||||
}
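Usage sketch for the handle pair above: both calls must happen under the same layer-manager lock guard, because a `Frozen` handle stores an index into `frozen_layers` that is only valid while the map is not mutated. `layers` below is a hypothetical `tokio::sync::RwLock<LayerMap>` standing in for the real layer manager:

```rust
use std::sync::Arc;

async fn lookup_in_memory_layer(
    layers: &tokio::sync::RwLock<LayerMap>,
    lsn: Lsn,
) -> Option<Arc<InMemoryLayer>> {
    let guard = layers.read().await;
    let handle = guard.find_in_memory_layer(|l| l.get_lsn_range().contains(&lsn))?;
    // Still under the same read guard, so the index inside the handle is valid.
    guard.get_in_memory_layer(&handle)
}
```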
|
||||
|
||||
///
|
||||
/// Divide the whole given range of keys into sub-ranges based on the latest
|
||||
/// image layer that covers each range at the specified lsn (inclusive).
|
||||
@@ -869,6 +919,8 @@ impl LayerMap {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[derive(Clone)]
|
||||
@@ -895,15 +947,15 @@ mod tests {
|
||||
|
||||
fn assert_range_search_result_eq(lhs: RangeSearchResult, rhs: RangeSearchResult) {
|
||||
assert_eq!(lhs.not_found.to_keyspace(), rhs.not_found.to_keyspace());
|
||||
let lhs: Vec<_> = lhs
|
||||
let lhs: HashMap<SearchResult, KeySpace> = lhs
|
||||
.found
|
||||
.into_iter()
|
||||
.map(|(search_result, accum)| (search_result.0, accum.to_keyspace()))
|
||||
.map(|(search_result, accum)| (search_result, accum.to_keyspace()))
|
||||
.collect();
|
||||
let rhs: Vec<_> = rhs
|
||||
let rhs: HashMap<SearchResult, KeySpace> = rhs
|
||||
.found
|
||||
.into_iter()
|
||||
.map(|(search_result, accum)| (search_result.0, accum.to_keyspace()))
|
||||
.map(|(search_result, accum)| (search_result, accum.to_keyspace()))
|
||||
.collect();
|
||||
|
||||
assert_eq!(lhs, rhs);
|
||||
@@ -923,7 +975,7 @@ mod tests {
|
||||
Some(res) => {
|
||||
range_search_result
|
||||
.found
|
||||
.entry(OrderedSearchResult(res))
|
||||
.entry(res)
|
||||
.or_default()
|
||||
.add_key(key);
|
||||
}
|
||||
|
||||
@@ -8,20 +8,11 @@
|
||||
//!
|
||||
//! [`remote_timeline_client`]: super::remote_timeline_client
|
||||
|
||||
use std::io::{self};
|
||||
|
||||
use anyhow::{ensure, Context};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use anyhow::ensure;
|
||||
use serde::{de::Error, Deserialize, Serialize, Serializer};
|
||||
use thiserror::Error;
|
||||
use utils::bin_ser::SerializeError;
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::TEMP_FILE_SUFFIX;
|
||||
|
||||
/// Use special format number to enable backward compatibility.
|
||||
const METADATA_FORMAT_VERSION: u16 = 4;
|
||||
|
||||
@@ -268,43 +259,6 @@ impl Serialize for TimelineMetadata {
|
||||
}
|
||||
}
|
||||
|
||||
/// Save timeline metadata to file
|
||||
#[tracing::instrument(skip_all, fields(%tenant_id=tenant_shard_id.tenant_id, %shard_id=tenant_shard_id.shard_slug(), %timeline_id))]
|
||||
pub async fn save_metadata(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline_id: &TimelineId,
|
||||
data: &TimelineMetadata,
|
||||
) -> anyhow::Result<()> {
|
||||
let path = conf.metadata_path(tenant_shard_id, timeline_id);
|
||||
let temp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX);
|
||||
let metadata_bytes = data.to_bytes().context("serialize metadata")?;
|
||||
VirtualFile::crashsafe_overwrite(&path, &temp_path, metadata_bytes)
|
||||
.await
|
||||
.context("write metadata")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum LoadMetadataError {
|
||||
#[error(transparent)]
|
||||
Read(#[from] io::Error),
|
||||
|
||||
#[error(transparent)]
|
||||
Decode(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
pub fn load_metadata(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline_id: &TimelineId,
|
||||
) -> Result<TimelineMetadata, LoadMetadataError> {
|
||||
let metadata_path = conf.metadata_path(tenant_shard_id, timeline_id);
|
||||
let metadata_bytes = std::fs::read(metadata_path)?;
|
||||
|
||||
Ok(TimelineMetadata::from_bytes(&metadata_bytes)?)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
//! page server.
|
||||
|
||||
use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
|
||||
use futures::stream::StreamExt;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::models::ShardParameters;
|
||||
@@ -31,6 +32,7 @@ use crate::control_plane_client::{
|
||||
ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError,
|
||||
};
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::http::routes::ACTIVE_TENANT_TIMEOUT;
|
||||
use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
|
||||
use crate::task_mgr::{self, TaskKind};
|
||||
use crate::tenant::config::{
|
||||
@@ -40,7 +42,7 @@ use crate::tenant::config::{
|
||||
use crate::tenant::delete::DeleteTenantFlow;
|
||||
use crate::tenant::span::debug_assert_current_span_has_tenant_id;
|
||||
use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
|
||||
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};
|
||||
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX};
|
||||
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::fs_ext::PathExt;
|
||||
@@ -357,12 +359,6 @@ fn load_tenant_config(
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
|
||||
if tenant_ignore_mark_file.exists() {
|
||||
info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let tenant_shard_id = match tenant_dir_path
|
||||
.file_name()
|
||||
.unwrap_or_default()
|
||||
@@ -375,6 +371,59 @@ fn load_tenant_config(
|
||||
}
|
||||
};
|
||||
|
||||
// Clean up legacy `metadata` files.
|
||||
// Doing it here because every single tenant directory is visited here.
|
||||
// In any later code, there's different treatment of tenant dirs
|
||||
// ... depending on whether the tenant is in re-attach response or not
|
||||
// ... depending on whether the tenant is ignored or not
|
||||
assert_eq!(
|
||||
&conf.tenant_path(&tenant_shard_id),
|
||||
&tenant_dir_path,
|
||||
"later use of conf....path() methods would be dubious"
|
||||
);
|
||||
let timelines: Vec<TimelineId> = match conf.timelines_path(&tenant_shard_id).read_dir_utf8() {
|
||||
Ok(iter) => {
|
||||
let mut timelines = Vec::new();
|
||||
for res in iter {
|
||||
let p = res?;
|
||||
let Some(timeline_id) = p.file_name().parse::<TimelineId>().ok() else {
|
||||
// skip any entries that aren't TimelineId, such as
|
||||
// - *.___temp dirs
|
||||
// - unfinished initdb uploads (test_non_uploaded_root_timeline_is_deleted_after_restart)
|
||||
continue;
|
||||
};
|
||||
timelines.push(timeline_id);
|
||||
}
|
||||
timelines
|
||||
}
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => vec![],
|
||||
Err(e) => return Err(anyhow::anyhow!(e)),
|
||||
};
|
||||
for timeline_id in timelines {
|
||||
let timeline_path = &conf.timeline_path(&tenant_shard_id, &timeline_id);
|
||||
let metadata_path = timeline_path.join(METADATA_FILE_NAME);
|
||||
match std::fs::remove_file(&metadata_path) {
|
||||
Ok(()) => {
|
||||
crashsafe::fsync(timeline_path)
|
||||
.context("fsync timeline dir after removing legacy metadata file")?;
|
||||
info!("removed legacy metadata file at {metadata_path}");
|
||||
}
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
|
||||
// something removed the file earlier, or it was never there
|
||||
// We don't care, this software version doesn't write it again, so, we're good.
|
||||
}
|
||||
Err(e) => {
|
||||
anyhow::bail!("remove legacy metadata file: {e}: {metadata_path}");
|
||||
}
|
||||
}
|
||||
}
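The cleanup loop above follows a remove-then-fsync pattern and treats `NotFound` as success, so it stays idempotent across restarts. A standalone sketch of that shape; `fsync_dir` is a stand-in for `crashsafe::fsync`:

```rust
use std::io::ErrorKind;
use std::path::Path;

fn remove_legacy_file(
    timeline_dir: &Path,
    legacy_file: &Path,
    fsync_dir: impl Fn(&Path) -> std::io::Result<()>,
) -> anyhow::Result<()> {
    match std::fs::remove_file(legacy_file) {
        Ok(()) => {
            // Persist the directory-entry removal before reporting success.
            fsync_dir(timeline_dir)?;
        }
        // Already gone (earlier run, or never written): nothing to do.
        Err(e) if e.kind() == ErrorKind::NotFound => {}
        Err(e) => anyhow::bail!("remove legacy metadata file {legacy_file:?}: {e}"),
    }
    Ok(())
}
```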
|
||||
|
||||
let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
|
||||
if tenant_ignore_mark_file.exists() {
|
||||
info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
Ok(Some((
|
||||
tenant_shard_id,
|
||||
Tenant::load_tenant_config(conf, &tenant_shard_id),
|
||||
@@ -483,7 +532,7 @@ pub async fn init_tenant_mgr(
|
||||
TenantSlot::Secondary(SecondaryTenant::new(
|
||||
tenant_shard_id,
|
||||
location_conf.shard,
|
||||
location_conf.tenant_conf,
|
||||
location_conf.tenant_conf.clone(),
|
||||
&SecondaryLocationConfig { warm: false },
|
||||
)),
|
||||
);
|
||||
@@ -793,7 +842,7 @@ pub(crate) async fn set_new_tenant_config(
|
||||
info!("configuring tenant {tenant_id}");
|
||||
let tenant = get_tenant(tenant_shard_id, true)?;
|
||||
|
||||
if tenant.tenant_shard_id().shard_count > ShardCount(0) {
|
||||
if !tenant.tenant_shard_id().shard_count.is_unsharded() {
|
||||
// Note that we use ShardParameters::default below.
|
||||
return Err(SetNewTenantConfigError::Other(anyhow::anyhow!(
|
||||
"This API may only be used on single-sharded tenants, use the /location_config API for sharded tenants"
|
||||
@@ -804,7 +853,7 @@ pub(crate) async fn set_new_tenant_config(
|
||||
// API to use is the location_config/ endpoint, which lets the caller provide
|
||||
// the full LocationConf.
|
||||
let location_conf = LocationConf::attached_single(
|
||||
new_tenant_conf,
|
||||
new_tenant_conf.clone(),
|
||||
tenant.generation,
|
||||
&ShardParameters::default(),
|
||||
);
|
||||
@@ -1375,7 +1424,7 @@ impl TenantManager {
|
||||
result
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), new_shard_count=%new_shard_count.0))]
|
||||
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), new_shard_count=%new_shard_count.literal()))]
|
||||
pub(crate) async fn shard_split(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
@@ -1385,11 +1434,10 @@ impl TenantManager {
|
||||
let tenant = get_tenant(tenant_shard_id, true)?;
|
||||
|
||||
// Plan: identify what the new child shards will be
|
||||
let effective_old_shard_count = std::cmp::max(tenant_shard_id.shard_count.0, 1);
|
||||
if new_shard_count <= ShardCount(effective_old_shard_count) {
|
||||
if new_shard_count.count() <= tenant_shard_id.shard_count.count() {
|
||||
anyhow::bail!("Requested shard count is not an increase");
|
||||
}
|
||||
let expansion_factor = new_shard_count.0 / effective_old_shard_count;
|
||||
let expansion_factor = new_shard_count.count() / tenant_shard_id.shard_count.count();
|
||||
if !expansion_factor.is_power_of_two() {
|
||||
anyhow::bail!("Requested split is not a power of two");
|
||||
}
|
||||
@@ -1439,8 +1487,10 @@ impl TenantManager {
|
||||
}
|
||||
};
|
||||
|
||||
// TODO: hardlink layers from the parent into the child shard directories so that they don't immediately re-download
|
||||
// TODO: erase the dentries from the parent
|
||||
// Optimization: hardlink layers from the parent into the children, so that they don't have to
|
||||
// re-download & duplicate the data referenced in their initial IndexPart
|
||||
self.shard_split_hardlink(parent, child_shards.clone())
|
||||
.await?;
|
||||
|
||||
// Take a snapshot of where the parent's WAL ingest had got to: we will wait for
|
||||
// child shards to reach this point.
|
||||
@@ -1464,7 +1514,7 @@ impl TenantManager {
|
||||
attach_mode: AttachmentMode::Single,
|
||||
}),
|
||||
shard: child_shard_identity,
|
||||
tenant_conf: parent_tenant_conf,
|
||||
tenant_conf: parent_tenant_conf.clone(),
|
||||
};
|
||||
|
||||
self.upsert_location(
|
||||
@@ -1479,13 +1529,24 @@ impl TenantManager {
|
||||
|
||||
// Phase 4: wait for child shards WAL ingest to catch up to target LSN
|
||||
for child_shard_id in &child_shards {
|
||||
let child_shard_id = *child_shard_id;
|
||||
let child_shard = {
|
||||
let locked = TENANTS.read().unwrap();
|
||||
let peek_slot =
|
||||
tenant_map_peek_slot(&locked, child_shard_id, TenantSlotPeekMode::Read)?;
|
||||
tenant_map_peek_slot(&locked, &child_shard_id, TenantSlotPeekMode::Read)?;
|
||||
peek_slot.and_then(|s| s.get_attached()).cloned()
|
||||
};
|
||||
if let Some(t) = child_shard {
|
||||
// Wait for the child shard to become active: this should be very quick because it only
|
||||
// has to download the index_part that we just uploaded when creating it.
|
||||
if let Err(e) = t.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await {
|
||||
// This is not fatal: we have durably created the child shard. It just makes the
|
||||
// split operation less seamless for clients, as we may detach the parent
|
||||
// shard before the child shards are fully ready to serve requests.
|
||||
tracing::warn!("Failed to wait for shard {child_shard_id} to activate: {e}");
|
||||
continue;
|
||||
}
|
||||
|
||||
let timelines = t.timelines.lock().unwrap().clone();
|
||||
for timeline in timelines.values() {
|
||||
let Some(target_lsn) = target_lsns.get(&timeline.timeline_id) else {
|
||||
@@ -1517,7 +1578,7 @@ impl TenantManager {
|
||||
}
|
||||
}
|
||||
|
||||
// Phase 5: Shut down the parent shard.
|
||||
// Phase 5: Shut down the parent shard, and erase it from disk
|
||||
let (_guard, progress) = completion::channel();
|
||||
match parent.shutdown(progress, false).await {
|
||||
Ok(()) => {}
|
||||
@@ -1525,6 +1586,24 @@ impl TenantManager {
|
||||
other.wait().await;
|
||||
}
|
||||
}
|
||||
let local_tenant_directory = self.conf.tenant_path(&tenant_shard_id);
|
||||
let tmp_path = safe_rename_tenant_dir(&local_tenant_directory)
|
||||
.await
|
||||
.with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?;
|
||||
task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::MgmtRequest,
|
||||
None,
|
||||
None,
|
||||
"tenant_files_delete",
|
||||
false,
|
||||
async move {
|
||||
fs::remove_dir_all(tmp_path.as_path())
|
||||
.await
|
||||
.with_context(|| format!("tenant directory {:?} deletion", tmp_path))
|
||||
},
|
||||
);
|
||||
|
||||
parent_slot_guard.drop_old_value()?;
|
||||
|
||||
// Phase 6: Release the InProgress on the parent shard
|
||||
@@ -1532,6 +1611,130 @@ impl TenantManager {
|
||||
|
||||
Ok(child_shards)
|
||||
}
|
||||
|
||||
/// Part of [`Self::shard_split`]: hard link parent shard layers into child shards, as an optimization
|
||||
/// to avoid the children downloading them again.
|
||||
///
|
||||
/// For each resident layer in the parent shard, we will hard link it into all of the child shards.
|
||||
async fn shard_split_hardlink(
|
||||
&self,
|
||||
parent_shard: &Tenant,
|
||||
child_shards: Vec<TenantShardId>,
|
||||
) -> anyhow::Result<()> {
|
||||
debug_assert_current_span_has_tenant_id();
|
||||
|
||||
let parent_path = self.conf.tenant_path(parent_shard.get_tenant_shard_id());
|
||||
let (parent_timelines, parent_layers) = {
|
||||
let mut parent_layers = Vec::new();
|
||||
let timelines = parent_shard.timelines.lock().unwrap().clone();
|
||||
let parent_timelines = timelines.keys().cloned().collect::<Vec<_>>();
|
||||
for timeline in timelines.values() {
|
||||
let timeline_layers = timeline
|
||||
.layers
|
||||
.read()
|
||||
.await
|
||||
.resident_layers()
|
||||
.collect::<Vec<_>>()
|
||||
.await;
|
||||
for layer in timeline_layers {
|
||||
let relative_path = layer
|
||||
.local_path()
|
||||
.strip_prefix(&parent_path)
|
||||
.context("Removing prefix from parent layer path")?;
|
||||
parent_layers.push(relative_path.to_owned());
|
||||
}
|
||||
}
|
||||
debug_assert!(
|
||||
!parent_layers.is_empty(),
|
||||
"shutdown cannot empty the layermap"
|
||||
);
|
||||
(parent_timelines, parent_layers)
|
||||
};
|
||||
|
||||
let mut child_prefixes = Vec::new();
|
||||
let mut create_dirs = Vec::new();
|
||||
|
||||
for child in child_shards {
|
||||
let child_prefix = self.conf.tenant_path(&child);
|
||||
create_dirs.push(child_prefix.clone());
|
||||
create_dirs.extend(
|
||||
parent_timelines
|
||||
.iter()
|
||||
.map(|t| self.conf.timeline_path(&child, t)),
|
||||
);
|
||||
|
||||
child_prefixes.push(child_prefix);
|
||||
}
|
||||
|
||||
// Since we will do a large number of small filesystem metadata operations, batch them into
|
||||
// spawn_blocking calls rather than doing each one as a tokio::fs round-trip.
|
||||
let jh = tokio::task::spawn_blocking(move || -> anyhow::Result<usize> {
|
||||
for dir in &create_dirs {
|
||||
if let Err(e) = std::fs::create_dir_all(dir) {
|
||||
// Ignore AlreadyExists errors, drop out on all other errors
|
||||
match e.kind() {
|
||||
std::io::ErrorKind::AlreadyExists => {}
|
||||
_ => {
|
||||
return Err(anyhow::anyhow!(e).context(format!("Creating {dir}")));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for child_prefix in child_prefixes {
|
||||
for relative_layer in &parent_layers {
|
||||
let parent_path = parent_path.join(relative_layer);
|
||||
let child_path = child_prefix.join(relative_layer);
|
||||
if let Err(e) = std::fs::hard_link(&parent_path, &child_path) {
|
||||
match e.kind() {
|
||||
std::io::ErrorKind::AlreadyExists => {}
|
||||
std::io::ErrorKind::NotFound => {
|
||||
tracing::info!(
|
||||
"Layer {} not found during hard-linking, evicted during split?",
|
||||
relative_layer
|
||||
);
|
||||
}
|
||||
_ => {
|
||||
return Err(anyhow::anyhow!(e).context(format!(
|
||||
"Hard linking {relative_layer} into {child_prefix}"
|
||||
)))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Durability is not required for correctness, but if we crashed during split and
|
||||
// then restarted with empty timeline dirs, it would be very inefficient to
|
||||
// re-populate from remote storage.
|
||||
for dir in create_dirs {
|
||||
if let Err(e) = crashsafe::fsync(&dir) {
|
||||
// Something removed a newly created timeline dir out from underneath us? Extremely
|
||||
// unexpected, but not worth panic'ing over as this whole function is just an
|
||||
// optimization.
|
||||
tracing::warn!("Failed to fsync directory {dir}: {e}")
|
||||
}
|
||||
}
|
||||
|
||||
Ok(parent_layers.len())
|
||||
});
|
||||
|
||||
match jh.await {
|
||||
Ok(Ok(layer_count)) => {
|
||||
tracing::info!(count = layer_count, "Hard linked layers into child shards");
|
||||
}
|
||||
Ok(Err(e)) => {
|
||||
// This is an optimization, so we tolerate failure.
|
||||
tracing::warn!("Error hard-linking layers, proceeding anyway: {e}")
|
||||
}
|
||||
Err(e) => {
|
||||
// This is something totally unexpected like a panic, so bail out.
|
||||
anyhow::bail!("Error joining hard linking task: {e}");
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
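The hard-linking step is explicitly best-effort: `AlreadyExists` means a previous split attempt already linked the layer, `NotFound` means the layer was evicted mid-split, and both are tolerated because the children can always re-download from remote storage. A per-layer sketch of that tolerance (not the actual helper):

```rust
use std::io::ErrorKind;
use std::path::Path;

fn link_layer_into_child(parent_layer: &Path, child_layer: &Path) -> anyhow::Result<()> {
    match std::fs::hard_link(parent_layer, child_layer) {
        Ok(()) => Ok(()),
        // A previous (interrupted) split already created this link.
        Err(e) if e.kind() == ErrorKind::AlreadyExists => Ok(()),
        // The layer was evicted while the split was in progress; the child will re-download it.
        Err(e) if e.kind() == ErrorKind::NotFound => Ok(()),
        Err(e) => Err(anyhow::anyhow!(e).context(format!("hard linking {child_layer:?}"))),
    }
}
```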
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
|
||||
@@ -196,14 +196,12 @@ pub(crate) use upload::upload_initdb_dir;
|
||||
use utils::backoff::{
|
||||
self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
};
|
||||
use utils::timeout::{timeout_cancellable, TimeoutCancellableError};
|
||||
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::sync::atomic::{AtomicU32, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::Duration;
|
||||
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath, TimeoutOrCancel};
|
||||
use std::ops::DerefMut;
|
||||
use tracing::{debug, error, info, instrument, warn};
|
||||
use tracing::{info_span, Instrument};
|
||||
@@ -263,11 +261,6 @@ pub(crate) const INITDB_PRESERVED_PATH: &str = "initdb-preserved.tar.zst";
|
||||
/// Default buffer size when interfacing with [`tokio::fs::File`].
|
||||
pub(crate) const BUFFER_SIZE: usize = 32 * 1024;
|
||||
|
||||
/// This timeout is intended to deal with hangs in lower layers, e.g. stuck TCP flows. It is not
|
||||
/// intended to be snappy enough for prompt shutdown, as we have a CancellationToken for that.
|
||||
pub(crate) const UPLOAD_TIMEOUT: Duration = Duration::from_secs(120);
|
||||
pub(crate) const DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(120);
|
||||
|
||||
pub enum MaybeDeletedIndexPart {
|
||||
IndexPart(IndexPart),
|
||||
Deleted(IndexPart),
|
||||
@@ -331,40 +324,6 @@ pub struct RemoteTimelineClient {
|
||||
cancel: CancellationToken,
|
||||
}

/// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to anyhow.
///
/// This is a convenience for the various upload functions. In future
/// the anyhow::Error result should be replaced with a more structured type that
/// enables callers to avoid handling shutdown as an error.
async fn upload_cancellable<F>(cancel: &CancellationToken, future: F) -> anyhow::Result<()>
where
    F: std::future::Future<Output = anyhow::Result<()>>,
{
    match timeout_cancellable(UPLOAD_TIMEOUT, cancel, future).await {
        Ok(Ok(())) => Ok(()),
        Ok(Err(e)) => Err(e),
        Err(TimeoutCancellableError::Timeout) => Err(anyhow::anyhow!("Timeout")),
        Err(TimeoutCancellableError::Cancelled) => Err(anyhow::anyhow!("Shutting down")),
    }
}
/// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to DownloadError.
async fn download_cancellable<F, R>(
    cancel: &CancellationToken,
    future: F,
) -> Result<R, DownloadError>
where
    F: std::future::Future<Output = Result<R, DownloadError>>,
{
    match timeout_cancellable(DOWNLOAD_TIMEOUT, cancel, future).await {
        Ok(Ok(r)) => Ok(r),
        Ok(Err(e)) => Err(e),
        Err(TimeoutCancellableError::Timeout) => {
            Err(DownloadError::Other(anyhow::anyhow!("Timed out")))
        }
        Err(TimeoutCancellableError::Cancelled) => Err(DownloadError::Cancelled),
    }
}

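These two wrappers are deleted because the remote-storage API now takes the CancellationToken directly and applies its own timeouts, as the later hunks show (for example `storage.download(&remote_path, cancel)`). For reference, the general shape the deleted helpers implemented can be sketched as follows; this is a simplified stand-in assuming tokio, tokio-util, anyhow and thiserror, with `OpError` as an invented error type rather than anything from this codebase:

use std::time::Duration;
use tokio_util::sync::CancellationToken;

#[derive(Debug, thiserror::Error)]
enum OpError {
    #[error("operation timed out")]
    Timeout,
    #[error("shutting down")]
    Cancelled,
    #[error(transparent)]
    Other(#[from] anyhow::Error),
}

/// Run `fut` with an upper time bound and an early exit on shutdown.
async fn with_timeout_and_cancel<T>(
    timeout: Duration,
    cancel: &CancellationToken,
    fut: impl std::future::Future<Output = anyhow::Result<T>>,
) -> Result<T, OpError> {
    tokio::select! {
        // Shutdown wins over the operation: cancellation should be prompt.
        _ = cancel.cancelled() => Err(OpError::Cancelled),
        res = tokio::time::timeout(timeout, fut) => match res {
            Ok(Ok(v)) => Ok(v),
            Ok(Err(e)) => Err(OpError::Other(e)),
            Err(_elapsed) => Err(OpError::Timeout),
        },
    }
}
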
impl RemoteTimelineClient {
|
||||
///
|
||||
/// Create a remote storage client for given timeline
|
||||
@@ -655,7 +614,7 @@ impl RemoteTimelineClient {
|
||||
metadata,
|
||||
);
|
||||
let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
|
||||
self.calls_unfinished_metric_begin(&op);
|
||||
self.metric_begin(&op);
|
||||
upload_queue.queued_operations.push_back(op);
|
||||
upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0;
|
||||
|
||||
@@ -695,7 +654,7 @@ impl RemoteTimelineClient {
|
||||
metadata.generation, metadata.shard
|
||||
);
|
||||
let op = UploadOp::UploadLayer(layer, metadata);
|
||||
self.calls_unfinished_metric_begin(&op);
|
||||
self.metric_begin(&op);
|
||||
upload_queue.queued_operations.push_back(op);
|
||||
}
|
||||
|
||||
@@ -864,10 +823,14 @@ impl RemoteTimelineClient {
|
||||
}
|
||||
|
||||
// schedule the actual deletions
|
||||
if with_metadata.is_empty() {
|
||||
// avoid scheduling the op & bumping the metric
|
||||
return;
|
||||
}
|
||||
let op = UploadOp::Delete(Delete {
|
||||
layers: with_metadata,
|
||||
});
|
||||
self.calls_unfinished_metric_begin(&op);
|
||||
self.metric_begin(&op);
|
||||
upload_queue.queued_operations.push_back(op);
|
||||
}
|
||||
|
||||
@@ -1050,7 +1013,7 @@ impl RemoteTimelineClient {
|
||||
&self.cancel,
|
||||
)
|
||||
.await
|
||||
.ok_or_else(|| anyhow::anyhow!("Cancelled"))
|
||||
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
|
||||
.and_then(|x| x)?;
|
||||
|
||||
// all good, disarm the guard and mark as success
|
||||
@@ -1082,14 +1045,14 @@ impl RemoteTimelineClient {
|
||||
upload::preserve_initdb_archive(&self.storage_impl, tenant_id, timeline_id, cancel)
|
||||
.await
|
||||
},
|
||||
|_e| false,
|
||||
TimeoutOrCancel::caused_by_cancel,
|
||||
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"preserve_initdb_tar_zst",
|
||||
&cancel.clone(),
|
||||
)
|
||||
.await
|
||||
.ok_or_else(|| anyhow::anyhow!("Cancellled"))
|
||||
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
|
||||
.and_then(|x| x)
|
||||
.context("backing up initdb archive")?;
|
||||
Ok(())
|
||||
@@ -1151,7 +1114,7 @@ impl RemoteTimelineClient {
|
||||
let remaining = download_retry(
|
||||
|| async {
|
||||
self.storage_impl
|
||||
.list_files(Some(&timeline_storage_path), None)
|
||||
.list_files(Some(&timeline_storage_path), None, &cancel)
|
||||
.await
|
||||
},
|
||||
"list remaining files",
|
||||
@@ -1445,6 +1408,10 @@ impl RemoteTimelineClient {
|
||||
Ok(()) => {
|
||||
break;
|
||||
}
|
||||
Err(e) if TimeoutOrCancel::caused_by_cancel(&e) => {
|
||||
// loop around to do the proper stopping
|
||||
continue;
|
||||
}
|
||||
Err(e) => {
|
||||
let retries = task.retries.fetch_add(1, Ordering::SeqCst);
|
||||
|
||||
@@ -1553,10 +1520,10 @@ impl RemoteTimelineClient {
|
||||
.await;
|
||||
}
|
||||
|
||||
self.calls_unfinished_metric_end(&task.op);
|
||||
self.metric_end(&task.op);
|
||||
}
|
||||
|
||||
fn calls_unfinished_metric_impl(
|
||||
fn metric_impl(
|
||||
&self,
|
||||
op: &UploadOp,
|
||||
) -> Option<(
|
||||
@@ -1593,17 +1560,17 @@ impl RemoteTimelineClient {
|
||||
Some(res)
|
||||
}
|
||||
|
||||
fn calls_unfinished_metric_begin(&self, op: &UploadOp) {
|
||||
let (file_kind, op_kind, track_bytes) = match self.calls_unfinished_metric_impl(op) {
|
||||
fn metric_begin(&self, op: &UploadOp) {
|
||||
let (file_kind, op_kind, track_bytes) = match self.metric_impl(op) {
|
||||
Some(x) => x,
|
||||
None => return,
|
||||
};
|
||||
let guard = self.metrics.call_begin(&file_kind, &op_kind, track_bytes);
|
||||
guard.will_decrement_manually(); // in unfinished_ops_metric_end()
|
||||
guard.will_decrement_manually(); // in metric_end(), see right below
|
||||
}
|
||||
|
||||
fn calls_unfinished_metric_end(&self, op: &UploadOp) {
|
||||
let (file_kind, op_kind, track_bytes) = match self.calls_unfinished_metric_impl(op) {
|
||||
fn metric_end(&self, op: &UploadOp) {
|
||||
let (file_kind, op_kind, track_bytes) = match self.metric_impl(op) {
|
||||
Some(x) => x,
|
||||
None => return,
|
||||
};
|
||||
@@ -1688,7 +1655,7 @@ impl RemoteTimelineClient {
|
||||
|
||||
// Tear down queued ops
|
||||
for op in qi.queued_operations.into_iter() {
|
||||
self.calls_unfinished_metric_end(&op);
|
||||
self.metric_end(&op);
|
||||
// Dropping UploadOp::Barrier() here will make wait_completion() return with an Err()
|
||||
// which is exactly what we want to happen.
|
||||
drop(op);
|
||||
|
||||
@@ -11,16 +11,14 @@ use camino::{Utf8Path, Utf8PathBuf};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use tokio::fs::{self, File, OpenOptions};
|
||||
use tokio::io::{AsyncSeekExt, AsyncWriteExt};
|
||||
use tokio_util::io::StreamReader;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::warn;
|
||||
use utils::timeout::timeout_cancellable;
|
||||
use utils::{backoff, crashsafe};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::remote_timeline_client::{
|
||||
download_cancellable, remote_layer_path, remote_timelines_path, DOWNLOAD_TIMEOUT,
|
||||
};
|
||||
use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
|
||||
use crate::tenant::storage_layer::LayerFileName;
|
||||
use crate::tenant::Generation;
|
||||
use crate::virtual_file::on_fatal_io_error;
|
||||
@@ -83,60 +81,28 @@ pub async fn download_layer_file<'a>(
|
||||
.with_context(|| format!("create a destination file for layer '{temp_file_path}'"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
// Cancellation safety: it is safe to cancel this future, because it isn't writing to a local
|
||||
// file: the write to local file doesn't start until after the request header is returned
|
||||
// and we start draining the body stream below
|
||||
let download = download_cancellable(cancel, storage.download(&remote_path))
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"open a download stream for layer with remote storage path '{remote_path:?}'"
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
let download = storage.download(&remote_path, cancel).await?;
|
||||
|
||||
let mut destination_file =
|
||||
tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file);
|
||||
|
||||
let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
|
||||
|
||||
// Cancellation safety: it is safe to cancel this future because it is writing into a temporary file,
|
||||
// and we will unlink the temporary file if there is an error. This unlink is important because we
|
||||
// are in a retry loop, and we wouldn't want to leave behind a rogue write I/O to a file that
|
||||
// we will imminiently try and write to again.
|
||||
let bytes_amount: u64 = match timeout_cancellable(
|
||||
DOWNLOAD_TIMEOUT,
|
||||
cancel,
|
||||
tokio::io::copy_buf(&mut reader, &mut destination_file),
|
||||
)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"download layer at remote path '{remote_path:?}' into file {temp_file_path:?}"
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::Other)?
|
||||
{
|
||||
Ok(b) => Ok(b),
|
||||
let bytes_amount = tokio::io::copy_buf(&mut reader, &mut destination_file).await;
|
||||
|
||||
match bytes_amount {
|
||||
Ok(bytes_amount) => {
|
||||
let destination_file = destination_file.into_inner();
|
||||
Ok((destination_file, bytes_amount))
|
||||
}
|
||||
Err(e) => {
|
||||
// Remove incomplete files: on restart Timeline would do this anyway, but we must
|
||||
// do it here for the retry case.
|
||||
if let Err(e) = tokio::fs::remove_file(&temp_file_path).await {
|
||||
on_fatal_io_error(&e, &format!("Removing temporary file {temp_file_path}"));
|
||||
}
|
||||
Err(e)
|
||||
|
||||
Err(e.into())
|
||||
}
|
||||
}
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"download layer at remote path '{remote_path:?}' into file {temp_file_path:?}"
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let destination_file = destination_file.into_inner();
|
||||
|
||||
Ok((destination_file, bytes_amount))
|
||||
},
|
||||
&format!("download {remote_path:?}"),
|
||||
cancel,
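The rewritten download_layer_file keeps the invariant spelled out in its comments: the response body is streamed into a temporary file, and on any error that file is unlinked so the surrounding retry loop starts from a clean slate instead of appending to a rogue partial write. A simplified sketch of that shape, assuming the body arrives as a bytes stream (the real code obtains it from `GenericRemoteStorage::download`):

use camino::Utf8Path;
use futures::Stream;
use tokio::io::AsyncWriteExt;
use tokio_util::io::StreamReader;

/// Stream `body` into `temp_path`. If anything fails, unlink the partial file so a
/// retry starts fresh.
async fn download_to_temp_file(
    temp_path: &Utf8Path,
    body: impl Stream<Item = std::io::Result<bytes::Bytes>> + Unpin,
) -> anyhow::Result<u64> {
    let file = tokio::fs::File::create(temp_path).await?;
    let mut writer = tokio::io::BufWriter::new(file);
    let mut reader = StreamReader::new(body);

    let result: anyhow::Result<u64> = async {
        let n = tokio::io::copy_buf(&mut reader, &mut writer).await?;
        writer.flush().await?;
        Ok(n)
    }
    .await;

    if result.is_err() {
        // Best-effort cleanup; a later retry or restart would remove it anyway.
        let _ = tokio::fs::remove_file(temp_path).await;
    }
    result
}
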
@@ -218,9 +184,11 @@ pub async fn list_remote_timelines(
|
||||
|
||||
let listing = download_retry_forever(
|
||||
|| {
|
||||
download_cancellable(
|
||||
storage.list(
|
||||
Some(&remote_path),
|
||||
ListingMode::WithDelimiter,
|
||||
None,
|
||||
&cancel,
|
||||
storage.list(Some(&remote_path), ListingMode::WithDelimiter, None),
|
||||
)
|
||||
},
|
||||
&format!("list timelines for {tenant_shard_id}"),
|
||||
@@ -259,26 +227,20 @@ async fn do_download_index_part(
|
||||
index_generation: Generation,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<IndexPart, DownloadError> {
|
||||
use futures::stream::StreamExt;
|
||||
|
||||
let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);
|
||||
|
||||
let index_part_bytes = download_retry_forever(
|
||||
|| async {
|
||||
// Cancellation: if is safe to cancel this future because we're just downloading into
|
||||
// a memory buffer, not touching local disk.
|
||||
let index_part_download =
|
||||
download_cancellable(cancel, storage.download(&remote_path)).await?;
|
||||
let download = storage.download(&remote_path, cancel).await?;
|
||||
|
||||
let mut index_part_bytes = Vec::new();
|
||||
let mut stream = std::pin::pin!(index_part_download.download_stream);
|
||||
while let Some(chunk) = stream.next().await {
|
||||
let chunk = chunk
|
||||
.with_context(|| format!("download index part at {remote_path:?}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
index_part_bytes.extend_from_slice(&chunk[..]);
|
||||
}
|
||||
Ok(index_part_bytes)
|
||||
let mut bytes = Vec::new();
|
||||
|
||||
let stream = download.download_stream;
|
||||
let mut stream = StreamReader::new(stream);
|
||||
|
||||
tokio::io::copy_buf(&mut stream, &mut bytes).await?;
|
||||
|
||||
Ok(bytes)
|
||||
},
|
||||
&format!("download {remote_path:?}"),
|
||||
cancel,
|
||||
@@ -373,7 +335,7 @@ pub(super) async fn download_index_part(
|
||||
let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none());
|
||||
|
||||
let indices = download_retry(
|
||||
|| async { storage.list_files(Some(&index_prefix), None).await },
|
||||
|| async { storage.list_files(Some(&index_prefix), None, cancel).await },
|
||||
"list index_part files",
|
||||
cancel,
|
||||
)
|
||||
@@ -446,24 +408,17 @@ pub(crate) async fn download_initdb_tar_zst(
|
||||
.with_context(|| format!("tempfile creation {temp_path}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let download = match download_cancellable(cancel, storage.download(&remote_path)).await
|
||||
{
|
||||
let download = match storage.download(&remote_path, cancel).await {
|
||||
Ok(dl) => dl,
|
||||
Err(DownloadError::NotFound) => {
|
||||
download_cancellable(cancel, storage.download(&remote_preserved_path)).await?
|
||||
storage.download(&remote_preserved_path, cancel).await?
|
||||
}
|
||||
Err(other) => Err(other)?,
|
||||
};
|
||||
let mut download = tokio_util::io::StreamReader::new(download.download_stream);
|
||||
let mut writer = tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, file);
|
||||
|
||||
// TODO: this consumption of the response body should be subject to timeout + cancellation, but
|
||||
// not without thinking carefully about how to recover safely from cancelling a write to
|
||||
// local storage (e.g. by writing into a temp file as we do in download_layer)
|
||||
tokio::io::copy_buf(&mut download, &mut writer)
|
||||
.await
|
||||
.with_context(|| format!("download initdb.tar.zst at {remote_path:?}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
tokio::io::copy_buf(&mut download, &mut writer).await?;
|
||||
|
||||
let mut file = writer.into_inner();
|
||||
|
||||
|
||||
@@ -16,7 +16,7 @@ use crate::{
|
||||
config::PageServerConf,
|
||||
tenant::remote_timeline_client::{
|
||||
index::IndexPart, remote_index_path, remote_initdb_archive_path,
|
||||
remote_initdb_preserved_archive_path, remote_path, upload_cancellable,
|
||||
remote_initdb_preserved_archive_path, remote_path,
|
||||
},
|
||||
};
|
||||
use remote_storage::{GenericRemoteStorage, TimeTravelError};
|
||||
@@ -49,16 +49,15 @@ pub(crate) async fn upload_index_part<'a>(
|
||||
let index_part_bytes = bytes::Bytes::from(index_part_bytes);
|
||||
|
||||
let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation);
|
||||
upload_cancellable(
|
||||
cancel,
|
||||
storage.upload_storage_object(
|
||||
storage
|
||||
.upload_storage_object(
|
||||
futures::stream::once(futures::future::ready(Ok(index_part_bytes))),
|
||||
index_part_size,
|
||||
&remote_path,
|
||||
),
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
|
||||
}
|
||||
|
||||
/// Attempts to upload given layer files.
|
||||
@@ -115,11 +114,10 @@ pub(super) async fn upload_timeline_layer<'a>(
|
||||
|
||||
let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE);
|
||||
|
||||
upload_cancellable(cancel, storage.upload(reader, fs_size, &storage_path, None))
|
||||
storage
|
||||
.upload(reader, fs_size, &storage_path, None, cancel)
|
||||
.await
|
||||
.with_context(|| format!("upload layer from local path '{source_path}'"))?;
|
||||
|
||||
Ok(())
|
||||
.with_context(|| format!("upload layer from local path '{source_path}'"))
|
||||
}
|
||||
|
||||
/// Uploads the given `initdb` data to the remote storage.
|
||||
@@ -139,12 +137,10 @@ pub(crate) async fn upload_initdb_dir(
|
||||
let file = tokio_util::io::ReaderStream::with_capacity(initdb_tar_zst, super::BUFFER_SIZE);
|
||||
|
||||
let remote_path = remote_initdb_archive_path(tenant_id, timeline_id);
|
||||
upload_cancellable(
|
||||
cancel,
|
||||
storage.upload_storage_object(file, size as usize, &remote_path),
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
|
||||
storage
|
||||
.upload_storage_object(file, size as usize, &remote_path, cancel)
|
||||
.await
|
||||
.with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
|
||||
}
|
||||
|
||||
pub(crate) async fn preserve_initdb_archive(
|
||||
@@ -155,7 +151,8 @@ pub(crate) async fn preserve_initdb_archive(
|
||||
) -> anyhow::Result<()> {
|
||||
let source_path = remote_initdb_archive_path(tenant_id, timeline_id);
|
||||
let dest_path = remote_initdb_preserved_archive_path(tenant_id, timeline_id);
|
||||
upload_cancellable(cancel, storage.copy_object(&source_path, &dest_path))
|
||||
storage
|
||||
.copy_object(&source_path, &dest_path, cancel)
|
||||
.await
|
||||
.with_context(|| format!("backing up initdb archive for '{tenant_id} / {timeline_id}'"))
|
||||
}
|
||||
|
||||
@@ -133,7 +133,7 @@ impl SecondaryTenant {
|
||||
}
|
||||
|
||||
pub(crate) fn set_tenant_conf(&self, config: &TenantConfOpt) {
|
||||
*(self.tenant_conf.lock().unwrap()) = *config;
|
||||
*(self.tenant_conf.lock().unwrap()) = config.clone();
|
||||
}
|
||||
|
||||
/// For API access: generate a LocationConfig equivalent to the one that would be used to
|
||||
@@ -144,13 +144,13 @@ impl SecondaryTenant {
|
||||
|
||||
let conf = models::LocationConfigSecondary { warm: conf.warm };
|
||||
|
||||
let tenant_conf = *self.tenant_conf.lock().unwrap();
|
||||
let tenant_conf = self.tenant_conf.lock().unwrap().clone();
|
||||
models::LocationConfig {
|
||||
mode: models::LocationConfigMode::Secondary,
|
||||
generation: None,
|
||||
secondary_conf: Some(conf),
|
||||
shard_number: self.tenant_shard_id.shard_number.0,
|
||||
shard_count: self.tenant_shard_id.shard_count.0,
|
||||
shard_count: self.tenant_shard_id.shard_count.literal(),
|
||||
shard_stripe_size: self.shard_identity.stripe_size.0,
|
||||
tenant_conf: tenant_conf.into(),
|
||||
}
|
||||
|
||||
@@ -37,6 +37,7 @@ use crate::tenant::{
|
||||
remote_timeline_client::{download::download_layer_file, remote_heatmap_path},
|
||||
};
|
||||
|
||||
use camino::Utf8PathBuf;
|
||||
use chrono::format::{DelayedFormat, StrftimeItems};
|
||||
use futures::Future;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
@@ -44,7 +45,7 @@ use rand::Rng;
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage};
|
||||
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{info_span, instrument, Instrument};
|
||||
use tracing::{info_span, instrument, warn, Instrument};
|
||||
use utils::{
|
||||
backoff, completion::Barrier, crashsafe::path_with_suffix_extension, fs_ext, id::TimelineId,
|
||||
};
|
||||
@@ -438,8 +439,14 @@ impl From<std::io::Error> for UpdateError {
|
||||
fn from(value: std::io::Error) -> Self {
|
||||
if let Some(nix::errno::Errno::ENOSPC) = value.raw_os_error().map(nix::errno::from_i32) {
|
||||
UpdateError::NoSpace
|
||||
} else if value
|
||||
.get_ref()
|
||||
.and_then(|x| x.downcast_ref::<DownloadError>())
|
||||
.is_some()
|
||||
{
|
||||
UpdateError::from(DownloadError::from(value))
|
||||
} else {
|
||||
// An I/O error from e.g. tokio::io::copy is most likely a remote storage issue
|
||||
// An I/O error from e.g. tokio::io::copy_buf is most likely a remote storage issue
|
||||
UpdateError::Other(anyhow::anyhow!(value))
|
||||
}
|
||||
}
|
||||
@@ -523,12 +530,13 @@ impl<'a> TenantDownloader<'a> {
|
||||
tracing::debug!("Downloading heatmap for secondary tenant",);
|
||||
|
||||
let heatmap_path = remote_heatmap_path(tenant_shard_id);
|
||||
let cancel = &self.secondary_state.cancel;
|
||||
|
||||
let heatmap_bytes = backoff::retry(
|
||||
|| async {
|
||||
let download = self
|
||||
.remote_storage
|
||||
.download(&heatmap_path)
|
||||
.download(&heatmap_path, cancel)
|
||||
.await
|
||||
.map_err(UpdateError::from)?;
|
||||
let mut heatmap_bytes = Vec::new();
|
||||
@@ -540,7 +548,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"download heatmap",
|
||||
&self.secondary_state.cancel,
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
.ok_or_else(|| UpdateError::Cancelled)
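The heatmap download above goes through `backoff::retry`, which retries transient failures and yields `None` once the cancellation token fires (hence the `.ok_or_else(|| UpdateError::Cancelled)`). A simplified stand-in for that helper, not its real signature, might look like this:

use std::time::Duration;
use tokio_util::sync::CancellationToken;

/// Retry `op` with capped exponential backoff. Returns `None` if cancelled,
/// mirroring the "treat shutdown as absence of a result" convention used above.
async fn retry_with_backoff<T, E, F, Fut>(
    mut op: F,
    max_attempts: u32,
    cancel: &CancellationToken,
) -> Option<Result<T, E>>
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = Result<T, E>>,
{
    let mut delay = Duration::from_millis(100);
    for attempt in 0..max_attempts {
        if cancel.is_cancelled() {
            return None;
        }
        match op().await {
            Ok(v) => return Some(Ok(v)),
            Err(e) if attempt + 1 == max_attempts => return Some(Err(e)),
            Err(_) => {
                // Sleep between attempts, but wake up immediately on shutdown.
                tokio::select! {
                    _ = cancel.cancelled() => return None,
                    _ = tokio::time::sleep(delay) => {}
                }
                delay = (delay * 2).min(Duration::from_secs(30));
            }
        }
    }
    None
}
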
@@ -671,20 +679,17 @@ impl<'a> TenantDownloader<'a> {
|
||||
.await
|
||||
{
|
||||
Ok(bytes) => bytes,
|
||||
Err(e) => {
|
||||
if let DownloadError::NotFound = e {
|
||||
// A heatmap might be out of date and refer to a layer that doesn't exist any more.
|
||||
// This is harmless: continue to download the next layer. It is expected during compaction
|
||||
// GC.
|
||||
tracing::debug!(
|
||||
"Skipped downloading missing layer {}, raced with compaction/gc?",
|
||||
layer.name
|
||||
);
|
||||
continue;
|
||||
} else {
|
||||
return Err(e.into());
|
||||
}
|
||||
Err(DownloadError::NotFound) => {
|
||||
// A heatmap might be out of date and refer to a layer that doesn't exist any more.
|
||||
// This is harmless: continue to download the next layer. It is expected during compaction
|
||||
// GC.
|
||||
tracing::debug!(
|
||||
"Skipped downloading missing layer {}, raced with compaction/gc?",
|
||||
layer.name
|
||||
);
|
||||
continue;
|
||||
}
|
||||
Err(e) => return Err(e.into()),
|
||||
};
|
||||
|
||||
if downloaded_bytes != layer.metadata.file_size {
|
||||
@@ -774,19 +779,33 @@ async fn init_timeline_state(
|
||||
.await
|
||||
.fatal_err(&format!("Listing {timeline_path}"))
|
||||
{
|
||||
let dentry_file_name = dentry.file_name();
|
||||
let file_name = dentry_file_name.to_string_lossy();
|
||||
let local_meta = dentry.metadata().await.fatal_err(&format!(
|
||||
"Read metadata on {}",
|
||||
dentry.path().to_string_lossy()
|
||||
));
|
||||
let Ok(file_path) = Utf8PathBuf::from_path_buf(dentry.path()) else {
|
||||
tracing::warn!("Malformed filename at {}", dentry.path().to_string_lossy());
|
||||
continue;
|
||||
};
|
||||
let local_meta = dentry
|
||||
.metadata()
|
||||
.await
|
||||
.fatal_err(&format!("Read metadata on {}", file_path));
|
||||
|
||||
// Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
|
||||
let file_name = file_path.file_name().expect("created it from the dentry");
|
||||
if file_name == METADATA_FILE_NAME {
|
||||
// Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
|
||||
warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config");
|
||||
continue;
|
||||
} else if crate::is_temporary(&file_path) {
|
||||
// Temporary files are frequently left behind from restarting during downloads
|
||||
tracing::info!("Cleaning up temporary file {file_path}");
|
||||
if let Err(e) = tokio::fs::remove_file(&file_path)
|
||||
.await
|
||||
.or_else(fs_ext::ignore_not_found)
|
||||
{
|
||||
tracing::error!("Failed to remove temporary file {file_path}: {e}");
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
match LayerFileName::from_str(&file_name) {
|
||||
match LayerFileName::from_str(file_name) {
|
||||
Ok(name) => {
|
||||
let remote_meta = heatmap_metadata.get(&name);
|
||||
match remote_meta {
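The init_timeline_state changes above move the directory scan onto `camino::Utf8PathBuf` so non-UTF-8 names are skipped with a warning, and keep deleting temporary files left behind by interrupted downloads. A reduced sketch of that loop, assuming camino and tokio and a caller-supplied `is_temporary` predicate in place of `crate::is_temporary`:

use camino::Utf8PathBuf;

/// Walk a timeline directory, removing leftover temp files and returning the
/// UTF-8 paths of everything else. Non-UTF-8 names are skipped with a warning.
async fn scan_timeline_dir(
    dir: &Utf8PathBuf,
    is_temporary: impl Fn(&Utf8PathBuf) -> bool,
) -> anyhow::Result<Vec<Utf8PathBuf>> {
    let mut kept = Vec::new();
    let mut entries = tokio::fs::read_dir(dir).await?;
    while let Some(dentry) = entries.next_entry().await? {
        let Ok(path) = Utf8PathBuf::from_path_buf(dentry.path()) else {
            eprintln!("Malformed filename at {}", dentry.path().to_string_lossy());
            continue;
        };
        if is_temporary(&path) {
            // Temporary files are frequently left behind by restarts mid-download.
            if let Err(e) = tokio::fs::remove_file(&path).await {
                if e.kind() != std::io::ErrorKind::NotFound {
                    eprintln!("Failed to remove temporary file {path}: {e}");
                }
            }
            continue;
        }
        kept.push(path);
    }
    Ok(kept)
}
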
@@ -21,18 +21,17 @@ use futures::Future;
|
||||
use md5;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use rand::Rng;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use remote_storage::{GenericRemoteStorage, TimeoutOrCancel};
|
||||
|
||||
use super::{
|
||||
heatmap::HeatMapTenant,
|
||||
scheduler::{self, JobGenerator, RunningJob, SchedulingResult, TenantBackgroundJobs},
|
||||
CommandRequest,
|
||||
CommandRequest, UploadCommand,
|
||||
};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{info_span, instrument, Instrument};
|
||||
use utils::{backoff, completion::Barrier, yielding_loop::yielding_loop};
|
||||
|
||||
use super::{heatmap::HeatMapTenant, UploadCommand};
|
||||
|
||||
pub(super) async fn heatmap_uploader_task(
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
@@ -417,10 +416,10 @@ async fn upload_tenant_heatmap(
|
||||
|| async {
|
||||
let bytes = futures::stream::once(futures::future::ready(Ok(bytes.clone())));
|
||||
remote_storage
|
||||
.upload_storage_object(bytes, size, &path)
|
||||
.upload_storage_object(bytes, size, &path, cancel)
|
||||
.await
|
||||
},
|
||||
|_| false,
|
||||
TimeoutOrCancel::caused_by_cancel,
|
||||
3,
|
||||
u32::MAX,
|
||||
"Uploading heatmap",
|
||||
|
||||
@@ -8,15 +8,21 @@ pub(crate) mod layer;
|
||||
mod layer_desc;
|
||||
|
||||
use crate::context::{AccessStatsBehavior, RequestContext};
|
||||
use crate::repository::Value;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use bytes::Bytes;
|
||||
use enum_map::EnumMap;
|
||||
use enumset::EnumSet;
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
|
||||
use pageserver_api::models::{
|
||||
LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
|
||||
};
|
||||
use std::cmp::{Ordering, Reverse};
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::{BinaryHeap, HashMap};
|
||||
use std::ops::Range;
|
||||
use std::sync::Mutex;
|
||||
use std::time::{Duration, SystemTime, UNIX_EPOCH};
|
||||
@@ -34,6 +40,11 @@ pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
|
||||
|
||||
pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
|
||||
|
||||
use super::layer_map::InMemoryLayerHandle;
|
||||
use super::timeline::layer_manager::LayerManager;
|
||||
use super::timeline::GetVectoredError;
|
||||
use super::PageReconstructError;
|
||||
|
||||
pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
|
||||
where
|
||||
T: PartialOrd<T>,
|
||||
@@ -67,6 +78,277 @@ pub struct ValueReconstructState {
|
||||
pub img: Option<(Lsn, Bytes)>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
|
||||
pub(crate) enum ValueReconstructSituation {
|
||||
Complete,
|
||||
#[default]
|
||||
Continue,
|
||||
}
|
||||
|
||||
/// Reconstruct data accumulated for a single key during a vectored get
|
||||
#[derive(Debug, Default, Clone)]
|
||||
pub(crate) struct VectoredValueReconstructState {
|
||||
pub(crate) records: Vec<(Lsn, NeonWalRecord)>,
|
||||
pub(crate) img: Option<(Lsn, Bytes)>,
|
||||
|
||||
situation: ValueReconstructSituation,
|
||||
}
|
||||
|
||||
impl VectoredValueReconstructState {
|
||||
fn get_cached_lsn(&self) -> Option<Lsn> {
|
||||
self.img.as_ref().map(|img| img.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<VectoredValueReconstructState> for ValueReconstructState {
|
||||
fn from(mut state: VectoredValueReconstructState) -> Self {
|
||||
// walredo expects the records to be descending in terms of Lsn
|
||||
state.records.sort_by_key(|(lsn, _)| Reverse(*lsn));
|
||||
|
||||
ValueReconstructState {
|
||||
records: state.records,
|
||||
img: state.img,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Bag of data accumulated during a vectored get
|
||||
pub(crate) struct ValuesReconstructState {
|
||||
pub(crate) keys: HashMap<Key, Result<VectoredValueReconstructState, PageReconstructError>>,
|
||||
|
||||
keys_done: KeySpaceRandomAccum,
|
||||
}
|
||||
|
||||
impl ValuesReconstructState {
|
||||
pub(crate) fn new() -> Self {
|
||||
Self {
|
||||
keys: HashMap::new(),
|
||||
keys_done: KeySpaceRandomAccum::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Associate a key with the error which it encountered and mark it as done
|
||||
pub(crate) fn on_key_error(&mut self, key: Key, err: PageReconstructError) {
|
||||
let previous = self.keys.insert(key, Err(err));
|
||||
if let Some(Ok(state)) = previous {
|
||||
if state.situation == ValueReconstructSituation::Continue {
|
||||
self.keys_done.add_key(key);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Update the state collected for a given key.
|
||||
/// Returns true if this was the last value needed for the key and false otherwise.
|
||||
///
|
||||
/// If the key is done after the update, mark it as such.
|
||||
pub(crate) fn update_key(
|
||||
&mut self,
|
||||
key: &Key,
|
||||
lsn: Lsn,
|
||||
value: Value,
|
||||
) -> ValueReconstructSituation {
|
||||
let state = self
|
||||
.keys
|
||||
.entry(*key)
|
||||
.or_insert(Ok(VectoredValueReconstructState::default()));
|
||||
|
||||
if let Ok(state) = state {
|
||||
let key_done = match state.situation {
|
||||
ValueReconstructSituation::Complete => unreachable!(),
|
||||
ValueReconstructSituation::Continue => match value {
|
||||
Value::Image(img) => {
|
||||
state.img = Some((lsn, img));
|
||||
true
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let reached_cache =
|
||||
state.get_cached_lsn().map(|clsn| clsn + 1) == Some(lsn);
|
||||
let will_init = rec.will_init();
|
||||
state.records.push((lsn, rec));
|
||||
will_init || reached_cache
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
if key_done && state.situation == ValueReconstructSituation::Continue {
|
||||
state.situation = ValueReconstructSituation::Complete;
|
||||
self.keys_done.add_key(*key);
|
||||
}
|
||||
|
||||
state.situation
|
||||
} else {
|
||||
ValueReconstructSituation::Complete
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the Lsn at which this key is cached if one exists.
|
||||
/// The read path should go no further than this Lsn for the given key.
|
||||
pub(crate) fn get_cached_lsn(&self, key: &Key) -> Option<Lsn> {
|
||||
self.keys
|
||||
.get(key)
|
||||
.and_then(|k| k.as_ref().ok())
|
||||
.and_then(|state| state.get_cached_lsn())
|
||||
}
|
||||
|
||||
/// Returns the key space describing the keys that have
|
||||
/// been marked as completed since the last call to this function.
|
||||
pub(crate) fn consume_done_keys(&mut self) -> KeySpace {
|
||||
self.keys_done.consume_keyspace()
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ValuesReconstructState {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
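Two rules are encoded in `update_key` and the `From<VectoredValueReconstructState>` impl above: record collection for a key can stop as soon as a will_init record (or the cached image LSN) is reached, and walredo wants the records ordered by descending LSN. A toy model of that accumulation, using an invented `Record` type instead of `NeonWalRecord`:

#[derive(Debug, Clone)]
struct Record {
    lsn: u64,
    will_init: bool,
}

/// Collect records walking backwards through history, stopping at the first record
/// that fully initializes the page (or at the cached image), then sort descending
/// for the redo step.
fn collect_for_redo(
    newest_first: impl Iterator<Item = Record>,
    cached_lsn: Option<u64>,
) -> Vec<Record> {
    let mut records = Vec::new();
    for rec in newest_first {
        if let Some(cached) = cached_lsn {
            if rec.lsn <= cached {
                // The cached image already covers this LSN and everything older.
                break;
            }
        }
        let will_init = rec.will_init;
        records.push(rec);
        if will_init {
            // Nothing older than a full page image is needed.
            break;
        }
    }
    // walredo expects the records newest-first (descending LSN).
    records.sort_by_key(|r| std::cmp::Reverse(r.lsn));
    records
}
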
/// Description of layer to be read - the layer map can turn
|
||||
/// this description into the actual layer.
|
||||
#[derive(PartialEq, Eq, Hash, Debug, Clone)]
|
||||
pub(crate) enum ReadableLayerDesc {
|
||||
Persistent {
|
||||
desc: PersistentLayerDesc,
|
||||
lsn_floor: Lsn,
|
||||
lsn_ceil: Lsn,
|
||||
},
|
||||
InMemory {
|
||||
handle: InMemoryLayerHandle,
|
||||
lsn_ceil: Lsn,
|
||||
},
|
||||
}
|
||||
|
||||
/// Wraper for 'ReadableLayerDesc' sorted by Lsn
|
||||
#[derive(Debug)]
|
||||
struct ReadableLayerDescOrdered(ReadableLayerDesc);
|
||||
|
||||
/// Data structure which maintains a fringe of layers for the
|
||||
/// read path. The fringe is the set of layers which intersects
|
||||
/// the current keyspace that the search is descending on.
|
||||
/// Each layer tracks the keyspace that intersects it.
|
||||
///
|
||||
/// The fringe must appear sorted by Lsn. Hence, it uses
|
||||
/// a two layer indexing scheme.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct LayerFringe {
|
||||
layers_by_lsn: BinaryHeap<ReadableLayerDescOrdered>,
|
||||
layers: HashMap<ReadableLayerDesc, KeySpace>,
|
||||
}
|
||||
|
||||
impl LayerFringe {
|
||||
pub(crate) fn new() -> Self {
|
||||
LayerFringe {
|
||||
layers_by_lsn: BinaryHeap::new(),
|
||||
layers: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayerDesc, KeySpace)> {
|
||||
let handle = match self.layers_by_lsn.pop() {
|
||||
Some(h) => h,
|
||||
None => return None,
|
||||
};
|
||||
|
||||
let removed = self.layers.remove_entry(&handle.0);
|
||||
match removed {
|
||||
Some((layer, keyspace)) => Some((layer, keyspace)),
|
||||
None => unreachable!("fringe internals are always consistent"),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn update(&mut self, layer: ReadableLayerDesc, keyspace: KeySpace) {
|
||||
let entry = self.layers.entry(layer.clone());
|
||||
match entry {
|
||||
Entry::Occupied(mut entry) => {
|
||||
entry.get_mut().merge(&keyspace);
|
||||
}
|
||||
Entry::Vacant(entry) => {
|
||||
self.layers_by_lsn
|
||||
.push(ReadableLayerDescOrdered(entry.key().clone()));
|
||||
entry.insert(keyspace);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for LayerFringe {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl Ord for ReadableLayerDescOrdered {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
let ord = self.0.get_lsn_ceil().cmp(&other.0.get_lsn_ceil());
|
||||
if ord == std::cmp::Ordering::Equal {
|
||||
self.0
|
||||
.get_lsn_floor()
|
||||
.cmp(&other.0.get_lsn_floor())
|
||||
.reverse()
|
||||
} else {
|
||||
ord
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialOrd for ReadableLayerDescOrdered {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq for ReadableLayerDescOrdered {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.0.get_lsn_floor() == other.0.get_lsn_floor()
|
||||
&& self.0.get_lsn_ceil() == other.0.get_lsn_ceil()
|
||||
}
|
||||
}
|
||||
|
||||
impl Eq for ReadableLayerDescOrdered {}
|
||||
|
||||
impl ReadableLayerDesc {
|
||||
pub(crate) fn get_lsn_floor(&self) -> Lsn {
|
||||
match self {
|
||||
ReadableLayerDesc::Persistent { lsn_floor, .. } => *lsn_floor,
|
||||
ReadableLayerDesc::InMemory { handle, .. } => handle.get_lsn_floor(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_lsn_ceil(&self) -> Lsn {
|
||||
match self {
|
||||
ReadableLayerDesc::Persistent { lsn_ceil, .. } => *lsn_ceil,
|
||||
ReadableLayerDesc::InMemory { lsn_ceil, .. } => *lsn_ceil,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn get_values_reconstruct_data(
|
||||
&self,
|
||||
layer_manager: &LayerManager,
|
||||
keyspace: KeySpace,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), GetVectoredError> {
|
||||
match self {
|
||||
ReadableLayerDesc::Persistent { desc, lsn_ceil, .. } => {
|
||||
let layer = layer_manager.get_from_desc(desc);
|
||||
layer
|
||||
.get_values_reconstruct_data(keyspace, *lsn_ceil, reconstruct_state, ctx)
|
||||
.await
|
||||
}
|
||||
ReadableLayerDesc::InMemory { handle, lsn_ceil } => {
|
||||
let layer = layer_manager
|
||||
.layer_map()
|
||||
.get_in_memory_layer(handle)
|
||||
.unwrap();
|
||||
|
||||
layer
|
||||
.get_values_reconstruct_data(keyspace, *lsn_ceil, reconstruct_state, ctx)
|
||||
.await
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Return value from [`Layer::get_value_reconstruct_data`]
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub enum ValueReconstructResult {
|
||||
|
||||
@@ -35,16 +35,19 @@ use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
use crate::tenant::Timeline;
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::{PageReconstructError, Timeline};
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
use crate::{walrecord, TEMP_FILE_SUFFIX};
|
||||
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::LayerAccessKind;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::BTreeMap;
|
||||
use std::fs::File;
|
||||
use std::io::SeekFrom;
|
||||
use std::ops::Range;
|
||||
@@ -59,7 +62,10 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use super::{AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer};
|
||||
use super::{
|
||||
AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer, ValueReconstructSituation,
|
||||
ValuesReconstructState,
|
||||
};
|
||||
|
||||
///
|
||||
/// Header stored in the beginning of the file
|
||||
@@ -818,6 +824,133 @@ impl DeltaLayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
// Look up the keys in the provided keyspace and update
|
||||
// the reconstruct state with whatever is found.
|
||||
//
|
||||
// If the key is cached, go no further than the cached Lsn.
|
||||
//
|
||||
// Currently, the index is visited for each range, but this
|
||||
// can be further optimised to visit the index only once.
|
||||
pub(super) async fn get_values_reconstruct_data(
|
||||
&self,
|
||||
keyspace: KeySpace,
|
||||
end_lsn: Lsn,
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), GetVectoredError> {
|
||||
let file = &self.file;
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
self.index_start_blk,
|
||||
self.index_root_blk,
|
||||
file,
|
||||
);
|
||||
|
||||
let mut offsets: BTreeMap<Key, Vec<(Lsn, u64)>> = BTreeMap::new();
|
||||
|
||||
for range in keyspace.ranges.iter() {
|
||||
let mut ignore_key = None;
|
||||
|
||||
// Scan the page versions backwards, starting from the last key in the range.
|
||||
// to collect all the offsets at which need to be read.
|
||||
let end_key = DeltaKey::from_key_lsn(&range.end, Lsn(end_lsn.0 - 1));
|
||||
tree_reader
|
||||
.visit(
|
||||
&end_key.0,
|
||||
VisitDirection::Backwards,
|
||||
|raw_key, value| {
|
||||
let key = Key::from_slice(&raw_key[..KEY_SIZE]);
|
||||
let entry_lsn = DeltaKey::extract_lsn_from_buf(raw_key);
|
||||
|
||||
if entry_lsn >= end_lsn {
|
||||
return true;
|
||||
}
|
||||
|
||||
if key < range.start {
|
||||
return false;
|
||||
}
|
||||
|
||||
if key >= range.end {
|
||||
return true;
|
||||
}
|
||||
|
||||
if Some(key) == ignore_key {
|
||||
return true;
|
||||
}
|
||||
|
||||
if let Some(cached_lsn) = reconstruct_state.get_cached_lsn(&key) {
|
||||
if entry_lsn <= cached_lsn {
|
||||
return key != range.start;
|
||||
}
|
||||
}
|
||||
|
||||
let blob_ref = BlobRef(value);
|
||||
let lsns_at = offsets.entry(key).or_default();
|
||||
lsns_at.push((entry_lsn, blob_ref.pos()));
|
||||
|
||||
if blob_ref.will_init() {
|
||||
if key == range.start {
|
||||
return false;
|
||||
} else {
|
||||
ignore_key = Some(key);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
true
|
||||
},
|
||||
&RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::DeltaLayerBtreeNode)
|
||||
.build(),
|
||||
)
|
||||
.await
|
||||
.map_err(|err| GetVectoredError::Other(anyhow!(err)))?;
|
||||
}
|
||||
|
||||
let ctx = &RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::DeltaLayerValue)
|
||||
.build();
|
||||
|
||||
let cursor = file.block_cursor();
|
||||
let mut buf = Vec::new();
|
||||
for (key, lsns_at) in offsets {
|
||||
for (lsn, block_offset) in lsns_at {
|
||||
let res = cursor.read_blob_into_buf(block_offset, &mut buf, ctx).await;
|
||||
|
||||
if let Err(e) = res {
|
||||
reconstruct_state.on_key_error(
|
||||
key,
|
||||
PageReconstructError::from(anyhow!(e).context(format!(
|
||||
"Failed to read blob from virtual file {}",
|
||||
file.file.path
|
||||
))),
|
||||
);
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
let value = Value::des(&buf);
|
||||
if let Err(e) = value {
|
||||
reconstruct_state.on_key_error(
|
||||
key,
|
||||
PageReconstructError::from(anyhow!(e).context(format!(
|
||||
"Failed to deserialize file blob from virtual file {}",
|
||||
file.file.path
|
||||
))),
|
||||
);
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
let key_situation = reconstruct_state.update_key(&key, lsn, value.unwrap());
|
||||
if key_situation == ValueReconstructSituation::Complete {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(super) async fn load_keys<'a>(
|
||||
&'a self,
|
||||
ctx: &RequestContext,
Some files were not shown because too many files have changed in this diff.