mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-02 21:10:38 +00:00
Compare commits
143 Commits
revert-865
...
hack/fast-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2bf9a350ef | ||
|
|
2e7e5f4f3a | ||
|
|
9f3d5826be | ||
|
|
4f39501641 | ||
|
|
67d1606f82 | ||
|
|
9351ba26ff | ||
|
|
8df388330b | ||
|
|
357c07dd35 | ||
|
|
7b90ec6e19 | ||
|
|
85f4e966e8 | ||
|
|
4d27048d6d | ||
|
|
3a452d8f56 | ||
|
|
b81dbc887b | ||
|
|
80fed9cfb1 | ||
|
|
189386b22f | ||
|
|
38dfecb026 | ||
|
|
be28bd8312 | ||
|
|
9759d6ec72 | ||
|
|
0c64d55a6b | ||
|
|
578da1dc02 | ||
|
|
842ac7cfda | ||
|
|
71340e3c00 | ||
|
|
e6e0b27dc3 | ||
|
|
04ec8bd7de | ||
|
|
6563be1a4c | ||
|
|
fe975acc71 | ||
|
|
abed35589b | ||
|
|
3fe8b69968 | ||
|
|
0c856443c4 | ||
|
|
0fc584ef9a | ||
|
|
daedec65ac | ||
|
|
94c393bf8f | ||
|
|
28616b0907 | ||
|
|
241724f3fc | ||
|
|
98d128d993 | ||
|
|
b37da32c6f | ||
|
|
3b317cae07 | ||
|
|
bf0531d107 | ||
|
|
15e90cc427 | ||
|
|
9746b6ea31 | ||
|
|
516ac0591e | ||
|
|
3ec785f30d | ||
|
|
05caaab850 | ||
|
|
cacb1ae333 | ||
|
|
df971f995c | ||
|
|
e58e045ebb | ||
|
|
20f82f9169 | ||
|
|
72aa6b02da | ||
|
|
022fad65eb | ||
|
|
8eaa8ad358 | ||
|
|
653a6532a2 | ||
|
|
18bfc43fa7 | ||
|
|
7ce49fe6e3 | ||
|
|
a8fbc63be2 | ||
|
|
96b5c4d33d | ||
|
|
c7481402a0 | ||
|
|
a644f01b6a | ||
|
|
c2f8fdccd7 | ||
|
|
cfa45ff5ee | ||
|
|
acc075071d | ||
|
|
9627747d35 | ||
|
|
63a0d0d039 | ||
|
|
793b5061ec | ||
|
|
a889a49e06 | ||
|
|
5eb7322d08 | ||
|
|
c0ba18a112 | ||
|
|
992a951b5e | ||
|
|
c5ef779801 | ||
|
|
2d10306f7a | ||
|
|
9b9f90c562 | ||
|
|
52cb33770b | ||
|
|
12850dd5e9 | ||
|
|
5d527133a3 | ||
|
|
09362b6363 | ||
|
|
7820c572e7 | ||
|
|
bf03713fa1 | ||
|
|
0f65684263 | ||
|
|
97241776aa | ||
|
|
2dd53e7ae0 | ||
|
|
d6eede515a | ||
|
|
d48229f50f | ||
|
|
cdfdcd3e5d | ||
|
|
06795c6b9a | ||
|
|
701cb61b57 | ||
|
|
0aa1450936 | ||
|
|
b65a95f12e | ||
|
|
c1cb7a0fa0 | ||
|
|
f4cac1f30f | ||
|
|
612b643315 | ||
|
|
bcc68a7866 | ||
|
|
73286e6b9f | ||
|
|
bc8cfe1b55 | ||
|
|
6a74bcadec | ||
|
|
e62cd9e121 | ||
|
|
e80ab8fd6a | ||
|
|
d8ca495eae | ||
|
|
dbdb8a1187 | ||
|
|
f7ab3ffcb7 | ||
|
|
2f8d548a12 | ||
|
|
66db381dc9 | ||
|
|
6744ed19d8 | ||
|
|
ae63ac7488 | ||
|
|
6eb638f4b3 | ||
|
|
7a485b599b | ||
|
|
b1c457898b | ||
|
|
1a9d559be8 | ||
|
|
0e6c0d47a5 | ||
|
|
d645645fab | ||
|
|
7c74112b2a | ||
|
|
a968554a8c | ||
|
|
07b7c63975 | ||
|
|
04752dfa75 | ||
|
|
99c19cad24 | ||
|
|
b83d722369 | ||
|
|
d919770c55 | ||
|
|
f4b3c317f3 | ||
|
|
428b105dde | ||
|
|
75175f3628 | ||
|
|
3b8016488e | ||
|
|
477246f42c | ||
|
|
21b684718e | ||
|
|
6d8572ded6 | ||
|
|
c8b9116a97 | ||
|
|
beefc7a810 | ||
|
|
fa0750a37e | ||
|
|
0170611a97 | ||
|
|
1c96957e85 | ||
|
|
02a28c01ca | ||
|
|
c96593b473 | ||
|
|
ef57e73fbf | ||
|
|
4c5a0fdc75 | ||
|
|
4b26783c94 | ||
|
|
6949b45e17 | ||
|
|
3b8ca477ab | ||
|
|
eb7241c798 | ||
|
|
f246aa3ca7 | ||
|
|
188bde7f07 | ||
|
|
7131ac4730 | ||
|
|
2be69af6c3 | ||
|
|
c6b6b7700a | ||
|
|
e2d89f7991 | ||
|
|
25e7d321f4 | ||
|
|
3f91ea28d9 |
@@ -23,10 +23,30 @@ platforms = [
|
||||
]
|
||||
|
||||
[final-excludes]
|
||||
# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
|
||||
# it is built primarly in separate repo neondatabase/autoscaling and thus is excluded
|
||||
# from depending on workspace-hack because most of the dependencies are not used.
|
||||
workspace-members = ["vm_monitor"]
|
||||
workspace-members = [
|
||||
# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
|
||||
# it is built primarly in separate repo neondatabase/autoscaling and thus is excluded
|
||||
# from depending on workspace-hack because most of the dependencies are not used.
|
||||
"vm_monitor",
|
||||
# All of these exist in libs and are not usually built independently.
|
||||
# Putting workspace hack there adds a bottleneck for cargo builds.
|
||||
"compute_api",
|
||||
"consumption_metrics",
|
||||
"desim",
|
||||
"metrics",
|
||||
"pageserver_api",
|
||||
"postgres_backend",
|
||||
"postgres_connection",
|
||||
"postgres_ffi",
|
||||
"pq_proto",
|
||||
"remote_storage",
|
||||
"safekeeper_api",
|
||||
"tenant_size_model",
|
||||
"tracing-utils",
|
||||
"utils",
|
||||
"wal_craft",
|
||||
"walproposer",
|
||||
]
|
||||
|
||||
# Write out exact versions rather than a semver range. (Defaults to false.)
|
||||
# exact-versions = true
|
||||
|
||||
6
.github/ISSUE_TEMPLATE/config.yml
vendored
Normal file
6
.github/ISSUE_TEMPLATE/config.yml
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
|
||||
blank_issues_enabled: true
|
||||
contact_links:
|
||||
- name: Feature request
|
||||
url: https://console.neon.tech/app/projects?modal=feedback
|
||||
about: For feature requests in the Neon product, please submit via the feedback form on `https://console.neon.tech`
|
||||
14
.github/actions/run-python-test-set/action.yml
vendored
14
.github/actions/run-python-test-set/action.yml
vendored
@@ -43,7 +43,7 @@ inputs:
|
||||
pg_version:
|
||||
description: 'Postgres version to use for tests'
|
||||
required: false
|
||||
default: 'v14'
|
||||
default: 'v16'
|
||||
benchmark_durations:
|
||||
description: 'benchmark durations JSON'
|
||||
required: false
|
||||
@@ -71,7 +71,7 @@ runs:
|
||||
if: inputs.build_type != 'remote'
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }}
|
||||
name: compatibility-snapshot-${{ runner.arch }}-${{ inputs.build_type }}-pg${{ inputs.pg_version }}
|
||||
path: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }}
|
||||
prefix: latest
|
||||
# The lack of compatibility snapshot (for example, for the new Postgres version)
|
||||
@@ -169,10 +169,8 @@ runs:
|
||||
EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS"
|
||||
fi
|
||||
|
||||
if [[ "${{ inputs.build_type }}" == "debug" ]]; then
|
||||
if [[ $BUILD_TYPE == "debug" && $RUNNER_ARCH == 'X64' ]]; then
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
|
||||
elif [[ "${{ inputs.build_type }}" == "release" ]]; then
|
||||
cov_prefix=()
|
||||
else
|
||||
cov_prefix=()
|
||||
fi
|
||||
@@ -213,13 +211,13 @@ runs:
|
||||
fi
|
||||
|
||||
- name: Upload compatibility snapshot
|
||||
if: github.ref_name == 'release'
|
||||
# Note, that we use `github.base_ref` which is a target branch for a PR
|
||||
if: github.event_name == 'pull_request' && github.base_ref == 'release'
|
||||
uses: ./.github/actions/upload
|
||||
with:
|
||||
name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }}-${{ github.run_id }}
|
||||
name: compatibility-snapshot-${{ runner.arch }}-${{ inputs.build_type }}-pg${{ inputs.pg_version }}
|
||||
# Directory is created by test_compatibility.py::test_create_snapshot, keep the path in sync with the test
|
||||
path: /tmp/test_output/compatibility_snapshot_pg${{ inputs.pg_version }}/
|
||||
prefix: latest
|
||||
|
||||
- name: Upload test results
|
||||
if: ${{ !cancelled() }}
|
||||
|
||||
@@ -48,6 +48,8 @@ jobs:
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
|
||||
23
.github/workflows/_build-and-test-locally.yml
vendored
23
.github/workflows/_build-and-test-locally.yml
vendored
@@ -94,11 +94,16 @@ jobs:
|
||||
# We run tests with addtional features, that are turned off by default (e.g. in release builds), see
|
||||
# corresponding Cargo.toml files for their descriptions.
|
||||
- name: Set env variables
|
||||
env:
|
||||
ARCH: ${{ inputs.arch }}
|
||||
run: |
|
||||
CARGO_FEATURES="--features testing"
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then
|
||||
cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
|
||||
CARGO_FLAGS="--locked"
|
||||
elif [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix=""
|
||||
CARGO_FLAGS="--locked"
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=""
|
||||
CARGO_FLAGS="--locked --release"
|
||||
@@ -158,6 +163,8 @@ jobs:
|
||||
# Do install *before* running rust tests because they might recompile the
|
||||
# binaries with different features/flags.
|
||||
- name: Install rust binaries
|
||||
env:
|
||||
ARCH: ${{ inputs.arch }}
|
||||
run: |
|
||||
# Install target binaries
|
||||
mkdir -p /tmp/neon/bin/
|
||||
@@ -172,7 +179,7 @@ jobs:
|
||||
done
|
||||
|
||||
# Install test executables and write list of all binaries (for code coverage)
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then
|
||||
# Keep bloated coverage data files away from the rest of the artifact
|
||||
mkdir -p /tmp/coverage/
|
||||
|
||||
@@ -209,8 +216,14 @@ jobs:
|
||||
#nextest does not yet support running doctests
|
||||
${cov_prefix} cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
|
||||
|
||||
# run all non-pageserver tests
|
||||
${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E '!package(pageserver)'
|
||||
|
||||
# run pageserver tests with different settings
|
||||
for io_engine in std-fs tokio-epoll-uring ; do
|
||||
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
|
||||
for io_buffer_alignment in 0 1 512 ; do
|
||||
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT=$io_buffer_alignment ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)'
|
||||
done
|
||||
done
|
||||
|
||||
# Run separate tests for real S3
|
||||
@@ -243,8 +256,8 @@ jobs:
|
||||
uses: ./.github/actions/save-coverage-data
|
||||
|
||||
regress-tests:
|
||||
# Run test on x64 only
|
||||
if: inputs.arch == 'x64'
|
||||
# Don't run regression tests on debug arm64 builds
|
||||
if: inputs.build-type != 'debug' || inputs.arch != 'arm64'
|
||||
needs: [ build-neon ]
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
||||
container:
|
||||
|
||||
110
.github/workflows/build_and_test.yml
vendored
110
.github/workflows/build_and_test.yml
vendored
@@ -198,7 +198,7 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
arch: [ x64 ]
|
||||
arch: [ x64, arm64 ]
|
||||
# Do not build or run tests in debug for release branches
|
||||
build-type: ${{ fromJson((startsWith(github.ref_name, 'release') && github.event_name == 'push') && '["release"]' || '["debug", "release"]') }}
|
||||
include:
|
||||
@@ -280,6 +280,7 @@ jobs:
|
||||
save_perf_report: ${{ github.ref_name == 'main' }}
|
||||
extra_params: --splits 5 --group ${{ matrix.pytest_split_group }}
|
||||
benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }}
|
||||
pg_version: v16
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
@@ -985,10 +986,10 @@ jobs:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
run: |
|
||||
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
|
||||
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
|
||||
gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
|
||||
gh workflow --repo neondatabase/azure run deploy.yml -f dockerTag=${{needs.tag.outputs.build-tag}}
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
|
||||
gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \
|
||||
-f deployPgSniRouter=false \
|
||||
-f deployProxy=false \
|
||||
-f deployStorage=true \
|
||||
@@ -998,14 +999,14 @@ jobs:
|
||||
-f dockerTag=${{needs.tag.outputs.build-tag}} \
|
||||
-f deployPreprodRegion=true
|
||||
|
||||
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
|
||||
gh workflow --repo neondatabase/infra run deploy-prod.yml --ref main \
|
||||
-f deployStorage=true \
|
||||
-f deployStorageBroker=true \
|
||||
-f deployStorageController=true \
|
||||
-f branch=main \
|
||||
-f dockerTag=${{needs.tag.outputs.build-tag}}
|
||||
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
|
||||
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
|
||||
gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \
|
||||
-f deployPgSniRouter=true \
|
||||
-f deployProxy=true \
|
||||
-f deployStorage=false \
|
||||
@@ -1015,7 +1016,7 @@ jobs:
|
||||
-f dockerTag=${{needs.tag.outputs.build-tag}} \
|
||||
-f deployPreprodRegion=true
|
||||
|
||||
gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \
|
||||
gh workflow --repo neondatabase/infra run deploy-proxy-prod.yml --ref main \
|
||||
-f deployPgSniRouter=true \
|
||||
-f deployProxy=true \
|
||||
-f branch=main \
|
||||
@@ -1054,43 +1055,88 @@ jobs:
|
||||
generate_release_notes: true,
|
||||
})
|
||||
|
||||
# The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory
|
||||
promote-compatibility-data:
|
||||
needs: [ check-permissions, promote-images, tag, build-and-test-locally ]
|
||||
needs: [ deploy ]
|
||||
if: github.ref_name == 'release'
|
||||
|
||||
runs-on: [ self-hosted, small ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
|
||||
options: --init
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- name: Promote compatibility snapshot for the release
|
||||
- name: Fetch GITHUB_RUN_ID and COMMIT_SHA for the last merged release PR
|
||||
id: fetch-last-release-pr-info
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: |
|
||||
branch_name_and_pr_number=$(gh pr list \
|
||||
--repo "${GITHUB_REPOSITORY}" \
|
||||
--base release \
|
||||
--state merged \
|
||||
--limit 10 \
|
||||
--json mergeCommit,headRefName,number \
|
||||
--jq ".[] | select(.mergeCommit.oid==\"${GITHUB_SHA}\") | { branch_name: .headRefName, pr_number: .number }")
|
||||
branch_name=$(echo "${branch_name_and_pr_number}" | jq -r '.branch_name')
|
||||
pr_number=$(echo "${branch_name_and_pr_number}" | jq -r '.pr_number')
|
||||
|
||||
run_id=$(gh run list \
|
||||
--repo "${GITHUB_REPOSITORY}" \
|
||||
--workflow build_and_test.yml \
|
||||
--branch "${branch_name}" \
|
||||
--json databaseId \
|
||||
--limit 1 \
|
||||
--jq '.[].databaseId')
|
||||
|
||||
last_commit_sha=$(gh pr view "${pr_number}" \
|
||||
--repo "${GITHUB_REPOSITORY}" \
|
||||
--json commits \
|
||||
--jq '.commits[-1].oid')
|
||||
|
||||
echo "run-id=${run_id}" | tee -a ${GITHUB_OUTPUT}
|
||||
echo "commit-sha=${last_commit_sha}" | tee -a ${GITHUB_OUTPUT}
|
||||
|
||||
- name: Promote compatibility snapshot and Neon artifact
|
||||
env:
|
||||
BUCKET: neon-github-public-dev
|
||||
PREFIX: artifacts/latest
|
||||
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
AWS_REGION: eu-central-1
|
||||
COMMIT_SHA: ${{ steps.fetch-last-release-pr-info.outputs.commit-sha }}
|
||||
RUN_ID: ${{ steps.fetch-last-release-pr-info.outputs.run-id }}
|
||||
run: |
|
||||
# Update compatibility snapshot for the release
|
||||
for pg_version in v14 v15 v16; do
|
||||
for build_type in debug release; do
|
||||
OLD_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}-${GITHUB_RUN_ID}.tar.zst
|
||||
NEW_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}.tar.zst
|
||||
old_prefix="artifacts/${COMMIT_SHA}/${RUN_ID}"
|
||||
new_prefix="artifacts/latest"
|
||||
|
||||
time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME}
|
||||
files_to_promote=()
|
||||
files_on_s3=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${old_prefix} | jq -r '.Contents[]?.Key' || true)
|
||||
|
||||
for arch in X64 ARM64; do
|
||||
for build_type in debug release; do
|
||||
neon_artifact_filename="neon-Linux-${arch}-${build_type}-artifact.tar.zst"
|
||||
s3_key=$(echo "${files_on_s3}" | grep ${neon_artifact_filename} | sort --version-sort | tail -1 || true)
|
||||
if [ -z "${s3_key}" ]; then
|
||||
echo >&2 "Neither s3://${BUCKET}/${old_prefix}/${neon_artifact_filename} nor its version from previous attempts exist"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
files_to_promote+=("s3://${BUCKET}/${s3_key}")
|
||||
|
||||
for pg_version in v14 v15 v16; do
|
||||
# We run less tests for debug builds, so we don't need to promote them
|
||||
if [ "${build_type}" == "debug" ] && { [ "${arch}" == "ARM64" ] || [ "${pg_version}" != "v16" ] ; }; then
|
||||
continue
|
||||
fi
|
||||
|
||||
compatibility_data_filename="compatibility-snapshot-${arch}-${build_type}-pg${pg_version}.tar.zst"
|
||||
s3_key=$(echo "${files_on_s3}" | grep ${compatibility_data_filename} | sort --version-sort | tail -1 || true)
|
||||
if [ -z "${s3_key}" ]; then
|
||||
echo >&2 "Neither s3://${BUCKET}/${old_prefix}/${compatibility_data_filename} nor its version from previous attempts exist"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
files_to_promote+=("s3://${BUCKET}/${s3_key}")
|
||||
done
|
||||
done
|
||||
done
|
||||
|
||||
# Update Neon artifact for the release (reuse already uploaded artifact)
|
||||
for build_type in debug release; do
|
||||
OLD_PREFIX=artifacts/${COMMIT_SHA}/${GITHUB_RUN_ID}
|
||||
FILENAME=neon-${{ runner.os }}-${{ runner.arch }}-${build_type}-artifact.tar.zst
|
||||
|
||||
S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
|
||||
if [ -z "${S3_KEY}" ]; then
|
||||
echo >&2 "Neither s3://${BUCKET}/${OLD_PREFIX}/${FILENAME} nor its version from previous attempts exist"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/${PREFIX}/${FILENAME}
|
||||
for f in "${files_to_promote[@]}"; do
|
||||
time aws s3 cp --only-show-errors ${f} s3://${BUCKET}/${new_prefix}/
|
||||
done
|
||||
|
||||
pin-build-tools-image:
|
||||
|
||||
66
Cargo.lock
generated
66
Cargo.lock
generated
@@ -936,6 +936,12 @@ dependencies = [
|
||||
"which",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bit_field"
|
||||
version = "0.10.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dc827186963e592360843fb5ba4b973e145841266c1357f7180c43526f2e5b61"
|
||||
|
||||
[[package]]
|
||||
name = "bitflags"
|
||||
version = "1.3.2"
|
||||
@@ -1208,7 +1214,6 @@ dependencies = [
|
||||
"serde_json",
|
||||
"serde_with",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1321,7 +1326,6 @@ dependencies = [
|
||||
"serde",
|
||||
"serde_with",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1329,7 +1333,6 @@ name = "control_plane"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
"camino",
|
||||
"clap",
|
||||
"comfy-table",
|
||||
@@ -1670,14 +1673,13 @@ dependencies = [
|
||||
"smallvec",
|
||||
"tracing",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "diesel"
|
||||
version = "2.2.1"
|
||||
version = "2.2.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "62d6dcd069e7b5fe49a302411f759d4cf1cf2c27fe798ef46fb8baefc053dd2b"
|
||||
checksum = "65e13bab2796f412722112327f3e575601a3e9cdcbe426f0d30dbf43f3f5dc71"
|
||||
dependencies = [
|
||||
"bitflags 2.4.1",
|
||||
"byteorder",
|
||||
@@ -2947,17 +2949,6 @@ version = "1.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
|
||||
|
||||
[[package]]
|
||||
name = "leaky-bucket"
|
||||
version = "1.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8eb491abd89e9794d50f93c8db610a29509123e3fbbc9c8c67a528e9391cd853"
|
||||
dependencies = [
|
||||
"parking_lot 0.12.1",
|
||||
"tokio",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.150"
|
||||
@@ -3147,7 +3138,6 @@ dependencies = [
|
||||
"rand 0.8.5",
|
||||
"rand_distr",
|
||||
"twox-hash",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3687,6 +3677,7 @@ dependencies = [
|
||||
"async-compression",
|
||||
"async-stream",
|
||||
"async-trait",
|
||||
"bit_field",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"camino",
|
||||
@@ -3711,7 +3702,6 @@ dependencies = [
|
||||
"humantime-serde",
|
||||
"hyper 0.14.26",
|
||||
"itertools 0.10.5",
|
||||
"leaky-bucket",
|
||||
"md5",
|
||||
"metrics",
|
||||
"nix 0.27.1",
|
||||
@@ -3736,6 +3726,7 @@ dependencies = [
|
||||
"reqwest 0.12.4",
|
||||
"rpds",
|
||||
"scopeguard",
|
||||
"send-future",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_path_to_error",
|
||||
@@ -3791,7 +3782,6 @@ dependencies = [
|
||||
"strum_macros",
|
||||
"thiserror",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3799,7 +3789,6 @@ name = "pageserver_client"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
"bytes",
|
||||
"futures",
|
||||
"pageserver_api",
|
||||
@@ -4134,7 +4123,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres"
|
||||
version = "0.19.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -4147,7 +4136,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-protocol"
|
||||
version = "0.6.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
dependencies = [
|
||||
"base64 0.20.0",
|
||||
"byteorder",
|
||||
@@ -4166,7 +4155,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-types"
|
||||
version = "0.2.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -4193,7 +4182,6 @@ dependencies = [
|
||||
"tokio-rustls 0.25.0",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4206,7 +4194,6 @@ dependencies = [
|
||||
"postgres",
|
||||
"tokio-postgres",
|
||||
"url",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4229,7 +4216,6 @@ dependencies = [
|
||||
"serde",
|
||||
"thiserror",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4267,7 +4253,6 @@ dependencies = [
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4832,7 +4817,6 @@ dependencies = [
|
||||
"toml_edit 0.19.10",
|
||||
"tracing",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5357,7 +5341,6 @@ dependencies = [
|
||||
"serde",
|
||||
"serde_with",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5466,6 +5449,12 @@ version = "1.0.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed"
|
||||
|
||||
[[package]]
|
||||
name = "send-future"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "224e328af6e080cddbab3c770b1cf50f0351ba0577091ef2410c3951d835ff87"
|
||||
|
||||
[[package]]
|
||||
name = "sentry"
|
||||
version = "0.32.3"
|
||||
@@ -5601,11 +5590,12 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "serde_json"
|
||||
version = "1.0.96"
|
||||
version = "1.0.125"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "057d394a50403bcac12672b2b18fb387ab6d289d957dab67dd201875391e52f1"
|
||||
checksum = "83c8e735a073ccf5be70aa8066aa984eaf2fa000db6c8d0100ae605b366d31ed"
|
||||
dependencies = [
|
||||
"itoa",
|
||||
"memchr",
|
||||
"ryu",
|
||||
"serde",
|
||||
]
|
||||
@@ -5960,7 +5950,6 @@ name = "storage_controller_client"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
"bytes",
|
||||
"futures",
|
||||
"pageserver_api",
|
||||
@@ -6193,7 +6182,6 @@ dependencies = [
|
||||
"anyhow",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6422,7 +6410,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "tokio-postgres"
|
||||
version = "0.7.7"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
@@ -6794,7 +6782,6 @@ dependencies = [
|
||||
"tracing",
|
||||
"tracing-opentelemetry",
|
||||
"tracing-subscriber",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6965,7 +6952,6 @@ dependencies = [
|
||||
"anyhow",
|
||||
"arc-swap",
|
||||
"async-compression",
|
||||
"async-trait",
|
||||
"bincode",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
@@ -6981,7 +6967,6 @@ dependencies = [
|
||||
"humantime",
|
||||
"hyper 0.14.26",
|
||||
"jsonwebtoken",
|
||||
"leaky-bucket",
|
||||
"metrics",
|
||||
"nix 0.27.1",
|
||||
"once_cell",
|
||||
@@ -7012,7 +6997,6 @@ dependencies = [
|
||||
"url",
|
||||
"uuid",
|
||||
"walkdir",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -7091,7 +7075,6 @@ dependencies = [
|
||||
"postgres_ffi",
|
||||
"regex",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -7112,7 +7095,6 @@ dependencies = [
|
||||
"bindgen",
|
||||
"postgres_ffi",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -7669,8 +7651,6 @@ dependencies = [
|
||||
"tokio",
|
||||
"tokio-rustls 0.24.0",
|
||||
"tokio-util",
|
||||
"toml_datetime",
|
||||
"toml_edit 0.19.10",
|
||||
"tonic",
|
||||
"tower",
|
||||
"tracing",
|
||||
|
||||
@@ -65,6 +65,7 @@ axum = { version = "0.6.20", features = ["ws"] }
|
||||
base64 = "0.13.0"
|
||||
bincode = "1.3"
|
||||
bindgen = "0.65"
|
||||
bit_field = "0.10.2"
|
||||
bstr = "1.0"
|
||||
byteorder = "1.4"
|
||||
bytes = "1.0"
|
||||
@@ -107,13 +108,12 @@ ipnet = "2.9.0"
|
||||
itertools = "0.10"
|
||||
jsonwebtoken = "9"
|
||||
lasso = "0.7"
|
||||
leaky-bucket = "1.0.1"
|
||||
libc = "0.2"
|
||||
md5 = "0.7.0"
|
||||
measured = { version = "0.0.22", features=["lasso"] }
|
||||
measured-process = { version = "0.0.22" }
|
||||
memoffset = "0.8"
|
||||
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
|
||||
nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] }
|
||||
notify = "6.0.0"
|
||||
num_cpus = "1.15"
|
||||
num-traits = "0.2.15"
|
||||
@@ -145,6 +145,7 @@ rustls-split = "0.3"
|
||||
scopeguard = "1.1"
|
||||
sysinfo = "0.29.2"
|
||||
sd-notify = "0.4.1"
|
||||
send-future = "0.1.0"
|
||||
sentry = { version = "0.32", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
|
||||
@@ -942,7 +942,7 @@ COPY --from=hll-pg-build /hll.tar.gz /ext-src
|
||||
COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src
|
||||
#COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src
|
||||
COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src
|
||||
COPY patches/pg_hintplan.patch /ext-src
|
||||
COPY patches/pg_hint_plan.patch /ext-src
|
||||
COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
|
||||
COPY patches/pg_cron.patch /ext-src
|
||||
#COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
|
||||
@@ -964,7 +964,7 @@ RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
|
||||
RUN cd /ext-src/rum-src && patch -p1 <../rum.patch
|
||||
# cmake is required for the h3 test
|
||||
RUN apt-get update && apt-get install -y cmake
|
||||
RUN patch -p1 < /ext-src/pg_hintplan.patch
|
||||
RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan.patch
|
||||
COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
|
||||
RUN patch -p1 </ext-src/pg_anon.patch
|
||||
RUN patch -p1 </ext-src/pg_cron.patch
|
||||
|
||||
@@ -126,7 +126,7 @@ make -j`sysctl -n hw.logicalcpu` -s
|
||||
To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `pg_install/bin` and `pg_install/lib`, respectively.
|
||||
|
||||
To run the integration tests or Python scripts (not required to use the code), install
|
||||
Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.3](https://python-poetry.org/)) in the project directory.
|
||||
Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.8](https://python-poetry.org/)) in the project directory.
|
||||
|
||||
|
||||
#### Running neon database
|
||||
@@ -262,7 +262,7 @@ By default, this runs both debug and release modes, and all supported postgres v
|
||||
testing locally, it is convenient to run just one set of permutations, like this:
|
||||
|
||||
```sh
|
||||
DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest
|
||||
DEFAULT_PG_VERSION=16 BUILD_TYPE=release ./scripts/pytest
|
||||
```
|
||||
|
||||
## Flamegraphs
|
||||
|
||||
@@ -44,6 +44,7 @@ use std::{thread, time::Duration};
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::Utc;
|
||||
use clap::Arg;
|
||||
use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static;
|
||||
use signal_hook::consts::{SIGQUIT, SIGTERM};
|
||||
use signal_hook::{consts::SIGINT, iterator::Signals};
|
||||
use tracing::{error, info, warn};
|
||||
@@ -366,6 +367,8 @@ fn wait_spec(
|
||||
state.start_time = now;
|
||||
}
|
||||
|
||||
launch_lsn_lease_bg_task_for_static(&compute);
|
||||
|
||||
Ok(WaitSpecResult {
|
||||
compute,
|
||||
http_port,
|
||||
|
||||
@@ -11,6 +11,7 @@ pub mod logger;
|
||||
pub mod catalog;
|
||||
pub mod compute;
|
||||
pub mod extension_server;
|
||||
pub mod lsn_lease;
|
||||
mod migration;
|
||||
pub mod monitor;
|
||||
pub mod params;
|
||||
|
||||
186
compute_tools/src/lsn_lease.rs
Normal file
186
compute_tools/src/lsn_lease.rs
Normal file
@@ -0,0 +1,186 @@
|
||||
use anyhow::bail;
|
||||
use anyhow::Result;
|
||||
use postgres::{NoTls, SimpleQueryMessage};
|
||||
use std::time::SystemTime;
|
||||
use std::{str::FromStr, sync::Arc, thread, time::Duration};
|
||||
use utils::id::TenantId;
|
||||
use utils::id::TimelineId;
|
||||
|
||||
use compute_api::spec::ComputeMode;
|
||||
use tracing::{info, warn};
|
||||
use utils::{
|
||||
lsn::Lsn,
|
||||
shard::{ShardCount, ShardNumber, TenantShardId},
|
||||
};
|
||||
|
||||
use crate::compute::ComputeNode;
|
||||
|
||||
/// Spawns a background thread to periodically renew LSN leases for static compute.
|
||||
/// Do nothing if the compute is not in static mode.
|
||||
pub fn launch_lsn_lease_bg_task_for_static(compute: &Arc<ComputeNode>) {
|
||||
let (tenant_id, timeline_id, lsn) = {
|
||||
let state = compute.state.lock().unwrap();
|
||||
let spec = state.pspec.as_ref().expect("Spec must be set");
|
||||
match spec.spec.mode {
|
||||
ComputeMode::Static(lsn) => (spec.tenant_id, spec.timeline_id, lsn),
|
||||
_ => return,
|
||||
}
|
||||
};
|
||||
let compute = compute.clone();
|
||||
|
||||
let span = tracing::info_span!("lsn_lease_bg_task", %tenant_id, %timeline_id, %lsn);
|
||||
thread::spawn(move || {
|
||||
let _entered = span.entered();
|
||||
if let Err(e) = lsn_lease_bg_task(compute, tenant_id, timeline_id, lsn) {
|
||||
// TODO: might need stronger error feedback than logging an warning.
|
||||
warn!("Exited with error: {e}");
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/// Renews lsn lease periodically so static compute are not affected by GC.
|
||||
fn lsn_lease_bg_task(
|
||||
compute: Arc<ComputeNode>,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Result<()> {
|
||||
loop {
|
||||
let valid_until = acquire_lsn_lease_with_retry(&compute, tenant_id, timeline_id, lsn)?;
|
||||
let valid_duration = valid_until
|
||||
.duration_since(SystemTime::now())
|
||||
.unwrap_or(Duration::ZERO);
|
||||
|
||||
// Sleep for 60 seconds less than the valid duration but no more than half of the valid duration.
|
||||
let sleep_duration = valid_duration
|
||||
.saturating_sub(Duration::from_secs(60))
|
||||
.max(valid_duration / 2);
|
||||
|
||||
info!(
|
||||
"Succeeded, sleeping for {} seconds",
|
||||
sleep_duration.as_secs()
|
||||
);
|
||||
thread::sleep(sleep_duration);
|
||||
}
|
||||
}
|
||||
|
||||
/// Acquires lsn lease in a retry loop. Returns the expiration time if a lease is granted.
|
||||
/// Returns an error if a lease is explicitly not granted. Otherwise, we keep sending requests.
|
||||
fn acquire_lsn_lease_with_retry(
|
||||
compute: &Arc<ComputeNode>,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Result<SystemTime> {
|
||||
let mut attempts = 0usize;
|
||||
let mut retry_period_ms: f64 = 500.0;
|
||||
const MAX_RETRY_PERIOD_MS: f64 = 60.0 * 1000.0;
|
||||
|
||||
loop {
|
||||
// Note: List of pageservers is dynamic, need to re-read configs before each attempt.
|
||||
let configs = {
|
||||
let state = compute.state.lock().unwrap();
|
||||
|
||||
let spec = state.pspec.as_ref().expect("spec must be set");
|
||||
|
||||
let conn_strings = spec.pageserver_connstr.split(',');
|
||||
|
||||
conn_strings
|
||||
.map(|connstr| {
|
||||
let mut config = postgres::Config::from_str(connstr).expect("Invalid connstr");
|
||||
if let Some(storage_auth_token) = &spec.storage_auth_token {
|
||||
info!("Got storage auth token from spec file");
|
||||
config.password(storage_auth_token.clone());
|
||||
} else {
|
||||
info!("Storage auth token not set");
|
||||
}
|
||||
config
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
let result = try_acquire_lsn_lease(tenant_id, timeline_id, lsn, &configs);
|
||||
match result {
|
||||
Ok(Some(res)) => {
|
||||
return Ok(res);
|
||||
}
|
||||
Ok(None) => {
|
||||
bail!("Permanent error: lease could not be obtained, LSN is behind the GC cutoff");
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("Failed to acquire lsn lease: {e} (attempt {attempts}");
|
||||
|
||||
thread::sleep(Duration::from_millis(retry_period_ms as u64));
|
||||
retry_period_ms *= 1.5;
|
||||
retry_period_ms = retry_period_ms.min(MAX_RETRY_PERIOD_MS);
|
||||
}
|
||||
}
|
||||
attempts += 1;
|
||||
}
|
||||
}
|
||||
|
||||
/// Tries to acquire an LSN lease through PS page_service API.
|
||||
fn try_acquire_lsn_lease(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Lsn,
|
||||
configs: &[postgres::Config],
|
||||
) -> Result<Option<SystemTime>> {
|
||||
fn get_valid_until(
|
||||
config: &postgres::Config,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Result<Option<SystemTime>> {
|
||||
let mut client = config.connect(NoTls)?;
|
||||
let cmd = format!("lease lsn {} {} {} ", tenant_shard_id, timeline_id, lsn);
|
||||
let res = client.simple_query(&cmd)?;
|
||||
let msg = match res.first() {
|
||||
Some(msg) => msg,
|
||||
None => bail!("empty response"),
|
||||
};
|
||||
let row = match msg {
|
||||
SimpleQueryMessage::Row(row) => row,
|
||||
_ => bail!("error parsing lsn lease response"),
|
||||
};
|
||||
|
||||
// Note: this will be None if a lease is explicitly not granted.
|
||||
let valid_until_str = row.get("valid_until");
|
||||
|
||||
let valid_until = valid_until_str.map(|s| {
|
||||
SystemTime::UNIX_EPOCH
|
||||
.checked_add(Duration::from_millis(u128::from_str(s).unwrap() as u64))
|
||||
.expect("Time larger than max SystemTime could handle")
|
||||
});
|
||||
Ok(valid_until)
|
||||
}
|
||||
|
||||
let shard_count = configs.len();
|
||||
|
||||
let valid_until = if shard_count > 1 {
|
||||
configs
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(shard_number, config)| {
|
||||
let tenant_shard_id = TenantShardId {
|
||||
tenant_id,
|
||||
shard_count: ShardCount::new(shard_count as u8),
|
||||
shard_number: ShardNumber(shard_number as u8),
|
||||
};
|
||||
get_valid_until(config, tenant_shard_id, timeline_id, lsn)
|
||||
})
|
||||
.collect::<Result<Vec<Option<SystemTime>>>>()?
|
||||
.into_iter()
|
||||
.min()
|
||||
.unwrap()
|
||||
} else {
|
||||
get_valid_until(
|
||||
&configs[0],
|
||||
TenantShardId::unsharded(tenant_id),
|
||||
timeline_id,
|
||||
lsn,
|
||||
)?
|
||||
};
|
||||
|
||||
Ok(valid_until)
|
||||
}
|
||||
@@ -6,7 +6,6 @@ license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
async-trait.workspace = true
|
||||
camino.workspace = true
|
||||
clap.workspace = true
|
||||
comfy-table.workspace = true
|
||||
|
||||
@@ -379,7 +379,7 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
fn process_has_stopped(pid: Pid) -> anyhow::Result<bool> {
|
||||
pub(crate) fn process_has_stopped(pid: Pid) -> anyhow::Result<bool> {
|
||||
match kill(pid, None) {
|
||||
// Process exists, keep waiting
|
||||
Ok(_) => Ok(false),
|
||||
|
||||
@@ -15,7 +15,9 @@ use control_plane::local_env::{
|
||||
};
|
||||
use control_plane::pageserver::PageServerNode;
|
||||
use control_plane::safekeeper::SafekeeperNode;
|
||||
use control_plane::storage_controller::StorageController;
|
||||
use control_plane::storage_controller::{
|
||||
NeonStorageControllerStartArgs, NeonStorageControllerStopArgs, StorageController,
|
||||
};
|
||||
use control_plane::{broker, local_env};
|
||||
use pageserver_api::config::{
|
||||
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
|
||||
@@ -52,7 +54,7 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1);
|
||||
const DEFAULT_BRANCH_NAME: &str = "main";
|
||||
project_git_version!(GIT_VERSION);
|
||||
|
||||
const DEFAULT_PG_VERSION: &str = "15";
|
||||
const DEFAULT_PG_VERSION: &str = "16";
|
||||
|
||||
const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";
|
||||
|
||||
@@ -1052,6 +1054,36 @@ fn get_start_timeout(args: &ArgMatches) -> &Duration {
|
||||
humantime_duration.as_ref()
|
||||
}
|
||||
|
||||
fn storage_controller_start_args(args: &ArgMatches) -> NeonStorageControllerStartArgs {
|
||||
let maybe_instance_id = args.get_one::<u8>("instance-id");
|
||||
|
||||
let base_port = args.get_one::<u16>("base-port");
|
||||
|
||||
if maybe_instance_id.is_some() && base_port.is_none() {
|
||||
panic!("storage-controller start specificied instance-id but did not provide base-port");
|
||||
}
|
||||
|
||||
let start_timeout = args
|
||||
.get_one::<humantime::Duration>("start-timeout")
|
||||
.expect("invalid value for start-timeout");
|
||||
|
||||
NeonStorageControllerStartArgs {
|
||||
instance_id: maybe_instance_id.copied().unwrap_or(1),
|
||||
base_port: base_port.copied(),
|
||||
start_timeout: *start_timeout,
|
||||
}
|
||||
}
|
||||
|
||||
fn storage_controller_stop_args(args: &ArgMatches) -> NeonStorageControllerStopArgs {
|
||||
let maybe_instance_id = args.get_one::<u8>("instance-id");
|
||||
let immediate = args.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
|
||||
|
||||
NeonStorageControllerStopArgs {
|
||||
instance_id: maybe_instance_id.copied().unwrap_or(1),
|
||||
immediate,
|
||||
}
|
||||
}
|
||||
|
||||
async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
match sub_match.subcommand() {
|
||||
Some(("start", subcommand_args)) => {
|
||||
@@ -1113,19 +1145,14 @@ async fn handle_storage_controller(
|
||||
let svc = StorageController::from_env(env);
|
||||
match sub_match.subcommand() {
|
||||
Some(("start", start_match)) => {
|
||||
if let Err(e) = svc.start(get_start_timeout(start_match)).await {
|
||||
if let Err(e) = svc.start(storage_controller_start_args(start_match)).await {
|
||||
eprintln!("start failed: {e}");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
Some(("stop", stop_match)) => {
|
||||
let immediate = stop_match
|
||||
.get_one::<String>("stop-mode")
|
||||
.map(|s| s.as_str())
|
||||
== Some("immediate");
|
||||
|
||||
if let Err(e) = svc.stop(immediate).await {
|
||||
if let Err(e) = svc.stop(storage_controller_stop_args(stop_match)).await {
|
||||
eprintln!("stop failed: {}", e);
|
||||
exit(1);
|
||||
}
|
||||
@@ -1228,7 +1255,12 @@ async fn handle_start_all(
|
||||
// Only start the storage controller if the pageserver is configured to need it
|
||||
if env.control_plane_api.is_some() {
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
if let Err(e) = storage_controller.start(retry_timeout).await {
|
||||
if let Err(e) = storage_controller
|
||||
.start(NeonStorageControllerStartArgs::with_default_instance_id(
|
||||
(*retry_timeout).into(),
|
||||
))
|
||||
.await
|
||||
{
|
||||
eprintln!("storage_controller start failed: {:#}", e);
|
||||
try_stop_all(env, true).await;
|
||||
exit(1);
|
||||
@@ -1358,10 +1390,21 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
|
||||
eprintln!("neon broker stop failed: {e:#}");
|
||||
}
|
||||
|
||||
if env.control_plane_api.is_some() {
|
||||
// Stop all storage controller instances. In the most common case there's only one,
|
||||
// but iterate though the base data directory in order to discover the instances.
|
||||
let storcon_instances = env
|
||||
.storage_controller_instances()
|
||||
.await
|
||||
.expect("Must inspect data dir");
|
||||
for (instance_id, _instance_dir_path) in storcon_instances {
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
if let Err(e) = storage_controller.stop(immediate).await {
|
||||
eprintln!("storage controller stop failed: {e:#}");
|
||||
let stop_args = NeonStorageControllerStopArgs {
|
||||
instance_id,
|
||||
immediate,
|
||||
};
|
||||
|
||||
if let Err(e) = storage_controller.stop(stop_args).await {
|
||||
eprintln!("Storage controller instance {instance_id} stop failed: {e:#}");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1501,6 +1544,18 @@ fn cli() -> Command {
|
||||
.action(ArgAction::SetTrue)
|
||||
.required(false);
|
||||
|
||||
let instance_id = Arg::new("instance-id")
|
||||
.long("instance-id")
|
||||
.help("Identifier used to distinguish storage controller instances (default 1)")
|
||||
.value_parser(value_parser!(u8))
|
||||
.required(false);
|
||||
|
||||
let base_port = Arg::new("base-port")
|
||||
.long("base-port")
|
||||
.help("Base port for the storage controller instance idenfified by instance-id (defaults to pagserver cplane api)")
|
||||
.value_parser(value_parser!(u16))
|
||||
.required(false);
|
||||
|
||||
Command::new("Neon CLI")
|
||||
.arg_required_else_help(true)
|
||||
.version(GIT_VERSION)
|
||||
@@ -1609,9 +1664,12 @@ fn cli() -> Command {
|
||||
.arg_required_else_help(true)
|
||||
.about("Manage storage_controller")
|
||||
.subcommand(Command::new("start").about("Start storage controller")
|
||||
.arg(timeout_arg.clone()))
|
||||
.arg(timeout_arg.clone())
|
||||
.arg(instance_id.clone())
|
||||
.arg(base_port))
|
||||
.subcommand(Command::new("stop").about("Stop storage controller")
|
||||
.arg(stop_mode_arg.clone()))
|
||||
.arg(stop_mode_arg.clone())
|
||||
.arg(instance_id))
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("safekeeper")
|
||||
|
||||
@@ -27,7 +27,7 @@ use crate::pageserver::PageServerNode;
|
||||
use crate::pageserver::PAGESERVER_REMOTE_STORAGE_DIR;
|
||||
use crate::safekeeper::SafekeeperNode;
|
||||
|
||||
pub const DEFAULT_PG_VERSION: u32 = 15;
|
||||
pub const DEFAULT_PG_VERSION: u32 = 16;
|
||||
|
||||
//
|
||||
// This data structures represents neon_local CLI config
|
||||
@@ -156,6 +156,11 @@ pub struct NeonStorageControllerConf {
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub max_warming_up: Duration,
|
||||
|
||||
pub start_as_candidate: bool,
|
||||
|
||||
/// Database url used when running multiple storage controller instances
|
||||
pub database_url: Option<SocketAddr>,
|
||||
|
||||
/// Threshold for auto-splitting a tenant into shards
|
||||
pub split_threshold: Option<u64>,
|
||||
|
||||
@@ -174,6 +179,8 @@ impl Default for NeonStorageControllerConf {
|
||||
Self {
|
||||
max_offline: Self::DEFAULT_MAX_OFFLINE_INTERVAL,
|
||||
max_warming_up: Self::DEFAULT_MAX_WARMING_UP_INTERVAL,
|
||||
start_as_candidate: false,
|
||||
database_url: None,
|
||||
split_threshold: None,
|
||||
max_secondary_lag_bytes: None,
|
||||
}
|
||||
@@ -392,6 +399,36 @@ impl LocalEnv {
|
||||
}
|
||||
}
|
||||
|
||||
/// Inspect the base data directory and extract the instance id and instance directory path
|
||||
/// for all storage controller instances
|
||||
pub async fn storage_controller_instances(&self) -> std::io::Result<Vec<(u8, PathBuf)>> {
|
||||
let mut instances = Vec::default();
|
||||
|
||||
let dir = std::fs::read_dir(self.base_data_dir.clone())?;
|
||||
for dentry in dir {
|
||||
let dentry = dentry?;
|
||||
let is_dir = dentry.metadata()?.is_dir();
|
||||
let filename = dentry.file_name().into_string().unwrap();
|
||||
let parsed_instance_id = match filename.strip_prefix("storage_controller_") {
|
||||
Some(suffix) => suffix.parse::<u8>().ok(),
|
||||
None => None,
|
||||
};
|
||||
|
||||
let is_instance_dir = is_dir && parsed_instance_id.is_some();
|
||||
|
||||
if !is_instance_dir {
|
||||
continue;
|
||||
}
|
||||
|
||||
instances.push((
|
||||
parsed_instance_id.expect("Checked previously"),
|
||||
dentry.path(),
|
||||
));
|
||||
}
|
||||
|
||||
Ok(instances)
|
||||
}
|
||||
|
||||
pub fn register_branch_mapping(
|
||||
&mut self,
|
||||
branch_name: String,
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
//! ```text
|
||||
//! .neon/safekeepers/<safekeeper id>
|
||||
//! ```
|
||||
use std::future::Future;
|
||||
use std::io::Write;
|
||||
use std::path::PathBuf;
|
||||
use std::time::Duration;
|
||||
@@ -34,12 +35,10 @@ pub enum SafekeeperHttpError {
|
||||
|
||||
type Result<T> = result::Result<T, SafekeeperHttpError>;
|
||||
|
||||
#[async_trait::async_trait]
|
||||
pub trait ResponseErrorMessageExt: Sized {
|
||||
async fn error_from_body(self) -> Result<Self>;
|
||||
pub(crate) trait ResponseErrorMessageExt: Sized {
|
||||
fn error_from_body(self) -> impl Future<Output = Result<Self>> + Send;
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ResponseErrorMessageExt for reqwest::Response {
|
||||
async fn error_from_body(self) -> Result<Self> {
|
||||
let status = self.status();
|
||||
|
||||
@@ -3,6 +3,8 @@ use crate::{
|
||||
local_env::{LocalEnv, NeonStorageControllerConf},
|
||||
};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use hyper::Uri;
|
||||
use nix::unistd::Pid;
|
||||
use pageserver_api::{
|
||||
controller_api::{
|
||||
NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
|
||||
@@ -18,7 +20,7 @@ use pageserver_client::mgmt_api::ResponseErrorMessageExt;
|
||||
use postgres_backend::AuthType;
|
||||
use reqwest::Method;
|
||||
use serde::{de::DeserializeOwned, Deserialize, Serialize};
|
||||
use std::{fs, str::FromStr, time::Duration};
|
||||
use std::{fs, net::SocketAddr, path::PathBuf, str::FromStr, sync::OnceLock};
|
||||
use tokio::process::Command;
|
||||
use tracing::instrument;
|
||||
use url::Url;
|
||||
@@ -29,12 +31,14 @@ use utils::{
|
||||
|
||||
pub struct StorageController {
|
||||
env: LocalEnv,
|
||||
listen: String,
|
||||
private_key: Option<Vec<u8>>,
|
||||
public_key: Option<String>,
|
||||
postgres_port: u16,
|
||||
client: reqwest::Client,
|
||||
config: NeonStorageControllerConf,
|
||||
|
||||
// The listen addresses is learned when starting the storage controller,
|
||||
// hence the use of OnceLock to init it at the right time.
|
||||
listen: OnceLock<SocketAddr>,
|
||||
}
|
||||
|
||||
const COMMAND: &str = "storage_controller";
|
||||
@@ -43,6 +47,36 @@ const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
|
||||
|
||||
const DB_NAME: &str = "storage_controller";
|
||||
|
||||
pub struct NeonStorageControllerStartArgs {
|
||||
pub instance_id: u8,
|
||||
pub base_port: Option<u16>,
|
||||
pub start_timeout: humantime::Duration,
|
||||
}
|
||||
|
||||
impl NeonStorageControllerStartArgs {
|
||||
pub fn with_default_instance_id(start_timeout: humantime::Duration) -> Self {
|
||||
Self {
|
||||
instance_id: 1,
|
||||
base_port: None,
|
||||
start_timeout,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct NeonStorageControllerStopArgs {
|
||||
pub instance_id: u8,
|
||||
pub immediate: bool,
|
||||
}
|
||||
|
||||
impl NeonStorageControllerStopArgs {
|
||||
pub fn with_default_instance_id(immediate: bool) -> Self {
|
||||
Self {
|
||||
instance_id: 1,
|
||||
immediate,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct AttachHookRequest {
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
@@ -67,23 +101,6 @@ pub struct InspectResponse {
|
||||
|
||||
impl StorageController {
|
||||
pub fn from_env(env: &LocalEnv) -> Self {
|
||||
// Makes no sense to construct this if pageservers aren't going to use it: assume
|
||||
// pageservers have control plane API set
|
||||
let listen_url = env.control_plane_api.clone().unwrap();
|
||||
|
||||
let listen = format!(
|
||||
"{}:{}",
|
||||
listen_url.host_str().unwrap(),
|
||||
listen_url.port().unwrap()
|
||||
);
|
||||
|
||||
// Convention: NeonEnv in python tests reserves the next port after the control_plane_api
|
||||
// port, for use by our captive postgres.
|
||||
let postgres_port = listen_url
|
||||
.port()
|
||||
.expect("Control plane API setting should always have a port")
|
||||
+ 1;
|
||||
|
||||
// Assume all pageservers have symmetric auth configuration: this service
|
||||
// expects to use one JWT token to talk to all of them.
|
||||
let ps_conf = env
|
||||
@@ -126,20 +143,28 @@ impl StorageController {
|
||||
|
||||
Self {
|
||||
env: env.clone(),
|
||||
listen,
|
||||
private_key,
|
||||
public_key,
|
||||
postgres_port,
|
||||
client: reqwest::ClientBuilder::new()
|
||||
.build()
|
||||
.expect("Failed to construct http client"),
|
||||
config: env.storage_controller.clone(),
|
||||
listen: OnceLock::default(),
|
||||
}
|
||||
}
|
||||
|
||||
fn pid_file(&self) -> Utf8PathBuf {
|
||||
Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("storage_controller.pid"))
|
||||
.expect("non-Unicode path")
|
||||
fn storage_controller_instance_dir(&self, instance_id: u8) -> PathBuf {
|
||||
self.env
|
||||
.base_data_dir
|
||||
.join(format!("storage_controller_{}", instance_id))
|
||||
}
|
||||
|
||||
fn pid_file(&self, instance_id: u8) -> Utf8PathBuf {
|
||||
Utf8PathBuf::from_path_buf(
|
||||
self.storage_controller_instance_dir(instance_id)
|
||||
.join("storage_controller.pid"),
|
||||
)
|
||||
.expect("non-Unicode path")
|
||||
}
|
||||
|
||||
/// PIDFile for the postgres instance used to store storage controller state
|
||||
@@ -184,23 +209,23 @@ impl StorageController {
|
||||
}
|
||||
|
||||
/// Readiness check for our postgres process
|
||||
async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result<bool> {
|
||||
async fn pg_isready(&self, pg_bin_dir: &Utf8Path, postgres_port: u16) -> anyhow::Result<bool> {
|
||||
let bin_path = pg_bin_dir.join("pg_isready");
|
||||
let args = ["-h", "localhost", "-p", &format!("{}", self.postgres_port)];
|
||||
let args = ["-h", "localhost", "-p", &format!("{}", postgres_port)];
|
||||
let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?;
|
||||
|
||||
Ok(exitcode.success())
|
||||
}
|
||||
|
||||
/// Create our database if it doesn't exist, and run migrations.
|
||||
/// Create our database if it doesn't exist
|
||||
///
|
||||
/// This function is equivalent to the `diesel setup` command in the diesel CLI. We implement
|
||||
/// the same steps by hand to avoid imposing a dependency on installing diesel-cli for developers
|
||||
/// who just want to run `cargo neon_local` without knowing about diesel.
|
||||
///
|
||||
/// Returns the database url
|
||||
pub async fn setup_database(&self) -> anyhow::Result<String> {
|
||||
let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port);
|
||||
pub async fn setup_database(&self, postgres_port: u16) -> anyhow::Result<String> {
|
||||
let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port);
|
||||
|
||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||
let createdb_path = pg_bin_dir.join("createdb");
|
||||
@@ -209,7 +234,7 @@ impl StorageController {
|
||||
"-h",
|
||||
"localhost",
|
||||
"-p",
|
||||
&format!("{}", self.postgres_port),
|
||||
&format!("{}", postgres_port),
|
||||
DB_NAME,
|
||||
])
|
||||
.output()
|
||||
@@ -230,13 +255,14 @@ impl StorageController {
|
||||
|
||||
pub async fn connect_to_database(
|
||||
&self,
|
||||
postgres_port: u16,
|
||||
) -> anyhow::Result<(
|
||||
tokio_postgres::Client,
|
||||
tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
|
||||
)> {
|
||||
tokio_postgres::Config::new()
|
||||
.host("localhost")
|
||||
.port(self.postgres_port)
|
||||
.port(postgres_port)
|
||||
// The user is the ambient operating system user name.
|
||||
// That is an impurity which we want to fix in => TODO https://github.com/neondatabase/neon/issues/8400
|
||||
//
|
||||
@@ -252,72 +278,114 @@ impl StorageController {
|
||||
.map_err(anyhow::Error::new)
|
||||
}
|
||||
|
||||
pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
|
||||
// Start a vanilla Postgres process used by the storage controller for persistence.
|
||||
let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
|
||||
.unwrap()
|
||||
.join("storage_controller_db");
|
||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||
let pg_lib_dir = self.get_pg_lib_dir().await?;
|
||||
let pg_log_path = pg_data_path.join("postgres.log");
|
||||
pub async fn start(&self, start_args: NeonStorageControllerStartArgs) -> anyhow::Result<()> {
|
||||
let instance_dir = self.storage_controller_instance_dir(start_args.instance_id);
|
||||
if let Err(err) = tokio::fs::create_dir(&instance_dir).await {
|
||||
if err.kind() != std::io::ErrorKind::AlreadyExists {
|
||||
panic!("Failed to create instance dir {instance_dir:?}");
|
||||
}
|
||||
}
|
||||
|
||||
if !tokio::fs::try_exists(&pg_data_path).await? {
|
||||
// Initialize empty database
|
||||
let initdb_path = pg_bin_dir.join("initdb");
|
||||
let mut child = Command::new(&initdb_path)
|
||||
.envs(vec![
|
||||
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
])
|
||||
.args(["-D", pg_data_path.as_ref()])
|
||||
.spawn()
|
||||
.expect("Failed to spawn initdb");
|
||||
let status = child.wait().await?;
|
||||
if !status.success() {
|
||||
anyhow::bail!("initdb failed with status {status}");
|
||||
let (listen, postgres_port) = {
|
||||
if let Some(base_port) = start_args.base_port {
|
||||
(
|
||||
format!("127.0.0.1:{base_port}"),
|
||||
self.config
|
||||
.database_url
|
||||
.expect("--base-port requires NeonStorageControllerConf::database_url")
|
||||
.port(),
|
||||
)
|
||||
} else {
|
||||
let listen_url = self.env.control_plane_api.clone().unwrap();
|
||||
|
||||
let listen = format!(
|
||||
"{}:{}",
|
||||
listen_url.host_str().unwrap(),
|
||||
listen_url.port().unwrap()
|
||||
);
|
||||
|
||||
(listen, listen_url.port().unwrap() + 1)
|
||||
}
|
||||
};
|
||||
|
||||
// Write a minimal config file:
|
||||
// - Specify the port, since this is chosen dynamically
|
||||
// - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
|
||||
// the storage controller we don't want a slow local disk to interfere with that.
|
||||
//
|
||||
// NB: it's important that we rewrite this file on each start command so we propagate changes
|
||||
// from `LocalEnv`'s config file (`.neon/config`).
|
||||
tokio::fs::write(
|
||||
&pg_data_path.join("postgresql.conf"),
|
||||
format!("port = {}\nfsync=off\n", self.postgres_port),
|
||||
)
|
||||
.await?;
|
||||
let socket_addr = listen
|
||||
.parse()
|
||||
.expect("listen address is a valid socket address");
|
||||
self.listen
|
||||
.set(socket_addr)
|
||||
.expect("StorageController::listen is only set here");
|
||||
|
||||
println!("Starting storage controller database...");
|
||||
let db_start_args = [
|
||||
"-w",
|
||||
"-D",
|
||||
pg_data_path.as_ref(),
|
||||
"-l",
|
||||
pg_log_path.as_ref(),
|
||||
"start",
|
||||
];
|
||||
// Do we remove the pid file on stop?
|
||||
let pg_started = self.is_postgres_running().await?;
|
||||
let pg_lib_dir = self.get_pg_lib_dir().await?;
|
||||
|
||||
background_process::start_process(
|
||||
"storage_controller_db",
|
||||
&self.env.base_data_dir,
|
||||
pg_bin_dir.join("pg_ctl").as_std_path(),
|
||||
db_start_args,
|
||||
vec![
|
||||
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
],
|
||||
background_process::InitialPidFile::Create(self.postgres_pid_file()),
|
||||
retry_timeout,
|
||||
|| self.pg_isready(&pg_bin_dir),
|
||||
)
|
||||
.await?;
|
||||
if !pg_started {
|
||||
// Start a vanilla Postgres process used by the storage controller for persistence.
|
||||
let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
|
||||
.unwrap()
|
||||
.join("storage_controller_db");
|
||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||
let pg_log_path = pg_data_path.join("postgres.log");
|
||||
|
||||
// Run migrations on every startup, in case something changed.
|
||||
let database_url = self.setup_database().await?;
|
||||
if !tokio::fs::try_exists(&pg_data_path).await? {
|
||||
// Initialize empty database
|
||||
let initdb_path = pg_bin_dir.join("initdb");
|
||||
let mut child = Command::new(&initdb_path)
|
||||
.envs(vec![
|
||||
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
])
|
||||
.args(["-D", pg_data_path.as_ref()])
|
||||
.spawn()
|
||||
.expect("Failed to spawn initdb");
|
||||
let status = child.wait().await?;
|
||||
if !status.success() {
|
||||
anyhow::bail!("initdb failed with status {status}");
|
||||
}
|
||||
};
|
||||
|
||||
// Write a minimal config file:
|
||||
// - Specify the port, since this is chosen dynamically
|
||||
// - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
|
||||
// the storage controller we don't want a slow local disk to interfere with that.
|
||||
//
|
||||
// NB: it's important that we rewrite this file on each start command so we propagate changes
|
||||
// from `LocalEnv`'s config file (`.neon/config`).
|
||||
tokio::fs::write(
|
||||
&pg_data_path.join("postgresql.conf"),
|
||||
format!("port = {}\nfsync=off\n", postgres_port),
|
||||
)
|
||||
.await?;
|
||||
|
||||
println!("Starting storage controller database...");
|
||||
let db_start_args = [
|
||||
"-w",
|
||||
"-D",
|
||||
pg_data_path.as_ref(),
|
||||
"-l",
|
||||
pg_log_path.as_ref(),
|
||||
"start",
|
||||
];
|
||||
|
||||
background_process::start_process(
|
||||
"storage_controller_db",
|
||||
&self.env.base_data_dir,
|
||||
pg_bin_dir.join("pg_ctl").as_std_path(),
|
||||
db_start_args,
|
||||
vec![
|
||||
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
],
|
||||
background_process::InitialPidFile::Create(self.postgres_pid_file()),
|
||||
&start_args.start_timeout,
|
||||
|| self.pg_isready(&pg_bin_dir, postgres_port),
|
||||
)
|
||||
.await?;
|
||||
|
||||
self.setup_database(postgres_port).await?;
|
||||
}
|
||||
|
||||
let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port);
|
||||
|
||||
// We support running a startup SQL script to fiddle with the database before we launch storcon.
|
||||
// This is used by the test suite.
|
||||
@@ -339,7 +407,7 @@ impl StorageController {
|
||||
}
|
||||
}
|
||||
};
|
||||
let (mut client, conn) = self.connect_to_database().await?;
|
||||
let (mut client, conn) = self.connect_to_database(postgres_port).await?;
|
||||
let conn = tokio::spawn(conn);
|
||||
let tx = client.build_transaction();
|
||||
let tx = tx.start().await?;
|
||||
@@ -348,9 +416,20 @@ impl StorageController {
|
||||
drop(client);
|
||||
conn.await??;
|
||||
|
||||
let listen = self
|
||||
.listen
|
||||
.get()
|
||||
.expect("cell is set earlier in this function");
|
||||
let address_for_peers = Uri::builder()
|
||||
.scheme("http")
|
||||
.authority(format!("{}:{}", listen.ip(), listen.port()))
|
||||
.path_and_query("")
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let mut args = vec![
|
||||
"-l",
|
||||
&self.listen,
|
||||
&listen.to_string(),
|
||||
"--dev",
|
||||
"--database-url",
|
||||
&database_url,
|
||||
@@ -358,15 +437,27 @@ impl StorageController {
|
||||
&humantime::Duration::from(self.config.max_offline).to_string(),
|
||||
"--max-warming-up-interval",
|
||||
&humantime::Duration::from(self.config.max_warming_up).to_string(),
|
||||
"--address-for-peers",
|
||||
&address_for_peers.to_string(),
|
||||
]
|
||||
.into_iter()
|
||||
.map(|s| s.to_string())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
if self.config.start_as_candidate {
|
||||
args.push("--start-as-candidate".to_string());
|
||||
}
|
||||
|
||||
if let Some(private_key) = &self.private_key {
|
||||
let claims = Claims::new(None, Scope::PageServerApi);
|
||||
let jwt_token =
|
||||
encode_from_key_file(&claims, private_key).expect("failed to generate jwt token");
|
||||
args.push(format!("--jwt-token={jwt_token}"));
|
||||
|
||||
let peer_claims = Claims::new(None, Scope::Admin);
|
||||
let peer_jwt_token = encode_from_key_file(&peer_claims, private_key)
|
||||
.expect("failed to generate jwt token");
|
||||
args.push(format!("--peer-jwt-token={peer_jwt_token}"));
|
||||
}
|
||||
|
||||
if let Some(public_key) = &self.public_key {
|
||||
@@ -394,15 +485,15 @@ impl StorageController {
|
||||
|
||||
background_process::start_process(
|
||||
COMMAND,
|
||||
&self.env.base_data_dir,
|
||||
&instance_dir,
|
||||
&self.env.storage_controller_bin(),
|
||||
args,
|
||||
vec![
|
||||
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
],
|
||||
background_process::InitialPidFile::Create(self.pid_file()),
|
||||
retry_timeout,
|
||||
background_process::InitialPidFile::Create(self.pid_file(start_args.instance_id)),
|
||||
&start_args.start_timeout,
|
||||
|| async {
|
||||
match self.ready().await {
|
||||
Ok(_) => Ok(true),
|
||||
@@ -415,8 +506,35 @@ impl StorageController {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> {
|
||||
background_process::stop_process(immediate, COMMAND, &self.pid_file())?;
|
||||
pub async fn stop(&self, stop_args: NeonStorageControllerStopArgs) -> anyhow::Result<()> {
|
||||
background_process::stop_process(
|
||||
stop_args.immediate,
|
||||
COMMAND,
|
||||
&self.pid_file(stop_args.instance_id),
|
||||
)?;
|
||||
|
||||
let storcon_instances = self.env.storage_controller_instances().await?;
|
||||
for (instance_id, instanced_dir_path) in storcon_instances {
|
||||
if instance_id == stop_args.instance_id {
|
||||
continue;
|
||||
}
|
||||
|
||||
let pid_file = instanced_dir_path.join("storage_controller.pid");
|
||||
let pid = tokio::fs::read_to_string(&pid_file)
|
||||
.await
|
||||
.map_err(|err| {
|
||||
anyhow::anyhow!("Failed to read storcon pid file at {pid_file:?}: {err}")
|
||||
})?
|
||||
.parse::<i32>()
|
||||
.expect("pid is valid i32");
|
||||
|
||||
let other_proc_alive = !background_process::process_has_stopped(Pid::from_raw(pid))?;
|
||||
if other_proc_alive {
|
||||
// There is another storage controller instance running, so we return
|
||||
// and leave the database running.
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
|
||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||
@@ -429,27 +547,51 @@ impl StorageController {
|
||||
.wait()
|
||||
.await?;
|
||||
if !stop_status.success() {
|
||||
let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
|
||||
let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
|
||||
.args(pg_status_args)
|
||||
.spawn()?
|
||||
.wait()
|
||||
.await?;
|
||||
|
||||
// pg_ctl status returns this exit code if postgres is not running: in this case it is
|
||||
// fine that stop failed. Otherwise it is an error that stop failed.
|
||||
const PG_STATUS_NOT_RUNNING: i32 = 3;
|
||||
if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() {
|
||||
println!("Storage controller database is already stopped");
|
||||
return Ok(());
|
||||
} else {
|
||||
anyhow::bail!("Failed to stop storage controller database: {stop_status}")
|
||||
match self.is_postgres_running().await {
|
||||
Ok(false) => {
|
||||
println!("Storage controller database is already stopped");
|
||||
return Ok(());
|
||||
}
|
||||
Ok(true) => {
|
||||
anyhow::bail!("Failed to stop storage controller database");
|
||||
}
|
||||
Err(err) => {
|
||||
anyhow::bail!("Failed to stop storage controller database: {err}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn is_postgres_running(&self) -> anyhow::Result<bool> {
|
||||
let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
|
||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||
|
||||
let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
|
||||
let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
|
||||
.args(pg_status_args)
|
||||
.spawn()?
|
||||
.wait()
|
||||
.await?;
|
||||
|
||||
// pg_ctl status returns this exit code if postgres is not running: in this case it is
|
||||
// fine that stop failed. Otherwise it is an error that stop failed.
|
||||
const PG_STATUS_NOT_RUNNING: i32 = 3;
|
||||
const PG_NO_DATA_DIR: i32 = 4;
|
||||
const PG_STATUS_RUNNING: i32 = 0;
|
||||
match status_exitcode.code() {
|
||||
Some(PG_STATUS_NOT_RUNNING) => Ok(false),
|
||||
Some(PG_NO_DATA_DIR) => Ok(false),
|
||||
Some(PG_STATUS_RUNNING) => Ok(true),
|
||||
Some(code) => Err(anyhow::anyhow!(
|
||||
"pg_ctl status returned unexpected status code: {:?}",
|
||||
code
|
||||
)),
|
||||
None => Err(anyhow::anyhow!("pg_ctl status returned no status code")),
|
||||
}
|
||||
}
|
||||
|
||||
fn get_claims_for_path(path: &str) -> anyhow::Result<Option<Claims>> {
|
||||
let category = match path.find('/') {
|
||||
Some(idx) => &path[..idx],
|
||||
@@ -475,15 +617,31 @@ impl StorageController {
|
||||
RQ: Serialize + Sized,
|
||||
RS: DeserializeOwned + Sized,
|
||||
{
|
||||
// The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
|
||||
// for general purpose API access.
|
||||
let listen_url = self.env.control_plane_api.clone().unwrap();
|
||||
let url = Url::from_str(&format!(
|
||||
"http://{}:{}/{path}",
|
||||
listen_url.host_str().unwrap(),
|
||||
listen_url.port().unwrap()
|
||||
))
|
||||
.unwrap();
|
||||
// In the special case of the `storage_controller start` subcommand, we wish
|
||||
// to use the API endpoint of the newly started storage controller in order
|
||||
// to pass the readiness check. In this scenario [`Self::listen`] will be set
|
||||
// (see [`Self::start`]).
|
||||
//
|
||||
// Otherwise, we infer the storage controller api endpoint from the configured
|
||||
// control plane API.
|
||||
let url = if let Some(socket_addr) = self.listen.get() {
|
||||
Url::from_str(&format!(
|
||||
"http://{}:{}/{path}",
|
||||
socket_addr.ip().to_canonical(),
|
||||
socket_addr.port()
|
||||
))
|
||||
.unwrap()
|
||||
} else {
|
||||
// The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
|
||||
// for general purpose API access.
|
||||
let listen_url = self.env.control_plane_api.clone().unwrap();
|
||||
Url::from_str(&format!(
|
||||
"http://{}:{}/{path}",
|
||||
listen_url.host_str().unwrap(),
|
||||
listen_url.port().unwrap()
|
||||
))
|
||||
.unwrap()
|
||||
};
|
||||
|
||||
let mut builder = self.client.request(method, url);
|
||||
if let Some(body) = body {
|
||||
|
||||
@@ -41,6 +41,8 @@ enum Command {
|
||||
listen_http_addr: String,
|
||||
#[arg(long)]
|
||||
listen_http_port: u16,
|
||||
#[arg(long)]
|
||||
availability_zone_id: String,
|
||||
},
|
||||
|
||||
/// Modify a node's configuration in the storage controller
|
||||
@@ -147,9 +149,9 @@ enum Command {
|
||||
#[arg(long)]
|
||||
threshold: humantime::Duration,
|
||||
},
|
||||
// Drain a set of specified pageservers by moving the primary attachments to pageservers
|
||||
// Migrate away from a set of specified pageservers by moving the primary attachments to pageservers
|
||||
// outside of the specified set.
|
||||
Drain {
|
||||
BulkMigrate {
|
||||
// Set of pageserver node ids to drain.
|
||||
#[arg(long)]
|
||||
nodes: Vec<NodeId>,
|
||||
@@ -163,6 +165,34 @@ enum Command {
|
||||
#[arg(long)]
|
||||
dry_run: Option<bool>,
|
||||
},
|
||||
/// Start draining the specified pageserver.
|
||||
/// The drain is complete when the schedulling policy returns to active.
|
||||
StartDrain {
|
||||
#[arg(long)]
|
||||
node_id: NodeId,
|
||||
},
|
||||
/// Cancel draining the specified pageserver and wait for `timeout`
|
||||
/// for the operation to be canceled. May be retried.
|
||||
CancelDrain {
|
||||
#[arg(long)]
|
||||
node_id: NodeId,
|
||||
#[arg(long)]
|
||||
timeout: humantime::Duration,
|
||||
},
|
||||
/// Start filling the specified pageserver.
|
||||
/// The drain is complete when the schedulling policy returns to active.
|
||||
StartFill {
|
||||
#[arg(long)]
|
||||
node_id: NodeId,
|
||||
},
|
||||
/// Cancel filling the specified pageserver and wait for `timeout`
|
||||
/// for the operation to be canceled. May be retried.
|
||||
CancelFill {
|
||||
#[arg(long)]
|
||||
node_id: NodeId,
|
||||
#[arg(long)]
|
||||
timeout: humantime::Duration,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
@@ -249,6 +279,34 @@ impl FromStr for NodeAvailabilityArg {
|
||||
}
|
||||
}
|
||||
|
||||
async fn wait_for_scheduling_policy<F>(
|
||||
client: Client,
|
||||
node_id: NodeId,
|
||||
timeout: Duration,
|
||||
f: F,
|
||||
) -> anyhow::Result<NodeSchedulingPolicy>
|
||||
where
|
||||
F: Fn(NodeSchedulingPolicy) -> bool,
|
||||
{
|
||||
let waiter = tokio::time::timeout(timeout, async move {
|
||||
loop {
|
||||
let node = client
|
||||
.dispatch::<(), NodeDescribeResponse>(
|
||||
Method::GET,
|
||||
format!("control/v1/node/{node_id}"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
if f(node.scheduling) {
|
||||
return Ok::<NodeSchedulingPolicy, mgmt_api::Error>(node.scheduling);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
Ok(waiter.await??)
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let cli = Cli::parse();
|
||||
@@ -266,6 +324,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
listen_pg_port,
|
||||
listen_http_addr,
|
||||
listen_http_port,
|
||||
availability_zone_id,
|
||||
} => {
|
||||
storcon_client
|
||||
.dispatch::<_, ()>(
|
||||
@@ -277,6 +336,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
listen_pg_port,
|
||||
listen_http_addr,
|
||||
listen_http_port,
|
||||
availability_zone_id: Some(availability_zone_id),
|
||||
}),
|
||||
)
|
||||
.await?;
|
||||
@@ -628,7 +688,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
})
|
||||
.await?;
|
||||
}
|
||||
Command::Drain {
|
||||
Command::BulkMigrate {
|
||||
nodes,
|
||||
concurrency,
|
||||
max_shards,
|
||||
@@ -657,7 +717,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
|
||||
if nodes.len() != node_to_drain_descs.len() {
|
||||
anyhow::bail!("Drain requested for node which doesn't exist.")
|
||||
anyhow::bail!("Bulk migration requested away from node which doesn't exist.")
|
||||
}
|
||||
|
||||
node_to_fill_descs.retain(|desc| {
|
||||
@@ -669,7 +729,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
});
|
||||
|
||||
if node_to_fill_descs.is_empty() {
|
||||
anyhow::bail!("There are no nodes to drain to")
|
||||
anyhow::bail!("There are no nodes to migrate to")
|
||||
}
|
||||
|
||||
// Set the node scheduling policy to draining for the nodes which
|
||||
@@ -690,7 +750,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
.await?;
|
||||
}
|
||||
|
||||
// Perform the drain: move each tenant shard scheduled on a node to
|
||||
// Perform the migration: move each tenant shard scheduled on a node to
|
||||
// be drained to a node which is being filled. A simple round robin
|
||||
// strategy is used to pick the new node.
|
||||
let tenants = storcon_client
|
||||
@@ -703,13 +763,13 @@ async fn main() -> anyhow::Result<()> {
|
||||
|
||||
let mut selected_node_idx = 0;
|
||||
|
||||
struct DrainMove {
|
||||
struct MigrationMove {
|
||||
tenant_shard_id: TenantShardId,
|
||||
from: NodeId,
|
||||
to: NodeId,
|
||||
}
|
||||
|
||||
let mut moves: Vec<DrainMove> = Vec::new();
|
||||
let mut moves: Vec<MigrationMove> = Vec::new();
|
||||
|
||||
let shards = tenants
|
||||
.into_iter()
|
||||
@@ -739,7 +799,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
continue;
|
||||
}
|
||||
|
||||
moves.push(DrainMove {
|
||||
moves.push(MigrationMove {
|
||||
tenant_shard_id: shard.tenant_shard_id,
|
||||
from: shard
|
||||
.node_attached
|
||||
@@ -816,6 +876,67 @@ async fn main() -> anyhow::Result<()> {
|
||||
failure
|
||||
);
|
||||
}
|
||||
Command::StartDrain { node_id } => {
|
||||
storcon_client
|
||||
.dispatch::<(), ()>(
|
||||
Method::PUT,
|
||||
format!("control/v1/node/{node_id}/drain"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
println!("Drain started for {node_id}");
|
||||
}
|
||||
Command::CancelDrain { node_id, timeout } => {
|
||||
storcon_client
|
||||
.dispatch::<(), ()>(
|
||||
Method::DELETE,
|
||||
format!("control/v1/node/{node_id}/drain"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
println!("Waiting for node {node_id} to quiesce on scheduling policy ...");
|
||||
|
||||
let final_policy =
|
||||
wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| {
|
||||
use NodeSchedulingPolicy::*;
|
||||
matches!(sched, Active | PauseForRestart)
|
||||
})
|
||||
.await?;
|
||||
|
||||
println!(
|
||||
"Drain was cancelled for node {node_id}. Schedulling policy is now {final_policy:?}"
|
||||
);
|
||||
}
|
||||
Command::StartFill { node_id } => {
|
||||
storcon_client
|
||||
.dispatch::<(), ()>(Method::PUT, format!("control/v1/node/{node_id}/fill"), None)
|
||||
.await?;
|
||||
|
||||
println!("Fill started for {node_id}");
|
||||
}
|
||||
Command::CancelFill { node_id, timeout } => {
|
||||
storcon_client
|
||||
.dispatch::<(), ()>(
|
||||
Method::DELETE,
|
||||
format!("control/v1/node/{node_id}/fill"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
println!("Waiting for node {node_id} to quiesce on scheduling policy ...");
|
||||
|
||||
let final_policy =
|
||||
wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| {
|
||||
use NodeSchedulingPolicy::*;
|
||||
matches!(sched, Active)
|
||||
})
|
||||
.await?;
|
||||
|
||||
println!(
|
||||
"Fill was cancelled for node {node_id}. Schedulling policy is now {final_policy:?}"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -3,7 +3,7 @@ set -x
|
||||
|
||||
cd /ext-src || exit 2
|
||||
FAILED=
|
||||
LIST=$( (echo "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u)
|
||||
LIST=$( (echo -e "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u)
|
||||
for d in ${LIST}
|
||||
do
|
||||
[ -d "${d}" ] || continue
|
||||
|
||||
@@ -14,7 +14,7 @@ picked tenant (which requested on-demand activation) for around 30 seconds
|
||||
during the restart at 2024-04-03 16:37 UTC.
|
||||
|
||||
Note that lots of shutdowns on loaded pageservers do not finish within the
|
||||
[10 second systemd enforced timeout](https://github.com/neondatabase/aws/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers
|
||||
[10 second systemd enforced timeout](https://github.com/neondatabase/infra/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers
|
||||
and have to reingest data in order to serve requests after restarting, potentially making first request latencies worse.
|
||||
|
||||
This problem is not yet very acutely felt in storage controller managed pageservers since
|
||||
|
||||
259
docs/rfcs/037-storage-controller-restarts.md
Normal file
259
docs/rfcs/037-storage-controller-restarts.md
Normal file
@@ -0,0 +1,259 @@
|
||||
# Rolling Storage Controller Restarts
|
||||
|
||||
## Summary
|
||||
|
||||
This RFC describes the issues around the current storage controller restart procedure
|
||||
and describes an implementation which reduces downtime to a few milliseconds on the happy path.
|
||||
|
||||
## Motivation
|
||||
|
||||
Storage controller upgrades (restarts, more generally) can cause multi-second availability gaps.
|
||||
While the storage controller does not sit on the main data path, it's generally not acceptable
|
||||
to block management requests for extended periods of time (e.g. https://github.com/neondatabase/neon/issues/8034).
|
||||
|
||||
### Current Implementation
|
||||
|
||||
The storage controller runs in a Kubernetes Deployment configured for one replica and strategy set to [Recreate](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#recreate-deployment).
|
||||
In non Kubernetes terms, during an upgrade, the currently running storage controller is stopped and, only after,
|
||||
a new instance is created.
|
||||
|
||||
At start-up, the storage controller calls into all the pageservers it manages (retrieved from DB) to learn the
|
||||
latest locations of all tenant shards present on them. This is usually fast, but can push into tens of seconds
|
||||
under unfavourable circumstances: pageservers are heavily loaded or unavailable.
|
||||
|
||||
## Prior Art
|
||||
|
||||
There's probably as many ways of handling restarts gracefully as there are distributed systems. Some examples include:
|
||||
* Active/Standby architectures: Two or more instance of the same service run, but traffic is only routed to one of them.
|
||||
For fail-over, traffic is routed to one of the standbys (which becomes active).
|
||||
* Consensus Algorithms (Raft, Paxos and friends): The part of consensus we care about here is leader election: peers communicate to each other
|
||||
and use a voting scheme that ensures the existence of a single leader (e.g. Raft epochs).
|
||||
|
||||
## Requirements
|
||||
|
||||
* Reduce storage controller unavailability during upgrades to milliseconds
|
||||
* Minimize the interval in which it's possible for more than one storage controller
|
||||
to issue reconciles.
|
||||
* Have one uniform implementation for restarts and upgrades
|
||||
* Fit in with the current Kubernetes deployment scheme
|
||||
|
||||
## Non Goals
|
||||
|
||||
* Implement our own consensus algorithm from scratch
|
||||
* Completely eliminate downtime storage controller downtime. Instead we aim to reduce it to the point where it looks
|
||||
like a transient error to the control plane
|
||||
|
||||
## Impacted Components
|
||||
|
||||
* storage controller
|
||||
* deployment orchestration (i.e. Ansible)
|
||||
* helm charts
|
||||
|
||||
## Terminology
|
||||
|
||||
* Observed State: in-memory mapping between tenant shards and their current pageserver locations - currently built up
|
||||
at start-up by quering pageservers
|
||||
* Deployment: Kubernetes [primitive](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/) that models
|
||||
a set of replicas
|
||||
|
||||
## Implementation
|
||||
|
||||
### High Level Flow
|
||||
|
||||
At a very high level the proposed idea is to start a new storage controller instance while
|
||||
the previous one is still running and cut-over to it when it becomes ready. The new instance,
|
||||
should coordinate with the existing one and transition responsibility gracefully. While the controller
|
||||
has built in safety against split-brain situations (via generation numbers), we'd like to avoid such
|
||||
scenarios since they can lead to availability issues for tenants that underwent changes while two controllers
|
||||
were operating at the same time and require operator intervention to remedy.
|
||||
|
||||
### Kubernetes Deployment Configuration
|
||||
|
||||
On the Kubernetes configuration side, the proposal is to update the storage controller `Deployment`
|
||||
to use `spec.strategy.type = RollingUpdate`, `spec.strategy.rollingUpdate.maxSurge=1` and `spec.strategy.maxUnavailable=0`.
|
||||
Under the hood, Kubernetes creates a new replica set and adds one pod to it (`maxSurge=1`). The old replica set does not
|
||||
scale down until the new replica set has one replica in the ready state (`maxUnavailable=0`).
|
||||
|
||||
The various possible failure scenarios are investigated in the [Handling Failures](#handling-failures) section.
|
||||
|
||||
### Storage Controller Start-Up
|
||||
|
||||
This section describes the primitives required on the storage controller side and the flow of the happy path.
|
||||
|
||||
#### Database Table For Leader Synchronization
|
||||
|
||||
A new table should be added to the storage controller database for leader synchronization during startup.
|
||||
This table will always contain at most one row. The proposed name for the table is `leader` and the schema
|
||||
contains two elements:
|
||||
* `hostname`: represents the hostname for the current storage controller leader - should be addressible
|
||||
from other pods in the deployment
|
||||
* `start_timestamp`: holds the start timestamp for the current storage controller leader (UTC timezone) - only required
|
||||
for failure case handling: see [Previous Leader Crashes Before New Leader Readiness](#previous-leader-crashes-before-new-leader-readiness)
|
||||
|
||||
Storage controllers will read the leader row at start-up and then update it to mark themselves as the leader
|
||||
at the end of the start-up sequence. We want compare-and-exchange semantics for the update: avoid the
|
||||
situation where two concurrent updates succeed and overwrite each other. The default Postgres isolation
|
||||
level is `READ COMMITTED`, which isn't strict enough here. This update transaction should use at least `REPEATABLE
|
||||
READ` isolation level in order to [prevent lost updates](https://www.interdb.jp/pg/pgsql05/08.html). Currently,
|
||||
the storage controller uses the stricter `SERIALIZABLE` isolation level for all transactions. This more than suits
|
||||
our needs here.
|
||||
|
||||
```
|
||||
START TRANSACTION ISOLATION LEVEL REPEATABLE READ
|
||||
UPDATE leader SET hostname=<new_hostname>, start_timestamp=<new_start_ts>
|
||||
WHERE hostname=<old_hostname>, start_timestampt=<old_start_ts>;
|
||||
```
|
||||
|
||||
If the transaction fails or if no rows have been updated, then the compare-and-exchange is regarded as a failure.
|
||||
|
||||
#### Step Down API
|
||||
|
||||
A new HTTP endpoint should be added to the storage controller: `POST /control/v1/step_down`. Upon receiving this
|
||||
request the leader cancels any pending reconciles and goes into a mode where it replies with 503 to all other APIs
|
||||
and does not issue any location configurations to its pageservers. The successful HTTP response will return a serialized
|
||||
snapshot of the observed state.
|
||||
|
||||
If other step down requests come in after the initial one, the request is handled and the observed state is returned (required
|
||||
for failure scenario handling - see [Handling Failures](#handling-failures)).
|
||||
|
||||
#### Graceful Restart Happy Path
|
||||
|
||||
At start-up, the first thing the storage controller does is retrieve the sole row from the new
|
||||
`leader` table. If such an entry exists, send a `/step_down` PUT API call to the current leader.
|
||||
This should be retried a few times with a short backoff (see [1]). The aspiring leader loads the
|
||||
observed state into memory and the start-up sequence proceeds as usual, but *without* querying the
|
||||
pageservers in order to build up the observed state.
|
||||
|
||||
Before doing any reconciliations or persistence change, update the `leader` database table as described in the [Database Table For Leader Synchronization](database-table-for-leader-synchronization)
|
||||
section. If this step fails, the storage controller process exits.
|
||||
|
||||
Note that no row will exist in the `leaders` table for the first graceful restart. In that case, force update the `leader` table
|
||||
(without the WHERE clause) and perform with the pre-existing start-up procedure (i.e. build observed state by querying pageservers).
|
||||
|
||||
Summary of proposed new start-up sequence:
|
||||
1. Call `/step_down`
|
||||
2. Perform any pending database migrations
|
||||
3. Load state from database
|
||||
4. Load observed state returned in step (1) into memory
|
||||
5. Do initial heartbeat round (may be moved after 5)
|
||||
7. Mark self as leader by updating the database
|
||||
8. Reschedule and reconcile everything
|
||||
|
||||
Some things to note from the steps above:
|
||||
* The storage controller makes no changes to the cluster state before step (5) (i.e. no location config
|
||||
calls to the pageserver and no compute notifications)
|
||||
* Ask the current leader to step down before loading state from database so we don't get a lost update
|
||||
if the transactions overlap.
|
||||
* Before loading the observed state at step (3), cross-validate against the database. If validation fails,
|
||||
fall back to asking the pageservers about their current locations.
|
||||
* Database migrations should only run **after** the previous instance steps down (or the step down times out).
|
||||
|
||||
|
||||
[1] The API call might fail because there's no storage controller running (i.e. [restart](#storage-controller-crash-or-restart)),
|
||||
so we don't want to extend the unavailability period by much. We still want to retry since that's not the common case.
|
||||
|
||||
### Handling Failures
|
||||
|
||||
#### Storage Controller Crash Or Restart
|
||||
|
||||
The storage controller may crash or be restarted outside of roll-outs. When a new pod is created, its call to
|
||||
`/step_down` will fail since the previous leader is no longer reachable. In this case perform the pre-existing
|
||||
start-up procedure and update the leader table (with the WHERE clause). If the update fails, the storage controller
|
||||
exists and consistency is maintained.
|
||||
|
||||
#### Previous Leader Crashes Before New Leader Readiness
|
||||
|
||||
When the previous leader (P1) crashes before the new leader (P2) passses the readiness check, Kubernetes will
|
||||
reconcile the old replica set and create a new pod for it (P1'). The `/step_down` API call will fail for P1'
|
||||
(see [2]).
|
||||
|
||||
Now we have two cases to consider:
|
||||
* P2 updates the `leader` table first: The database update from P1' will fail and P1' will exit, or be terminated
|
||||
by Kubernetes depending on timings.
|
||||
* P1' updates the `leader` table first: The `hostname` field of the `leader` row stays the same, but the `start_timestamp` field changes.
|
||||
The database update from P2 will fail (since `start_timestamp` does not match). P2 will exit and Kubernetes will
|
||||
create a new replacement pod for it (P2'). Now the entire dance starts again, but with P1' as the leader and P2' as the incumbent.
|
||||
|
||||
[2] P1 and P1' may (more likely than not) be the same pod and have the same hostname. The implementation
|
||||
should avoid this self reference and fail the API call at the client if the persisted hostname matches
|
||||
the current one.
|
||||
|
||||
#### Previous Leader Crashes After New Leader Readiness
|
||||
|
||||
The deployment's replica sets already satisfy the deployment's replica count requirements and the
|
||||
Kubernetes deployment rollout will just clean up the dead pod.
|
||||
|
||||
#### New Leader Crashes Before Pasing Readiness Check
|
||||
|
||||
The deployment controller scales up the new replica sets by creating a new pod. The entire procedure is repeated
|
||||
with the new pod.
|
||||
|
||||
#### Network Partition Between New Pod and Previous Leader
|
||||
|
||||
This feels very unlikely, but should be considered in any case. P2 (the new aspiring leader) fails the `/step_down`
|
||||
API call into P1 (the current leader). P2 proceeds with the pre-existing startup procedure and updates the `leader` table.
|
||||
Kubernetes will terminate P1, but there may be a brief period where both storage controller can drive reconciles.
|
||||
|
||||
### Dealing With Split Brain Scenarios
|
||||
|
||||
As we've seen in the previous section, we can end up with two storage controller running at the same time. The split brain
|
||||
duration is not bounded since the Kubernetes controller might become partitioned from the pods (unlikely though). While these
|
||||
scenarios are not fatal, they can cause tenant unavailability, so we'd like to reduce the chances of this happening.
|
||||
The rest of this section sketches some safety measure. It's likely overkill to implement all of them however.
|
||||
|
||||
### Ensure Leadership Before Producing Side Effects
|
||||
|
||||
The storage controller has two types of side effects: location config requests into pageservers and compute notifications into the control plane.
|
||||
Before issuing either, the storage controller could check that it is indeed still the leader by querying the database. Side effects might still be
|
||||
applied if they race with the database updatem, but the situation will eventually be detected. The storage controller process should terminate in these cases.
|
||||
|
||||
### Leadership Lease
|
||||
|
||||
Up until now, the leadership defined by this RFC is static. In order to bound the length of the split brain scenario, we could require the leadership
|
||||
to be renewed periodically. Two new columns would be added to the leaders table:
|
||||
1. `last_renewed` - timestamp indicating when the lease was last renewed
|
||||
2. `lease_duration` - duration indicating the amount of time after which the lease expires
|
||||
|
||||
The leader periodically attempts to renew the lease by checking that it is in fact still the legitimate leader and updating `last_renewed` in the
|
||||
same transaction. If the update fails, the process exits. New storage controller instances wishing to become leaders must wait for the current lease
|
||||
to expire before acquiring leadership if they have not succesfully received a response to the `/step_down` request.
|
||||
|
||||
### Notify Pageserver Of Storage Controller Term
|
||||
|
||||
Each time that leadership changes, we can bump a `term` integer column in the `leader` table. This term uniquely identifies a leader.
|
||||
Location config requests and re-attach responses can include this term. On the pageserver side, keep the latest term in memory and refuse
|
||||
anything which contains a stale term (i.e. smaller than the current one).
|
||||
|
||||
### Observability
|
||||
|
||||
* The storage controller should expose a metric which describes it's state (`Active | WarmingUp | SteppedDown`).
|
||||
Per region alerts should be added on this metric which triggers when:
|
||||
+ no storage controller has been in the `Active` state for an extended period of time
|
||||
+ more than one storage controllers are in the `Active` state
|
||||
|
||||
* An alert that periodically verifies that the `leader` table is in sync with the metric above would be very useful.
|
||||
We'd have to expose the storage controller read only database to Grafana (perhaps it is already done).
|
||||
|
||||
## Alternatives
|
||||
|
||||
### Kubernetes Leases
|
||||
|
||||
Kubernetes has a [lease primitive](https://kubernetes.io/docs/concepts/architecture/leases/) which can be used to implement leader election.
|
||||
Only one instance may hold a lease at any given time. This lease needs to be periodically renewed and has an expiration period.
|
||||
|
||||
In our case, it would work something like this:
|
||||
* `/step_down` deletes the lease or stops it from renewing
|
||||
* lease acquisition becomes part of the start-up procedure
|
||||
|
||||
The kubert crate implements a [lightweight lease API](https://docs.rs/kubert/latest/kubert/lease/struct.LeaseManager.html), but it's still
|
||||
not exactly trivial to implement.
|
||||
|
||||
This approach has the benefit of baked in observability (`kubectl describe lease`), but:
|
||||
* We offload the responsibility to Kubernetes which makes it harder to debug when things go wrong.
|
||||
* More code surface than the simple "row in database" approach. Also, most of this code would be in
|
||||
a dependency not subject to code review, etc.
|
||||
* Hard to test. Our testing infra does not run the storage controller in Kubernetes and changing it do
|
||||
so is not simple and complictes and the test set-up.
|
||||
|
||||
To my mind, the "row in database" approach is straightforward enough that we don't have to offload this
|
||||
to something external.
|
||||
@@ -21,30 +21,21 @@ _Example: 15.4 is the new minor version to upgrade to from 15.3._
|
||||
1. Create a new branch based on the stable branch you are updating.
|
||||
|
||||
```shell
|
||||
git checkout -b my-branch REL_15_STABLE_neon
|
||||
git checkout -b my-branch-15 REL_15_STABLE_neon
|
||||
```
|
||||
|
||||
1. Tag the last commit on the stable branch you are updating.
|
||||
1. Find the upstream release tags you're looking for. They are of the form `REL_X_Y`.
|
||||
|
||||
```shell
|
||||
git tag REL_15_3_neon
|
||||
```
|
||||
|
||||
1. Push the new tag to the Neon Postgres repository.
|
||||
|
||||
```shell
|
||||
git push origin REL_15_3_neon
|
||||
```
|
||||
|
||||
1. Find the release tags you're looking for. They are of the form `REL_X_Y`.
|
||||
|
||||
1. Rebase the branch you created on the tag and resolve any conflicts.
|
||||
1. Merge the upstream tag into the branch you created on the tag and resolve any conflicts.
|
||||
|
||||
```shell
|
||||
git fetch upstream REL_15_4
|
||||
git rebase REL_15_4
|
||||
git merge REL_15_4
|
||||
```
|
||||
|
||||
In the commit message of the merge commit, mention if there were
|
||||
any non-trivial conflicts or other issues.
|
||||
|
||||
1. Run the Postgres test suite to make sure our commits have not affected
|
||||
Postgres in a negative way.
|
||||
|
||||
@@ -57,7 +48,7 @@ Postgres in a negative way.
|
||||
1. Push your branch to the Neon Postgres repository.
|
||||
|
||||
```shell
|
||||
git push origin my-branch
|
||||
git push origin my-branch-15
|
||||
```
|
||||
|
||||
1. Clone the Neon repository if you have not done so already.
|
||||
@@ -74,7 +65,7 @@ branch.
|
||||
1. Update the Git submodule.
|
||||
|
||||
```shell
|
||||
git submodule set-branch --branch my-branch vendor/postgres-v15
|
||||
git submodule set-branch --branch my-branch-15 vendor/postgres-v15
|
||||
git submodule update --remote vendor/postgres-v15
|
||||
```
|
||||
|
||||
@@ -89,14 +80,12 @@ minor Postgres release.
|
||||
|
||||
1. Create a pull request, and wait for CI to go green.
|
||||
|
||||
1. Force push the rebased Postgres branches into the Neon Postgres repository.
|
||||
1. Push the Postgres branches with the merge commits into the Neon Postgres repository.
|
||||
|
||||
```shell
|
||||
git push --force origin my-branch:REL_15_STABLE_neon
|
||||
git push origin my-branch-15:REL_15_STABLE_neon
|
||||
```
|
||||
|
||||
It may require disabling various branch protections.
|
||||
|
||||
1. Update your Neon PR to point at the branches.
|
||||
|
||||
```shell
|
||||
|
||||
@@ -14,5 +14,3 @@ regex.workspace = true
|
||||
|
||||
utils = { path = "../utils" }
|
||||
remote_storage = { version = "0.1", path = "../remote_storage/" }
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -6,10 +6,8 @@ license = "Apache-2.0"
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
chrono.workspace = true
|
||||
chrono = { workspace = true, features = ["serde"] }
|
||||
rand.workspace = true
|
||||
serde.workspace = true
|
||||
serde_with.workspace = true
|
||||
utils.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -14,5 +14,3 @@ parking_lot.workspace = true
|
||||
hex.workspace = true
|
||||
scopeguard.workspace = true
|
||||
smallvec = { workspace = true, features = ["write"] }
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -12,8 +12,6 @@ chrono.workspace = true
|
||||
twox-hash.workspace = true
|
||||
measured.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[target.'cfg(target_os = "linux")'.dependencies]
|
||||
procfs.workspace = true
|
||||
measured-process.workspace = true
|
||||
|
||||
@@ -21,11 +21,9 @@ hex.workspace = true
|
||||
humantime.workspace = true
|
||||
thiserror.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
chrono.workspace = true
|
||||
chrono = { workspace = true, features = ["serde"] }
|
||||
itertools.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
bincode.workspace = true
|
||||
rand.workspace = true
|
||||
|
||||
@@ -8,6 +8,7 @@ use std::time::{Duration, Instant};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::{NodeId, TenantId};
|
||||
|
||||
use crate::models::PageserverUtilization;
|
||||
use crate::{
|
||||
models::{ShardParameters, TenantConfig},
|
||||
shard::{ShardStripeSize, TenantShardId},
|
||||
@@ -55,6 +56,8 @@ pub struct NodeRegisterRequest {
|
||||
|
||||
pub listen_http_addr: String,
|
||||
pub listen_http_port: u16,
|
||||
|
||||
pub availability_zone_id: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
@@ -140,23 +143,11 @@ pub struct TenantShardMigrateRequest {
|
||||
pub node_id: NodeId,
|
||||
}
|
||||
|
||||
/// Utilisation score indicating how good a candidate a pageserver
|
||||
/// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
|
||||
/// Lower values are better.
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)]
|
||||
pub struct UtilizationScore(pub u64);
|
||||
|
||||
impl UtilizationScore {
|
||||
pub fn worst() -> Self {
|
||||
UtilizationScore(u64::MAX)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Clone, Copy, Debug)]
|
||||
#[derive(Serialize, Clone, Debug)]
|
||||
#[serde(into = "NodeAvailabilityWrapper")]
|
||||
pub enum NodeAvailability {
|
||||
// Normal, happy state
|
||||
Active(UtilizationScore),
|
||||
Active(PageserverUtilization),
|
||||
// Node is warming up, but we expect it to become available soon. Covers
|
||||
// the time span between the re-attach response being composed on the storage controller
|
||||
// and the first successful heartbeat after the processing of the re-attach response
|
||||
@@ -195,7 +186,9 @@ impl From<NodeAvailabilityWrapper> for NodeAvailability {
|
||||
match val {
|
||||
// Assume the worst utilisation score to begin with. It will later be updated by
|
||||
// the heartbeats.
|
||||
NodeAvailabilityWrapper::Active => NodeAvailability::Active(UtilizationScore::worst()),
|
||||
NodeAvailabilityWrapper::Active => {
|
||||
NodeAvailability::Active(PageserverUtilization::full())
|
||||
}
|
||||
NodeAvailabilityWrapper::WarmingUp => NodeAvailability::WarmingUp(Instant::now()),
|
||||
NodeAvailabilityWrapper::Offline => NodeAvailability::Offline,
|
||||
}
|
||||
|
||||
@@ -108,14 +108,41 @@ impl Key {
|
||||
}
|
||||
}
|
||||
|
||||
/// This function checks more extensively what keys we can take on the write path.
|
||||
/// If a key beginning with 00 does not have a global/default tablespace OID, it
|
||||
/// will be rejected on the write path.
|
||||
#[allow(dead_code)]
|
||||
pub fn is_valid_key_on_write_path_strong(&self) -> bool {
|
||||
use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
|
||||
if !self.is_i128_representable() {
|
||||
return false;
|
||||
}
|
||||
if self.field1 == 0
|
||||
&& !(self.field2 == GLOBALTABLESPACE_OID
|
||||
|| self.field2 == DEFAULTTABLESPACE_OID
|
||||
|| self.field2 == 0)
|
||||
{
|
||||
return false; // User defined tablespaces are not supported
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
/// This is a weaker version of `is_valid_key_on_write_path_strong` that simply
|
||||
/// checks if the key is i128 representable. Note that some keys can be successfully
|
||||
/// ingested into the pageserver, but will cause errors on generating basebackup.
|
||||
pub fn is_valid_key_on_write_path(&self) -> bool {
|
||||
self.is_i128_representable()
|
||||
}
|
||||
|
||||
pub fn is_i128_representable(&self) -> bool {
|
||||
self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222
|
||||
}
|
||||
|
||||
/// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish.
|
||||
/// As long as Neon does not support tablespace (because of lack of access to local file system),
|
||||
/// we can assume that only some predefined namespace OIDs are used which can fit in u16
|
||||
pub fn to_i128(&self) -> i128 {
|
||||
assert!(
|
||||
self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222,
|
||||
"invalid key: {self}",
|
||||
);
|
||||
assert!(self.is_i128_representable(), "invalid key: {self}");
|
||||
(((self.field1 & 0x7F) as i128) << 120)
|
||||
| (((self.field2 & 0xFFFF) as i128) << 104)
|
||||
| ((self.field3 as i128) << 72)
|
||||
@@ -236,6 +263,15 @@ impl Key {
|
||||
field5: u8::MAX,
|
||||
field6: u32::MAX,
|
||||
};
|
||||
/// A key slightly smaller than [`Key::MAX`] for use in layer key ranges to avoid them to be confused with L0 layers
|
||||
pub const NON_L0_MAX: Key = Key {
|
||||
field1: u8::MAX,
|
||||
field2: u32::MAX,
|
||||
field3: u32::MAX,
|
||||
field4: u32::MAX,
|
||||
field5: u8::MAX,
|
||||
field6: u32::MAX - 1,
|
||||
};
|
||||
|
||||
pub fn from_hex(s: &str) -> Result<Self> {
|
||||
if s.len() != 36 {
|
||||
|
||||
@@ -48,7 +48,7 @@ pub struct ShardedRange<'a> {
|
||||
|
||||
// Calculate the size of a range within the blocks of the same relation, or spanning only the
|
||||
// top page in the previous relation's space.
|
||||
fn contiguous_range_len(range: &Range<Key>) -> u32 {
|
||||
pub fn contiguous_range_len(range: &Range<Key>) -> u32 {
|
||||
debug_assert!(is_contiguous_range(range));
|
||||
if range.start.field6 == 0xffffffff {
|
||||
range.end.field6 + 1
|
||||
@@ -67,7 +67,7 @@ fn contiguous_range_len(range: &Range<Key>) -> u32 {
|
||||
/// This matters, because:
|
||||
/// - Within such ranges, keys are used contiguously. Outside such ranges it is sparse.
|
||||
/// - Within such ranges, we may calculate distances using simple subtraction of field6.
|
||||
fn is_contiguous_range(range: &Range<Key>) -> bool {
|
||||
pub fn is_contiguous_range(range: &Range<Key>) -> bool {
|
||||
range.start.field1 == range.end.field1
|
||||
&& range.start.field2 == range.end.field2
|
||||
&& range.start.field3 == range.end.field3
|
||||
|
||||
@@ -7,7 +7,7 @@ pub use utilization::PageserverUtilization;
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
io::{BufRead, Read},
|
||||
num::{NonZeroU64, NonZeroUsize},
|
||||
num::{NonZeroU32, NonZeroU64, NonZeroUsize},
|
||||
str::FromStr,
|
||||
sync::atomic::AtomicUsize,
|
||||
time::{Duration, SystemTime},
|
||||
@@ -348,7 +348,7 @@ impl AuxFilePolicy {
|
||||
|
||||
/// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used.
|
||||
pub fn default_tenant_config() -> Self {
|
||||
Self::V1
|
||||
Self::V2
|
||||
}
|
||||
}
|
||||
|
||||
@@ -486,12 +486,11 @@ pub struct EvictionPolicyLayerAccessThreshold {
|
||||
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
|
||||
pub struct ThrottleConfig {
|
||||
pub task_kinds: Vec<String>, // TaskKind
|
||||
pub initial: usize,
|
||||
pub initial: u32,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub refill_interval: Duration,
|
||||
pub refill_amount: NonZeroUsize,
|
||||
pub max: usize,
|
||||
pub fair: bool,
|
||||
pub refill_amount: NonZeroU32,
|
||||
pub max: u32,
|
||||
}
|
||||
|
||||
impl ThrottleConfig {
|
||||
@@ -501,9 +500,8 @@ impl ThrottleConfig {
|
||||
// other values don't matter with emtpy `task_kinds`.
|
||||
initial: 0,
|
||||
refill_interval: Duration::from_millis(1),
|
||||
refill_amount: NonZeroUsize::new(1).unwrap(),
|
||||
refill_amount: NonZeroU32::new(1).unwrap(),
|
||||
max: 1,
|
||||
fair: true,
|
||||
}
|
||||
}
|
||||
/// The requests per second allowed by the given config.
|
||||
@@ -718,6 +716,7 @@ pub struct TimelineInfo {
|
||||
pub pg_version: u32,
|
||||
|
||||
pub state: TimelineState,
|
||||
pub is_archived: bool,
|
||||
|
||||
pub walreceiver_status: String,
|
||||
|
||||
@@ -1062,7 +1061,7 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
|
||||
}
|
||||
}
|
||||
|
||||
// In the V2 protocol version, a GetPage request contains two LSN values:
|
||||
// A GetPage request contains two LSN values:
|
||||
//
|
||||
// request_lsn: Get the page version at this point in time. Lsn::Max is a special value that means
|
||||
// "get the latest version present". It's used by the primary server, which knows that no one else
|
||||
@@ -1075,7 +1074,7 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
|
||||
// passing an earlier LSN can speed up the request, by allowing the pageserver to process the
|
||||
// request without waiting for 'request_lsn' to arrive.
|
||||
//
|
||||
// The legacy V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
|
||||
// The now-defunct V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
|
||||
// sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and
|
||||
// 'latest' was set to true. The V2 interface was added because there was no correct way for a
|
||||
// standby to request a page at a particular non-latest LSN, and also include the
|
||||
@@ -1083,15 +1082,11 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
|
||||
// request, if the standby knows that the page hasn't been modified since, and risk getting an error
|
||||
// if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could
|
||||
// require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2
|
||||
// interface allows sending both LSNs, and let the pageserver do the right thing. There is no
|
||||
// interface allows sending both LSNs, and let the pageserver do the right thing. There was no
|
||||
// difference in the responses between V1 and V2.
|
||||
//
|
||||
// The Request structs below reflect the V2 interface. If V1 is used, the parse function
|
||||
// maps the old format requests to the new format.
|
||||
//
|
||||
#[derive(Clone, Copy)]
|
||||
pub enum PagestreamProtocolVersion {
|
||||
V1,
|
||||
V2,
|
||||
}
|
||||
|
||||
@@ -1230,36 +1225,17 @@ impl PagestreamFeMessage {
|
||||
bytes.into()
|
||||
}
|
||||
|
||||
pub fn parse<R: std::io::Read>(
|
||||
body: &mut R,
|
||||
protocol_version: PagestreamProtocolVersion,
|
||||
) -> anyhow::Result<PagestreamFeMessage> {
|
||||
pub fn parse<R: std::io::Read>(body: &mut R) -> anyhow::Result<PagestreamFeMessage> {
|
||||
// these correspond to the NeonMessageTag enum in pagestore_client.h
|
||||
//
|
||||
// TODO: consider using protobuf or serde bincode for less error prone
|
||||
// serialization.
|
||||
let msg_tag = body.read_u8()?;
|
||||
|
||||
let (request_lsn, not_modified_since) = match protocol_version {
|
||||
PagestreamProtocolVersion::V2 => (
|
||||
Lsn::from(body.read_u64::<BigEndian>()?),
|
||||
Lsn::from(body.read_u64::<BigEndian>()?),
|
||||
),
|
||||
PagestreamProtocolVersion::V1 => {
|
||||
// In the old protocol, each message starts with a boolean 'latest' flag,
|
||||
// followed by 'lsn'. Convert that to the two LSNs, 'request_lsn' and
|
||||
// 'not_modified_since', used in the new protocol version.
|
||||
let latest = body.read_u8()? != 0;
|
||||
let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
|
||||
if latest {
|
||||
(Lsn::MAX, request_lsn) // get latest version
|
||||
} else {
|
||||
(request_lsn, request_lsn) // get version at specified LSN
|
||||
}
|
||||
}
|
||||
};
|
||||
// these two fields are the same for every request type
|
||||
let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
|
||||
let not_modified_since = Lsn::from(body.read_u64::<BigEndian>()?);
|
||||
|
||||
// The rest of the messages are the same between V1 and V2
|
||||
match msg_tag {
|
||||
0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
|
||||
request_lsn,
|
||||
@@ -1467,9 +1443,7 @@ mod tests {
|
||||
];
|
||||
for msg in messages {
|
||||
let bytes = msg.serialize();
|
||||
let reconstructed =
|
||||
PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V2)
|
||||
.unwrap();
|
||||
let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap();
|
||||
assert!(msg == reconstructed);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -38,7 +38,7 @@ pub struct PageserverUtilization {
|
||||
pub max_shard_count: u32,
|
||||
|
||||
/// Cached result of [`Self::score`]
|
||||
pub utilization_score: u64,
|
||||
pub utilization_score: Option<u64>,
|
||||
|
||||
/// When was this snapshot captured, pageserver local time.
|
||||
///
|
||||
@@ -50,6 +50,8 @@ fn unity_percent() -> Percent {
|
||||
Percent::new(0).unwrap()
|
||||
}
|
||||
|
||||
pub type RawScore = u64;
|
||||
|
||||
impl PageserverUtilization {
|
||||
const UTILIZATION_FULL: u64 = 1000000;
|
||||
|
||||
@@ -62,7 +64,7 @@ impl PageserverUtilization {
|
||||
/// - Negative values are forbidden
|
||||
/// - Values over UTILIZATION_FULL indicate an overloaded node, which may show degraded performance due to
|
||||
/// layer eviction.
|
||||
pub fn score(&self) -> u64 {
|
||||
pub fn score(&self) -> RawScore {
|
||||
let disk_usable_capacity = ((self.disk_usage_bytes + self.free_space_bytes)
|
||||
* self.disk_usable_pct.get() as u64)
|
||||
/ 100;
|
||||
@@ -74,8 +76,30 @@ impl PageserverUtilization {
|
||||
std::cmp::max(disk_utilization_score, shard_utilization_score)
|
||||
}
|
||||
|
||||
pub fn refresh_score(&mut self) {
|
||||
self.utilization_score = self.score();
|
||||
pub fn cached_score(&mut self) -> RawScore {
|
||||
match self.utilization_score {
|
||||
None => {
|
||||
let s = self.score();
|
||||
self.utilization_score = Some(s);
|
||||
s
|
||||
}
|
||||
Some(s) => s,
|
||||
}
|
||||
}
|
||||
|
||||
/// If a node is currently hosting more work than it can comfortably handle. This does not indicate that
|
||||
/// it will fail, but it is a strong signal that more work should not be added unless there is no alternative.
|
||||
pub fn is_overloaded(score: RawScore) -> bool {
|
||||
score >= Self::UTILIZATION_FULL
|
||||
}
|
||||
|
||||
pub fn adjust_shard_count_max(&mut self, shard_count: u32) {
|
||||
if self.shard_count < shard_count {
|
||||
self.shard_count = shard_count;
|
||||
|
||||
// Dirty cache: this will be calculated next time someone retrives the score
|
||||
self.utilization_score = None;
|
||||
}
|
||||
}
|
||||
|
||||
/// A utilization structure that has a full utilization score: use this as a placeholder when
|
||||
@@ -88,7 +112,38 @@ impl PageserverUtilization {
|
||||
disk_usable_pct: Percent::new(100).unwrap(),
|
||||
shard_count: 1,
|
||||
max_shard_count: 1,
|
||||
utilization_score: Self::UTILIZATION_FULL,
|
||||
utilization_score: Some(Self::UTILIZATION_FULL),
|
||||
captured_at: serde_system_time::SystemTime(SystemTime::now()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Test helper
|
||||
pub mod test_utilization {
|
||||
use super::PageserverUtilization;
|
||||
use std::time::SystemTime;
|
||||
use utils::{
|
||||
serde_percent::Percent,
|
||||
serde_system_time::{self},
|
||||
};
|
||||
|
||||
// Parameters of the imaginary node used for test utilization instances
|
||||
const TEST_DISK_SIZE: u64 = 1024 * 1024 * 1024 * 1024;
|
||||
const TEST_SHARDS_MAX: u32 = 1000;
|
||||
|
||||
/// Unit test helper. Unconditionally compiled because cfg(test) doesn't carry across crates. Do
|
||||
/// not abuse this function from non-test code.
|
||||
///
|
||||
/// Emulates a node with a 1000 shard limit and a 1TB disk.
|
||||
pub fn simple(shard_count: u32, disk_wanted_bytes: u64) -> PageserverUtilization {
|
||||
PageserverUtilization {
|
||||
disk_usage_bytes: disk_wanted_bytes,
|
||||
free_space_bytes: TEST_DISK_SIZE - std::cmp::min(disk_wanted_bytes, TEST_DISK_SIZE),
|
||||
disk_wanted_bytes,
|
||||
disk_usable_pct: Percent::new(100).unwrap(),
|
||||
shard_count,
|
||||
max_shard_count: TEST_SHARDS_MAX,
|
||||
utilization_score: None,
|
||||
captured_at: serde_system_time::SystemTime(SystemTime::now()),
|
||||
}
|
||||
}
|
||||
@@ -120,7 +175,7 @@ mod tests {
|
||||
disk_usage_bytes: u64::MAX,
|
||||
free_space_bytes: 0,
|
||||
disk_wanted_bytes: u64::MAX,
|
||||
utilization_score: 13,
|
||||
utilization_score: Some(13),
|
||||
disk_usable_pct: Percent::new(90).unwrap(),
|
||||
shard_count: 100,
|
||||
max_shard_count: 200,
|
||||
|
||||
@@ -18,7 +18,6 @@ tokio-rustls.workspace = true
|
||||
tracing.workspace = true
|
||||
|
||||
pq_proto.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
once_cell.workspace = true
|
||||
|
||||
@@ -11,7 +11,5 @@ postgres.workspace = true
|
||||
tokio-postgres.workspace = true
|
||||
url.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
once_cell.workspace = true
|
||||
|
||||
@@ -144,7 +144,20 @@ impl PgConnectionConfig {
|
||||
// implement and this function is hardly a bottleneck. The function is only called around
|
||||
// establishing a new connection.
|
||||
#[allow(unstable_name_collisions)]
|
||||
config.options(&encode_options(&self.options));
|
||||
config.options(
|
||||
&self
|
||||
.options
|
||||
.iter()
|
||||
.map(|s| {
|
||||
if s.contains(['\\', ' ']) {
|
||||
Cow::Owned(s.replace('\\', "\\\\").replace(' ', "\\ "))
|
||||
} else {
|
||||
Cow::Borrowed(s.as_str())
|
||||
}
|
||||
})
|
||||
.intersperse(Cow::Borrowed(" ")) // TODO: use impl from std once it's stabilized
|
||||
.collect::<String>(),
|
||||
);
|
||||
}
|
||||
config
|
||||
}
|
||||
@@ -165,21 +178,6 @@ impl PgConnectionConfig {
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(unstable_name_collisions)]
|
||||
fn encode_options(options: &[String]) -> String {
|
||||
options
|
||||
.iter()
|
||||
.map(|s| {
|
||||
if s.contains(['\\', ' ']) {
|
||||
Cow::Owned(s.replace('\\', "\\\\").replace(' ', "\\ "))
|
||||
} else {
|
||||
Cow::Borrowed(s.as_str())
|
||||
}
|
||||
})
|
||||
.intersperse(Cow::Borrowed(" ")) // TODO: use impl from std once it's stabilized
|
||||
.collect::<String>()
|
||||
}
|
||||
|
||||
impl fmt::Display for PgConnectionConfig {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
// The password is intentionally hidden and not part of this display string.
|
||||
@@ -208,7 +206,7 @@ impl fmt::Debug for PgConnectionConfig {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests_pg_connection_config {
|
||||
use crate::{encode_options, PgConnectionConfig};
|
||||
use crate::PgConnectionConfig;
|
||||
use once_cell::sync::Lazy;
|
||||
use url::Host;
|
||||
|
||||
@@ -257,12 +255,18 @@ mod tests_pg_connection_config {
|
||||
|
||||
#[test]
|
||||
fn test_with_options() {
|
||||
let options = encode_options(&[
|
||||
"hello".to_owned(),
|
||||
"world".to_owned(),
|
||||
"with space".to_owned(),
|
||||
"and \\ backslashes".to_owned(),
|
||||
let cfg = PgConnectionConfig::new_host_port(STUB_HOST.clone(), 123).extend_options([
|
||||
"hello",
|
||||
"world",
|
||||
"with space",
|
||||
"and \\ backslashes",
|
||||
]);
|
||||
assert_eq!(options, "hello world with\\ space and\\ \\\\\\ backslashes");
|
||||
assert_eq!(cfg.host(), &*STUB_HOST);
|
||||
assert_eq!(cfg.port(), 123);
|
||||
assert_eq!(cfg.raw_address(), "stub.host.example:123");
|
||||
assert_eq!(
|
||||
cfg.to_tokio_postgres_config().get_options(),
|
||||
Some("hello world with\\ space and\\ \\\\\\ backslashes")
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,8 +19,6 @@ thiserror.workspace = true
|
||||
serde.workspace = true
|
||||
utils.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
env_logger.workspace = true
|
||||
postgres.workspace = true
|
||||
|
||||
@@ -136,9 +136,9 @@ pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16;
|
||||
|
||||
// Export some version independent functions that are used outside of this mod
|
||||
pub use v14::xlog_utils::encode_logical_message;
|
||||
pub use v14::xlog_utils::from_pg_timestamp;
|
||||
pub use v14::xlog_utils::get_current_timestamp;
|
||||
pub use v14::xlog_utils::to_pg_timestamp;
|
||||
pub use v14::xlog_utils::try_from_pg_timestamp;
|
||||
pub use v14::xlog_utils::XLogFileName;
|
||||
|
||||
pub use v14::bindings::DBState_DB_SHUTDOWNED;
|
||||
|
||||
@@ -135,6 +135,8 @@ pub fn get_current_timestamp() -> TimestampTz {
|
||||
mod timestamp_conversions {
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::Context;
|
||||
|
||||
use super::*;
|
||||
|
||||
const UNIX_EPOCH_JDATE: u64 = 2440588; // == date2j(1970, 1, 1)
|
||||
@@ -154,18 +156,18 @@ mod timestamp_conversions {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_pg_timestamp(time: TimestampTz) -> SystemTime {
|
||||
pub fn try_from_pg_timestamp(time: TimestampTz) -> anyhow::Result<SystemTime> {
|
||||
let time: u64 = time
|
||||
.try_into()
|
||||
.expect("timestamp before millenium (postgres epoch)");
|
||||
.context("timestamp before millenium (postgres epoch)")?;
|
||||
let since_unix_epoch = time + SECS_DIFF_UNIX_TO_POSTGRES_EPOCH * USECS_PER_SEC;
|
||||
SystemTime::UNIX_EPOCH
|
||||
.checked_add(Duration::from_micros(since_unix_epoch))
|
||||
.expect("SystemTime overflow")
|
||||
.context("SystemTime overflow")
|
||||
}
|
||||
}
|
||||
|
||||
pub use timestamp_conversions::{from_pg_timestamp, to_pg_timestamp};
|
||||
pub use timestamp_conversions::{to_pg_timestamp, try_from_pg_timestamp};
|
||||
|
||||
// Returns (aligned) end_lsn of the last record in data_dir with WAL segments.
|
||||
// start_lsn must point to some previously known record boundary (beginning of
|
||||
@@ -545,14 +547,14 @@ mod tests {
|
||||
#[test]
|
||||
fn test_ts_conversion() {
|
||||
let now = SystemTime::now();
|
||||
let round_trip = from_pg_timestamp(to_pg_timestamp(now));
|
||||
let round_trip = try_from_pg_timestamp(to_pg_timestamp(now)).unwrap();
|
||||
|
||||
let now_since = now.duration_since(SystemTime::UNIX_EPOCH).unwrap();
|
||||
let round_trip_since = round_trip.duration_since(SystemTime::UNIX_EPOCH).unwrap();
|
||||
assert_eq!(now_since.as_micros(), round_trip_since.as_micros());
|
||||
|
||||
let now_pg = get_current_timestamp();
|
||||
let round_trip_pg = to_pg_timestamp(from_pg_timestamp(now_pg));
|
||||
let round_trip_pg = to_pg_timestamp(try_from_pg_timestamp(now_pg).unwrap());
|
||||
|
||||
assert_eq!(now_pg, round_trip_pg);
|
||||
}
|
||||
|
||||
@@ -14,8 +14,6 @@ postgres.workspace = true
|
||||
postgres_ffi.workspace = true
|
||||
camino-tempfile.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
regex.workspace = true
|
||||
utils.workspace = true
|
||||
|
||||
@@ -11,9 +11,7 @@ itertools.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
postgres-protocol.workspace = true
|
||||
rand.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio = { workspace = true, features = ["io-util"] }
|
||||
tracing.workspace = true
|
||||
thiserror.workspace = true
|
||||
serde.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -32,7 +32,7 @@ scopeguard.workspace = true
|
||||
metrics.workspace = true
|
||||
utils.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
|
||||
azure_core.workspace = true
|
||||
azure_identity.workspace = true
|
||||
azure_storage.workspace = true
|
||||
@@ -46,3 +46,4 @@ sync_wrapper = { workspace = true, features = ["futures"] }
|
||||
camino-tempfile.workspace = true
|
||||
test-context.workspace = true
|
||||
rand.workspace = true
|
||||
tokio = { workspace = true, features = ["test-util"] }
|
||||
|
||||
@@ -383,6 +383,48 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
}
|
||||
}
|
||||
|
||||
async fn head_object(
|
||||
&self,
|
||||
key: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<ListingObject, DownloadError> {
|
||||
let kind = RequestKind::Head;
|
||||
let _permit = self.permit(kind, cancel).await?;
|
||||
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(key));
|
||||
let properties_future = blob_client.get_properties().into_future();
|
||||
|
||||
let properties_future = tokio::time::timeout(self.timeout, properties_future);
|
||||
|
||||
let res = tokio::select! {
|
||||
res = properties_future => res,
|
||||
_ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
|
||||
};
|
||||
|
||||
if let Ok(inner) = &res {
|
||||
// do not incl. timeouts as errors in metrics but cancellations
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
crate::metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, inner, started_at);
|
||||
}
|
||||
|
||||
let data = match res {
|
||||
Ok(Ok(data)) => Ok(data),
|
||||
Ok(Err(sdk)) => Err(to_download_error(sdk)),
|
||||
Err(_timeout) => Err(DownloadError::Timeout),
|
||||
}?;
|
||||
|
||||
let properties = data.blob.properties;
|
||||
Ok(ListingObject {
|
||||
key: key.to_owned(),
|
||||
last_modified: SystemTime::from(properties.last_modified),
|
||||
size: properties.content_length,
|
||||
})
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
|
||||
@@ -150,7 +150,7 @@ pub enum ListingMode {
|
||||
NoDelimiter,
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Eq, Debug)]
|
||||
#[derive(PartialEq, Eq, Debug, Clone)]
|
||||
pub struct ListingObject {
|
||||
pub key: RemotePath,
|
||||
pub last_modified: SystemTime,
|
||||
@@ -215,6 +215,13 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
Ok(combined)
|
||||
}
|
||||
|
||||
/// Obtain metadata information about an object.
|
||||
async fn head_object(
|
||||
&self,
|
||||
key: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<ListingObject, DownloadError>;
|
||||
|
||||
/// Streams the local file contents into remote into the remote storage entry.
|
||||
///
|
||||
/// If the operation fails because of timeout or cancellation, the root cause of the error will be
|
||||
@@ -363,6 +370,20 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
||||
}
|
||||
}
|
||||
|
||||
// See [`RemoteStorage::head_object`].
|
||||
pub async fn head_object(
|
||||
&self,
|
||||
key: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<ListingObject, DownloadError> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.head_object(key, cancel).await,
|
||||
Self::AwsS3(s) => s.head_object(key, cancel).await,
|
||||
Self::AzureBlob(s) => s.head_object(key, cancel).await,
|
||||
Self::Unreliable(s) => s.head_object(key, cancel).await,
|
||||
}
|
||||
}
|
||||
|
||||
/// See [`RemoteStorage::upload`]
|
||||
pub async fn upload(
|
||||
&self,
|
||||
@@ -598,6 +619,7 @@ impl ConcurrencyLimiter {
|
||||
RequestKind::Delete => &self.write,
|
||||
RequestKind::Copy => &self.write,
|
||||
RequestKind::TimeTravel => &self.write,
|
||||
RequestKind::Head => &self.read,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -445,6 +445,20 @@ impl RemoteStorage for LocalFs {
|
||||
}
|
||||
}
|
||||
|
||||
async fn head_object(
|
||||
&self,
|
||||
key: &RemotePath,
|
||||
_cancel: &CancellationToken,
|
||||
) -> Result<ListingObject, DownloadError> {
|
||||
let target_file_path = key.with_base(&self.storage_root);
|
||||
let metadata = file_metadata(&target_file_path).await?;
|
||||
Ok(ListingObject {
|
||||
key: key.clone(),
|
||||
last_modified: metadata.modified()?,
|
||||
size: metadata.len(),
|
||||
})
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync,
|
||||
|
||||
@@ -13,6 +13,7 @@ pub(crate) enum RequestKind {
|
||||
List = 3,
|
||||
Copy = 4,
|
||||
TimeTravel = 5,
|
||||
Head = 6,
|
||||
}
|
||||
|
||||
use scopeguard::ScopeGuard;
|
||||
@@ -27,6 +28,7 @@ impl RequestKind {
|
||||
List => "list_objects",
|
||||
Copy => "copy_object",
|
||||
TimeTravel => "time_travel_recover",
|
||||
Head => "head_object",
|
||||
}
|
||||
}
|
||||
const fn as_index(&self) -> usize {
|
||||
@@ -34,7 +36,8 @@ impl RequestKind {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct RequestTyped<C>([C; 6]);
|
||||
const REQUEST_KIND_COUNT: usize = 7;
|
||||
pub(crate) struct RequestTyped<C>([C; REQUEST_KIND_COUNT]);
|
||||
|
||||
impl<C> RequestTyped<C> {
|
||||
pub(crate) fn get(&self, kind: RequestKind) -> &C {
|
||||
@@ -43,8 +46,8 @@ impl<C> RequestTyped<C> {
|
||||
|
||||
fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
|
||||
use RequestKind::*;
|
||||
let mut it = [Get, Put, Delete, List, Copy, TimeTravel].into_iter();
|
||||
let arr = std::array::from_fn::<C, 6, _>(|index| {
|
||||
let mut it = [Get, Put, Delete, List, Copy, TimeTravel, Head].into_iter();
|
||||
let arr = std::array::from_fn::<C, REQUEST_KIND_COUNT, _>(|index| {
|
||||
let next = it.next().unwrap();
|
||||
assert_eq!(index, next.as_index());
|
||||
f(next)
|
||||
|
||||
@@ -23,7 +23,7 @@ use aws_config::{
|
||||
use aws_sdk_s3::{
|
||||
config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep},
|
||||
error::SdkError,
|
||||
operation::get_object::GetObjectError,
|
||||
operation::{get_object::GetObjectError, head_object::HeadObjectError},
|
||||
types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass},
|
||||
Client,
|
||||
};
|
||||
@@ -604,6 +604,78 @@ impl RemoteStorage for S3Bucket {
|
||||
}
|
||||
}
|
||||
|
||||
async fn head_object(
|
||||
&self,
|
||||
key: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<ListingObject, DownloadError> {
|
||||
let kind = RequestKind::Head;
|
||||
let _permit = self.permit(kind, cancel).await?;
|
||||
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let head_future = self
|
||||
.client
|
||||
.head_object()
|
||||
.bucket(self.bucket_name())
|
||||
.key(self.relative_path_to_s3_object(key))
|
||||
.send();
|
||||
|
||||
let head_future = tokio::time::timeout(self.timeout, head_future);
|
||||
|
||||
let res = tokio::select! {
|
||||
res = head_future => res,
|
||||
_ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
|
||||
};
|
||||
|
||||
let res = res.map_err(|_e| DownloadError::Timeout)?;
|
||||
|
||||
// do not incl. timeouts as errors in metrics but cancellations
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
crate::metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &res, started_at);
|
||||
|
||||
let data = match res {
|
||||
Ok(object_output) => object_output,
|
||||
Err(SdkError::ServiceError(e)) if matches!(e.err(), HeadObjectError::NotFound(_)) => {
|
||||
// Count this in the AttemptOutcome::Ok bucket, because 404 is not
|
||||
// an error: we expect to sometimes fetch an object and find it missing,
|
||||
// e.g. when probing for timeline indices.
|
||||
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
kind,
|
||||
AttemptOutcome::Ok,
|
||||
started_at,
|
||||
);
|
||||
return Err(DownloadError::NotFound);
|
||||
}
|
||||
Err(e) => {
|
||||
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
kind,
|
||||
AttemptOutcome::Err,
|
||||
started_at,
|
||||
);
|
||||
|
||||
return Err(DownloadError::Other(
|
||||
anyhow::Error::new(e).context("s3 head object"),
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
let (Some(last_modified), Some(size)) = (data.last_modified, data.content_length) else {
|
||||
return Err(DownloadError::Other(anyhow!(
|
||||
"head_object doesn't contain last_modified or content_length"
|
||||
)))?;
|
||||
};
|
||||
Ok(ListingObject {
|
||||
key: key.to_owned(),
|
||||
last_modified: SystemTime::try_from(last_modified).map_err(|e| {
|
||||
DownloadError::Other(anyhow!("can't convert time '{last_modified}': {e}"))
|
||||
})?,
|
||||
size: size as u64,
|
||||
})
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
|
||||
@@ -30,6 +30,7 @@ pub struct UnreliableWrapper {
|
||||
#[derive(Debug, Hash, Eq, PartialEq)]
|
||||
enum RemoteOp {
|
||||
ListPrefixes(Option<RemotePath>),
|
||||
HeadObject(RemotePath),
|
||||
Upload(RemotePath),
|
||||
Download(RemotePath),
|
||||
Delete(RemotePath),
|
||||
@@ -137,6 +138,16 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
self.inner.list(prefix, mode, max_keys, cancel).await
|
||||
}
|
||||
|
||||
async fn head_object(
|
||||
&self,
|
||||
key: &RemotePath,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<crate::ListingObject, DownloadError> {
|
||||
self.attempt(RemoteOp::HeadObject(key.clone()))
|
||||
.map_err(DownloadError::Other)?;
|
||||
self.inner.head_object(key, cancel).await
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
|
||||
|
||||
@@ -9,5 +9,3 @@ serde.workspace = true
|
||||
serde_with.workspace = true
|
||||
const_format.workspace = true
|
||||
utils.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -9,5 +9,3 @@ license.workspace = true
|
||||
anyhow.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -14,5 +14,3 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
|
||||
tracing.workspace = true
|
||||
tracing-opentelemetry.workspace = true
|
||||
tracing-subscriber.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -14,7 +14,6 @@ testing = ["fail/failpoints"]
|
||||
arc-swap.workspace = true
|
||||
sentry.workspace = true
|
||||
async-compression.workspace = true
|
||||
async-trait.workspace = true
|
||||
anyhow.workspace = true
|
||||
bincode.workspace = true
|
||||
bytes.workspace = true
|
||||
@@ -26,7 +25,6 @@ hyper = { workspace = true, features = ["full"] }
|
||||
fail.workspace = true
|
||||
futures = { workspace = true}
|
||||
jsonwebtoken.workspace = true
|
||||
leaky-bucket.workspace = true
|
||||
nix.workspace = true
|
||||
once_cell.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
@@ -39,7 +37,7 @@ thiserror.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-tar.workspace = true
|
||||
tokio-util.workspace = true
|
||||
toml_edit.workspace = true
|
||||
toml_edit = { workspace = true, features = ["serde"] }
|
||||
tracing.workspace = true
|
||||
tracing-error.workspace = true
|
||||
tracing-subscriber = { workspace = true, features = ["json", "registry"] }
|
||||
@@ -54,7 +52,6 @@ walkdir.workspace = true
|
||||
pq_proto.workspace = true
|
||||
postgres_connection.workspace = true
|
||||
metrics.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
|
||||
const_format.workspace = true
|
||||
|
||||
@@ -71,6 +68,7 @@ criterion.workspace = true
|
||||
hex-literal.workspace = true
|
||||
camino-tempfile.workspace = true
|
||||
serde_assert.workspace = true
|
||||
tokio = { workspace = true, features = ["test-util"] }
|
||||
|
||||
[[bench]]
|
||||
name = "benchmarks"
|
||||
|
||||
280
libs/utils/src/leaky_bucket.rs
Normal file
280
libs/utils/src/leaky_bucket.rs
Normal file
@@ -0,0 +1,280 @@
|
||||
//! This module implements the Generic Cell Rate Algorithm for a simplified
|
||||
//! version of the Leaky Bucket rate limiting system.
|
||||
//!
|
||||
//! # Leaky Bucket
|
||||
//!
|
||||
//! If the bucket is full, no new requests are allowed and are throttled/errored.
|
||||
//! If the bucket is partially full/empty, new requests are added to the bucket in
|
||||
//! terms of "tokens".
|
||||
//!
|
||||
//! Over time, tokens are removed from the bucket, naturally allowing new requests at a steady rate.
|
||||
//!
|
||||
//! The bucket size tunes the burst support. The drain rate tunes the steady-rate requests per second.
|
||||
//!
|
||||
//! # [GCRA](https://en.wikipedia.org/wiki/Generic_cell_rate_algorithm)
|
||||
//!
|
||||
//! GCRA is a continuous rate leaky-bucket impl that stores minimal state and requires
|
||||
//! no background jobs to drain tokens, as the design utilises timestamps to drain automatically over time.
|
||||
//!
|
||||
//! We store an "empty_at" timestamp as the only state. As time progresses, we will naturally approach
|
||||
//! the empty state. The full-bucket state is calculated from `empty_at - config.bucket_width`.
|
||||
//!
|
||||
//! Another explaination can be found here: <https://brandur.org/rate-limiting>
|
||||
|
||||
use std::{sync::Mutex, time::Duration};
|
||||
|
||||
use tokio::{sync::Notify, time::Instant};
|
||||
|
||||
pub struct LeakyBucketConfig {
|
||||
/// This is the "time cost" of a single request unit.
|
||||
/// Should loosely represent how long it takes to handle a request unit in active resource time.
|
||||
/// Loosely speaking this is the inverse of the steady-rate requests-per-second
|
||||
pub cost: Duration,
|
||||
|
||||
/// total size of the bucket
|
||||
pub bucket_width: Duration,
|
||||
}
|
||||
|
||||
impl LeakyBucketConfig {
|
||||
pub fn new(rps: f64, bucket_size: f64) -> Self {
|
||||
let cost = Duration::from_secs_f64(rps.recip());
|
||||
let bucket_width = cost.mul_f64(bucket_size);
|
||||
Self { cost, bucket_width }
|
||||
}
|
||||
}
|
||||
|
||||
pub struct LeakyBucketState {
|
||||
/// Bucket is represented by `allow_at..empty_at` where `allow_at = empty_at - config.bucket_width`.
|
||||
///
|
||||
/// At any given time, `empty_at - now` represents the number of tokens in the bucket, multiplied by the "time_cost".
|
||||
/// Adding `n` tokens to the bucket is done by moving `empty_at` forward by `n * config.time_cost`.
|
||||
/// If `now < allow_at`, the bucket is considered filled and cannot accept any more tokens.
|
||||
/// Draining the bucket will happen naturally as `now` moves forward.
|
||||
///
|
||||
/// Let `n` be some "time cost" for the request,
|
||||
/// If now is after empty_at, the bucket is empty and the empty_at is reset to now,
|
||||
/// If now is within the `bucket window + n`, we are within time budget.
|
||||
/// If now is before the `bucket window + n`, we have run out of budget.
|
||||
///
|
||||
/// This is inspired by the generic cell rate algorithm (GCRA) and works
|
||||
/// exactly the same as a leaky-bucket.
|
||||
pub empty_at: Instant,
|
||||
}
|
||||
|
||||
impl LeakyBucketState {
|
||||
pub fn with_initial_tokens(config: &LeakyBucketConfig, initial_tokens: f64) -> Self {
|
||||
LeakyBucketState {
|
||||
empty_at: Instant::now() + config.cost.mul_f64(initial_tokens),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn bucket_is_empty(&self, now: Instant) -> bool {
|
||||
// if self.end is after now, the bucket is not empty
|
||||
self.empty_at <= now
|
||||
}
|
||||
|
||||
/// Immediately adds tokens to the bucket, if there is space.
|
||||
///
|
||||
/// In a scenario where you are waiting for available rate,
|
||||
/// rather than just erroring immediately, `started` corresponds to when this waiting started.
|
||||
///
|
||||
/// `n` is the number of tokens that will be filled in the bucket.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// If there is not enough space, no tokens are added. Instead, an error is returned with the time when
|
||||
/// there will be space again.
|
||||
pub fn add_tokens(
|
||||
&mut self,
|
||||
config: &LeakyBucketConfig,
|
||||
started: Instant,
|
||||
n: f64,
|
||||
) -> Result<(), Instant> {
|
||||
let now = Instant::now();
|
||||
|
||||
// invariant: started <= now
|
||||
debug_assert!(started <= now);
|
||||
|
||||
// If the bucket was empty when we started our search,
|
||||
// we should update the `empty_at` value accordingly.
|
||||
// this prevents us from having negative tokens in the bucket.
|
||||
let mut empty_at = self.empty_at;
|
||||
if empty_at < started {
|
||||
empty_at = started;
|
||||
}
|
||||
|
||||
let n = config.cost.mul_f64(n);
|
||||
let new_empty_at = empty_at + n;
|
||||
let allow_at = new_empty_at.checked_sub(config.bucket_width);
|
||||
|
||||
// empty_at
|
||||
// allow_at | new_empty_at
|
||||
// / | /
|
||||
// -------o-[---------o-|--]---------
|
||||
// now1 ^ now2 ^
|
||||
//
|
||||
// at now1, the bucket would be completely filled if we add n tokens.
|
||||
// at now2, the bucket would be partially filled if we add n tokens.
|
||||
|
||||
match allow_at {
|
||||
Some(allow_at) if now < allow_at => Err(allow_at),
|
||||
_ => {
|
||||
self.empty_at = new_empty_at;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct RateLimiter {
|
||||
pub config: LeakyBucketConfig,
|
||||
pub state: Mutex<LeakyBucketState>,
|
||||
/// a queue to provide this fair ordering.
|
||||
pub queue: Notify,
|
||||
}
|
||||
|
||||
struct Requeue<'a>(&'a Notify);
|
||||
|
||||
impl Drop for Requeue<'_> {
|
||||
fn drop(&mut self) {
|
||||
self.0.notify_one();
|
||||
}
|
||||
}
|
||||
|
||||
impl RateLimiter {
|
||||
pub fn with_initial_tokens(config: LeakyBucketConfig, initial_tokens: f64) -> Self {
|
||||
RateLimiter {
|
||||
state: Mutex::new(LeakyBucketState::with_initial_tokens(
|
||||
&config,
|
||||
initial_tokens,
|
||||
)),
|
||||
config,
|
||||
queue: {
|
||||
let queue = Notify::new();
|
||||
queue.notify_one();
|
||||
queue
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
pub fn steady_rps(&self) -> f64 {
|
||||
self.config.cost.as_secs_f64().recip()
|
||||
}
|
||||
|
||||
/// returns true if we did throttle
|
||||
pub async fn acquire(&self, count: usize) -> bool {
|
||||
let mut throttled = false;
|
||||
|
||||
let start = tokio::time::Instant::now();
|
||||
|
||||
// wait until we are the first in the queue
|
||||
let mut notified = std::pin::pin!(self.queue.notified());
|
||||
if !notified.as_mut().enable() {
|
||||
throttled = true;
|
||||
notified.await;
|
||||
}
|
||||
|
||||
// notify the next waiter in the queue when we are done.
|
||||
let _guard = Requeue(&self.queue);
|
||||
|
||||
loop {
|
||||
let res = self
|
||||
.state
|
||||
.lock()
|
||||
.unwrap()
|
||||
.add_tokens(&self.config, start, count as f64);
|
||||
match res {
|
||||
Ok(()) => return throttled,
|
||||
Err(ready_at) => {
|
||||
throttled = true;
|
||||
tokio::time::sleep_until(ready_at).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::time::Duration;
|
||||
|
||||
use tokio::time::Instant;
|
||||
|
||||
use super::{LeakyBucketConfig, LeakyBucketState};
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn check() {
|
||||
let config = LeakyBucketConfig {
|
||||
// average 100rps
|
||||
cost: Duration::from_millis(10),
|
||||
// burst up to 100 requests
|
||||
bucket_width: Duration::from_millis(1000),
|
||||
};
|
||||
|
||||
let mut state = LeakyBucketState {
|
||||
empty_at: Instant::now(),
|
||||
};
|
||||
|
||||
// supports burst
|
||||
{
|
||||
// should work for 100 requests this instant
|
||||
for _ in 0..100 {
|
||||
state.add_tokens(&config, Instant::now(), 1.0).unwrap();
|
||||
}
|
||||
let ready = state.add_tokens(&config, Instant::now(), 1.0).unwrap_err();
|
||||
assert_eq!(ready - Instant::now(), Duration::from_millis(10));
|
||||
}
|
||||
|
||||
// doesn't overfill
|
||||
{
|
||||
// after 1s we should have an empty bucket again.
|
||||
tokio::time::advance(Duration::from_secs(1)).await;
|
||||
assert!(state.bucket_is_empty(Instant::now()));
|
||||
|
||||
// after 1s more, we should not over count the tokens and allow more than 200 requests.
|
||||
tokio::time::advance(Duration::from_secs(1)).await;
|
||||
for _ in 0..100 {
|
||||
state.add_tokens(&config, Instant::now(), 1.0).unwrap();
|
||||
}
|
||||
let ready = state.add_tokens(&config, Instant::now(), 1.0).unwrap_err();
|
||||
assert_eq!(ready - Instant::now(), Duration::from_millis(10));
|
||||
}
|
||||
|
||||
// supports sustained rate over a long period
|
||||
{
|
||||
tokio::time::advance(Duration::from_secs(1)).await;
|
||||
|
||||
// should sustain 100rps
|
||||
for _ in 0..2000 {
|
||||
tokio::time::advance(Duration::from_millis(10)).await;
|
||||
state.add_tokens(&config, Instant::now(), 1.0).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
// supports requesting more tokens than can be stored in the bucket
|
||||
// we just wait a little bit longer upfront.
|
||||
{
|
||||
// start the bucket completely empty
|
||||
tokio::time::advance(Duration::from_secs(5)).await;
|
||||
assert!(state.bucket_is_empty(Instant::now()));
|
||||
|
||||
// requesting 200 tokens of space should take 200*cost = 2s
|
||||
// but we already have 1s available, so we wait 1s from start.
|
||||
let start = Instant::now();
|
||||
|
||||
let ready = state.add_tokens(&config, start, 200.0).unwrap_err();
|
||||
assert_eq!(ready - Instant::now(), Duration::from_secs(1));
|
||||
|
||||
tokio::time::advance(Duration::from_millis(500)).await;
|
||||
let ready = state.add_tokens(&config, start, 200.0).unwrap_err();
|
||||
assert_eq!(ready - Instant::now(), Duration::from_millis(500));
|
||||
|
||||
tokio::time::advance(Duration::from_millis(500)).await;
|
||||
state.add_tokens(&config, start, 200.0).unwrap();
|
||||
|
||||
// bucket should be completely full now
|
||||
let ready = state.add_tokens(&config, Instant::now(), 1.0).unwrap_err();
|
||||
assert_eq!(ready - Instant::now(), Duration::from_millis(10));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -71,6 +71,7 @@ pub mod postgres_client;
|
||||
|
||||
pub mod tracing_span_assert;
|
||||
|
||||
pub mod leaky_bucket;
|
||||
pub mod rate_limit;
|
||||
|
||||
/// Simple once-barrier and a guard which keeps barrier awaiting.
|
||||
|
||||
@@ -5,6 +5,15 @@ use std::time::{Duration, Instant};
|
||||
pub struct RateLimit {
|
||||
last: Option<Instant>,
|
||||
interval: Duration,
|
||||
dropped: u64,
|
||||
}
|
||||
|
||||
pub struct RateLimitStats(u64);
|
||||
|
||||
impl std::fmt::Display for RateLimitStats {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
write!(f, "{} dropped calls", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl RateLimit {
|
||||
@@ -12,20 +21,27 @@ impl RateLimit {
|
||||
Self {
|
||||
last: None,
|
||||
interval,
|
||||
dropped: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Call `f` if the rate limit allows.
|
||||
/// Don't call it otherwise.
|
||||
pub fn call<F: FnOnce()>(&mut self, f: F) {
|
||||
self.call2(|_| f())
|
||||
}
|
||||
|
||||
pub fn call2<F: FnOnce(RateLimitStats)>(&mut self, f: F) {
|
||||
let now = Instant::now();
|
||||
match self.last {
|
||||
Some(last) if now - last <= self.interval => {
|
||||
// ratelimit
|
||||
self.dropped += 1;
|
||||
}
|
||||
_ => {
|
||||
self.last = Some(now);
|
||||
f();
|
||||
f(RateLimitStats(self.dropped));
|
||||
self.dropped = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,8 +9,6 @@ anyhow.workspace = true
|
||||
utils.workspace = true
|
||||
postgres_ffi.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[build-dependencies]
|
||||
anyhow.workspace = true
|
||||
bindgen.workspace = true
|
||||
|
||||
@@ -95,6 +95,7 @@ fn main() -> anyhow::Result<()> {
|
||||
.allowlist_var("ERROR")
|
||||
.allowlist_var("FATAL")
|
||||
.allowlist_var("PANIC")
|
||||
.allowlist_var("PG_VERSION_NUM")
|
||||
.allowlist_var("WPEVENT")
|
||||
.allowlist_var("WL_LATCH_SET")
|
||||
.allowlist_var("WL_SOCKET_READABLE")
|
||||
|
||||
@@ -282,7 +282,11 @@ mod tests {
|
||||
use std::cell::UnsafeCell;
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper};
|
||||
use crate::{
|
||||
api_bindings::Level,
|
||||
bindings::{NeonWALReadResult, PG_VERSION_NUM},
|
||||
walproposer::Wrapper,
|
||||
};
|
||||
|
||||
use super::ApiImpl;
|
||||
|
||||
@@ -489,41 +493,79 @@ mod tests {
|
||||
|
||||
let (sender, receiver) = sync_channel(1);
|
||||
|
||||
// Messages definitions are at walproposer.h
|
||||
// xxx: it would be better to extract them from safekeeper crate and
|
||||
// use serialization/deserialization here.
|
||||
let greeting_tag = (b'g' as u64).to_ne_bytes();
|
||||
let proto_version = 2_u32.to_ne_bytes();
|
||||
let pg_version: [u8; 4] = PG_VERSION_NUM.to_ne_bytes();
|
||||
let proposer_id = [0; 16];
|
||||
let system_id = 0_u64.to_ne_bytes();
|
||||
let tenant_id = ttid.tenant_id.as_arr();
|
||||
let timeline_id = ttid.timeline_id.as_arr();
|
||||
let pg_tli = 1_u32.to_ne_bytes();
|
||||
let wal_seg_size = 16777216_u32.to_ne_bytes();
|
||||
let proposer_greeting = [
|
||||
greeting_tag.as_slice(),
|
||||
proto_version.as_slice(),
|
||||
pg_version.as_slice(),
|
||||
proposer_id.as_slice(),
|
||||
system_id.as_slice(),
|
||||
tenant_id.as_slice(),
|
||||
timeline_id.as_slice(),
|
||||
pg_tli.as_slice(),
|
||||
wal_seg_size.as_slice(),
|
||||
]
|
||||
.concat();
|
||||
|
||||
let voting_tag = (b'v' as u64).to_ne_bytes();
|
||||
let vote_request_term = 3_u64.to_ne_bytes();
|
||||
let proposer_id = [0; 16];
|
||||
let vote_request = [
|
||||
voting_tag.as_slice(),
|
||||
vote_request_term.as_slice(),
|
||||
proposer_id.as_slice(),
|
||||
]
|
||||
.concat();
|
||||
|
||||
let acceptor_greeting_term = 2_u64.to_ne_bytes();
|
||||
let acceptor_greeting_node_id = 1_u64.to_ne_bytes();
|
||||
let acceptor_greeting = [
|
||||
greeting_tag.as_slice(),
|
||||
acceptor_greeting_term.as_slice(),
|
||||
acceptor_greeting_node_id.as_slice(),
|
||||
]
|
||||
.concat();
|
||||
|
||||
let vote_response_term = 3_u64.to_ne_bytes();
|
||||
let vote_given = 1_u64.to_ne_bytes();
|
||||
let flush_lsn = 0x539_u64.to_ne_bytes();
|
||||
let truncate_lsn = 0x539_u64.to_ne_bytes();
|
||||
let th_len = 1_u32.to_ne_bytes();
|
||||
let th_term = 2_u64.to_ne_bytes();
|
||||
let th_lsn = 0x539_u64.to_ne_bytes();
|
||||
let timeline_start_lsn = 0x539_u64.to_ne_bytes();
|
||||
let vote_response = [
|
||||
voting_tag.as_slice(),
|
||||
vote_response_term.as_slice(),
|
||||
vote_given.as_slice(),
|
||||
flush_lsn.as_slice(),
|
||||
truncate_lsn.as_slice(),
|
||||
th_len.as_slice(),
|
||||
th_term.as_slice(),
|
||||
th_lsn.as_slice(),
|
||||
timeline_start_lsn.as_slice(),
|
||||
]
|
||||
.concat();
|
||||
|
||||
let my_impl: Box<dyn ApiImpl> = Box::new(MockImpl {
|
||||
wait_events: Cell::new(WaitEventsData {
|
||||
sk: std::ptr::null_mut(),
|
||||
event_mask: 0,
|
||||
}),
|
||||
expected_messages: vec![
|
||||
// TODO: When updating Postgres versions, this test will cause
|
||||
// problems. Postgres version in message needs updating.
|
||||
//
|
||||
// Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160003, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
|
||||
vec![
|
||||
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
|
||||
147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
|
||||
188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
|
||||
],
|
||||
// VoteRequest(VoteRequest { term: 3 })
|
||||
vec![
|
||||
118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0,
|
||||
],
|
||||
],
|
||||
expected_messages: vec![proposer_greeting, vote_request],
|
||||
expected_ptr: AtomicUsize::new(0),
|
||||
safekeeper_replies: vec![
|
||||
// Greeting(AcceptorGreeting { term: 2, node_id: NodeId(1) })
|
||||
vec![
|
||||
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
|
||||
],
|
||||
// VoteResponse(VoteResponse { term: 3, vote_given: 1, flush_lsn: 0/539, truncate_lsn: 0/539, term_history: [(2, 0/539)], timeline_start_lsn: 0/539 })
|
||||
vec![
|
||||
118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 57,
|
||||
5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0,
|
||||
0, 57, 5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0,
|
||||
],
|
||||
],
|
||||
safekeeper_replies: vec![acceptor_greeting, vote_response],
|
||||
replies_ptr: AtomicUsize::new(0),
|
||||
sync_channel: sender,
|
||||
shmem: UnsafeCell::new(crate::api_bindings::empty_shmem()),
|
||||
|
||||
@@ -16,6 +16,7 @@ arc-swap.workspace = true
|
||||
async-compression.workspace = true
|
||||
async-stream.workspace = true
|
||||
async-trait.workspace = true
|
||||
bit_field.workspace = true
|
||||
byteorder.workspace = true
|
||||
bytes.workspace = true
|
||||
camino.workspace = true
|
||||
@@ -36,7 +37,6 @@ humantime.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
hyper.workspace = true
|
||||
itertools.workspace = true
|
||||
leaky-bucket.workspace = true
|
||||
md5.workspace = true
|
||||
nix.workspace = true
|
||||
# hack to get the number of worker threads tokio uses
|
||||
@@ -52,6 +52,7 @@ rand.workspace = true
|
||||
range-set-blaze = { version = "0.1.16", features = ["alloc"] }
|
||||
regex.workspace = true
|
||||
scopeguard.workspace = true
|
||||
send-future.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json = { workspace = true, features = ["raw_value"] }
|
||||
serde_path_to_error.workspace = true
|
||||
|
||||
@@ -4,12 +4,13 @@ use bytes::Bytes;
|
||||
use camino::Utf8PathBuf;
|
||||
use criterion::{criterion_group, criterion_main, Criterion};
|
||||
use pageserver::{
|
||||
config::PageServerConf,
|
||||
config::{defaults::DEFAULT_IO_BUFFER_ALIGNMENT, PageServerConf},
|
||||
context::{DownloadBehavior, RequestContext},
|
||||
l0_flush::{L0FlushConfig, L0FlushGlobalState},
|
||||
page_cache,
|
||||
repository::Value,
|
||||
task_mgr::TaskKind,
|
||||
tenant::storage_layer::inmemory_layer::SerializedBatch,
|
||||
tenant::storage_layer::InMemoryLayer,
|
||||
virtual_file,
|
||||
};
|
||||
@@ -67,12 +68,16 @@ async fn ingest(
|
||||
let layer =
|
||||
InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?;
|
||||
|
||||
let data = Value::Image(Bytes::from(vec![0u8; put_size])).ser()?;
|
||||
let data = Value::Image(Bytes::from(vec![0u8; put_size]));
|
||||
let data_ser_size = data.serialized_size().unwrap() as usize;
|
||||
let ctx = RequestContext::new(
|
||||
pageserver::task_mgr::TaskKind::WalReceiverConnectionHandler,
|
||||
pageserver::context::DownloadBehavior::Download,
|
||||
);
|
||||
|
||||
const BATCH_SIZE: usize = 16;
|
||||
let mut batch = Vec::new();
|
||||
|
||||
for i in 0..put_count {
|
||||
lsn += put_size as u64;
|
||||
|
||||
@@ -95,7 +100,17 @@ async fn ingest(
|
||||
}
|
||||
}
|
||||
|
||||
layer.put_value(key.to_compact(), lsn, &data, &ctx).await?;
|
||||
batch.push((key.to_compact(), lsn, data_ser_size, data.clone()));
|
||||
if batch.len() >= BATCH_SIZE {
|
||||
let this_batch = std::mem::take(&mut batch);
|
||||
let serialized = SerializedBatch::from_values(this_batch).unwrap();
|
||||
layer.put_batch(serialized, &ctx).await?;
|
||||
}
|
||||
}
|
||||
if !batch.is_empty() {
|
||||
let this_batch = std::mem::take(&mut batch);
|
||||
let serialized = SerializedBatch::from_values(this_batch).unwrap();
|
||||
layer.put_batch(serialized, &ctx).await?;
|
||||
}
|
||||
layer.freeze(lsn + 1).await;
|
||||
|
||||
@@ -149,7 +164,11 @@ fn criterion_benchmark(c: &mut Criterion) {
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(
|
||||
pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()),
|
||||
));
|
||||
virtual_file::init(16384, virtual_file::io_engine_for_bench());
|
||||
virtual_file::init(
|
||||
16384,
|
||||
virtual_file::io_engine_for_bench(),
|
||||
DEFAULT_IO_BUFFER_ALIGNMENT,
|
||||
);
|
||||
page_cache::init(conf.page_cache_size);
|
||||
|
||||
{
|
||||
|
||||
@@ -7,7 +7,6 @@ license.workspace = true
|
||||
[dependencies]
|
||||
pageserver_api.workspace = true
|
||||
thiserror.workspace = true
|
||||
async-trait.workspace = true
|
||||
reqwest = { workspace = true, features = [ "stream" ] }
|
||||
utils.workspace = true
|
||||
serde.workspace = true
|
||||
|
||||
@@ -419,6 +419,24 @@ impl Client {
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn timeline_archival_config(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
req: &TimelineArchivalConfigRequest,
|
||||
) -> Result<()> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/archival_config",
|
||||
self.mgmt_api_endpoint
|
||||
);
|
||||
|
||||
self.request(Method::POST, &uri, req)
|
||||
.await?
|
||||
.json()
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn timeline_detach_ancestor(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
@@ -506,6 +524,16 @@ impl Client {
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
/// Configs io buffer alignment at runtime.
|
||||
pub async fn put_io_alignment(&self, align: usize) -> Result<()> {
|
||||
let uri = format!("{}/v1/io_alignment", self.mgmt_api_endpoint);
|
||||
self.request(Method::PUT, uri, align)
|
||||
.await?
|
||||
.json()
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn get_utilization(&self) -> Result<PageserverUtilization> {
|
||||
let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint);
|
||||
self.get(uri)
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
|
||||
use anyhow::Result;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use pageserver::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
|
||||
use pageserver::context::{DownloadBehavior, RequestContext};
|
||||
use pageserver::task_mgr::TaskKind;
|
||||
use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
|
||||
@@ -144,7 +145,11 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
|
||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||
|
||||
// Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
|
||||
pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
|
||||
pageserver::virtual_file::init(
|
||||
10,
|
||||
virtual_file::api::IoEngineKind::StdFs,
|
||||
DEFAULT_IO_BUFFER_ALIGNMENT,
|
||||
);
|
||||
pageserver::page_cache::init(100);
|
||||
|
||||
let mut total_delta_layers = 0usize;
|
||||
|
||||
@@ -3,6 +3,7 @@ use std::path::{Path, PathBuf};
|
||||
use anyhow::Result;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use clap::Subcommand;
|
||||
use pageserver::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
|
||||
use pageserver::context::{DownloadBehavior, RequestContext};
|
||||
use pageserver::task_mgr::TaskKind;
|
||||
use pageserver::tenant::block_io::BlockCursor;
|
||||
@@ -59,7 +60,7 @@ pub(crate) enum LayerCmd {
|
||||
|
||||
async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
|
||||
let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
|
||||
virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
|
||||
virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs, 1);
|
||||
page_cache::init(100);
|
||||
let file = VirtualFile::open(path, ctx).await?;
|
||||
let file_id = page_cache::next_file_id();
|
||||
@@ -89,6 +90,7 @@ async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result
|
||||
for (k, v) in all {
|
||||
let value = cursor.read_blob(v.pos(), ctx).await?;
|
||||
println!("key:{} value_len:{}", k, value.len());
|
||||
assert!(k.is_i128_representable(), "invalid key: ");
|
||||
}
|
||||
// TODO(chi): special handling for last key?
|
||||
Ok(())
|
||||
@@ -189,7 +191,11 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
new_tenant_id,
|
||||
new_timeline_id,
|
||||
} => {
|
||||
pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
|
||||
pageserver::virtual_file::init(
|
||||
10,
|
||||
virtual_file::api::IoEngineKind::StdFs,
|
||||
DEFAULT_IO_BUFFER_ALIGNMENT,
|
||||
);
|
||||
pageserver::page_cache::init(100);
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||
|
||||
@@ -20,6 +20,7 @@ use clap::{Parser, Subcommand};
|
||||
use index_part::IndexPartCmd;
|
||||
use layers::LayerCmd;
|
||||
use pageserver::{
|
||||
config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
|
||||
context::{DownloadBehavior, RequestContext},
|
||||
page_cache,
|
||||
task_mgr::TaskKind,
|
||||
@@ -205,7 +206,11 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> {
|
||||
|
||||
async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
|
||||
// Basic initialization of things that don't change after startup
|
||||
virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
|
||||
virtual_file::init(
|
||||
10,
|
||||
virtual_file::api::IoEngineKind::StdFs,
|
||||
DEFAULT_IO_BUFFER_ALIGNMENT,
|
||||
);
|
||||
page_cache::init(100);
|
||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||
dump_layerfile_from_path(path, true, &ctx).await
|
||||
|
||||
@@ -58,6 +58,11 @@ pub(crate) struct Args {
|
||||
/// [`pageserver_api::models::virtual_file::IoEngineKind`].
|
||||
#[clap(long)]
|
||||
set_io_engine: Option<pageserver_api::models::virtual_file::IoEngineKind>,
|
||||
|
||||
/// Before starting the benchmark, live-reconfigure the pageserver to use specified alignment for io buffers.
|
||||
#[clap(long)]
|
||||
set_io_alignment: Option<usize>,
|
||||
|
||||
targets: Option<Vec<TenantTimelineId>>,
|
||||
}
|
||||
|
||||
@@ -124,6 +129,10 @@ async fn main_impl(
|
||||
mgmt_api_client.put_io_engine(engine_str).await?;
|
||||
}
|
||||
|
||||
if let Some(align) = args.set_io_alignment {
|
||||
mgmt_api_client.put_io_alignment(align).await?;
|
||||
}
|
||||
|
||||
// discover targets
|
||||
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
|
||||
&mgmt_api_client,
|
||||
|
||||
39
pageserver/src/assert_u64_eq_usize.rs
Normal file
39
pageserver/src/assert_u64_eq_usize.rs
Normal file
@@ -0,0 +1,39 @@
|
||||
//! `u64`` and `usize`` aren't guaranteed to be identical in Rust, but life is much simpler if that's the case.
|
||||
|
||||
pub(crate) const _ASSERT_U64_EQ_USIZE: () = {
|
||||
if std::mem::size_of::<usize>() != std::mem::size_of::<u64>() {
|
||||
panic!("the traits defined in this module assume that usize and u64 can be converted to each other without loss of information");
|
||||
}
|
||||
};
|
||||
|
||||
pub(crate) trait U64IsUsize {
|
||||
fn into_usize(self) -> usize;
|
||||
}
|
||||
|
||||
impl U64IsUsize for u64 {
|
||||
#[inline(always)]
|
||||
fn into_usize(self) -> usize {
|
||||
#[allow(clippy::let_unit_value)]
|
||||
let _ = _ASSERT_U64_EQ_USIZE;
|
||||
self as usize
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) trait UsizeIsU64 {
|
||||
fn into_u64(self) -> u64;
|
||||
}
|
||||
|
||||
impl UsizeIsU64 for usize {
|
||||
#[inline(always)]
|
||||
fn into_u64(self) -> u64 {
|
||||
#[allow(clippy::let_unit_value)]
|
||||
let _ = _ASSERT_U64_EQ_USIZE;
|
||||
self as u64
|
||||
}
|
||||
}
|
||||
|
||||
pub const fn u64_to_usize(x: u64) -> usize {
|
||||
#[allow(clippy::let_unit_value)]
|
||||
let _ = _ASSERT_U64_EQ_USIZE;
|
||||
x as usize
|
||||
}
|
||||
61
pageserver/src/bin/import.rs
Normal file
61
pageserver/src/bin/import.rs
Normal file
@@ -0,0 +1,61 @@
|
||||
use anyhow;
|
||||
use camino::Utf8PathBuf;
|
||||
use clap::Parser;
|
||||
use pageserver::{pg_import, virtual_file::{self, api::IoEngineKind}};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::logging::{self, LogFormat, TracingErrorLayerEnablement};
|
||||
|
||||
use std::str::FromStr;
|
||||
|
||||
//project_git_version!(GIT_VERSION);
|
||||
|
||||
#[derive(Parser)]
|
||||
#[command(
|
||||
//version = GIT_VERSION,
|
||||
about = "Utility to import a Postgres data directory directly into image layers",
|
||||
//long_about = "..."
|
||||
)]
|
||||
struct CliOpts {
|
||||
/// Input Postgres data directory
|
||||
pgdata: Utf8PathBuf,
|
||||
|
||||
/// Path to local dir where the layer files will be stored
|
||||
dest_path: Utf8PathBuf,
|
||||
|
||||
#[arg(long, default_value_t = TenantId::from_str("42424242424242424242424242424242").unwrap())]
|
||||
tenant_id: TenantId,
|
||||
#[arg(long, default_value_t = TimelineId::from_str("42424242424242424242424242424242").unwrap())]
|
||||
timeline_id: TimelineId,
|
||||
}
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
logging::init(
|
||||
LogFormat::Plain,
|
||||
TracingErrorLayerEnablement::EnableWithRustLogFilter,
|
||||
logging::Output::Stdout,
|
||||
)?;
|
||||
|
||||
virtual_file::init(
|
||||
100,
|
||||
IoEngineKind::StdFs,
|
||||
512,
|
||||
);
|
||||
|
||||
let rt = tokio::runtime::Builder::new_multi_thread()
|
||||
.enable_all()
|
||||
.build()?;
|
||||
|
||||
let cli = CliOpts::parse();
|
||||
|
||||
rt.block_on(async_main(cli))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn async_main(cli: CliOpts) -> anyhow::Result<()> {
|
||||
let mut import = pg_import::PgImportEnv::init(&cli.dest_path, cli.tenant_id, cli.timeline_id).await?;
|
||||
|
||||
import.import_datadir(&cli.pgdata).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -125,18 +125,69 @@ fn main() -> anyhow::Result<()> {
|
||||
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
|
||||
info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings");
|
||||
info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access");
|
||||
info!(?conf.io_buffer_alignment, "starting with setting for IO buffer alignment");
|
||||
|
||||
// The tenants directory contains all the pageserver local disk state.
|
||||
// Create if not exists and make sure all the contents are durable before proceeding.
|
||||
// Ensuring durability eliminates a whole bug class where we come up after an unclean shutdown.
|
||||
// After unclea shutdown, we don't know if all the filesystem content we can read via syscalls is actually durable or not.
|
||||
// Examples for that: OOM kill, systemd killing us during shutdown, self abort due to unrecoverable IO error.
|
||||
let tenants_path = conf.tenants_path();
|
||||
if !tenants_path.exists() {
|
||||
utils::crashsafe::create_dir_all(conf.tenants_path())
|
||||
.with_context(|| format!("Failed to create tenants root dir at '{tenants_path}'"))?;
|
||||
{
|
||||
let open = || {
|
||||
nix::dir::Dir::open(
|
||||
tenants_path.as_std_path(),
|
||||
nix::fcntl::OFlag::O_DIRECTORY | nix::fcntl::OFlag::O_RDONLY,
|
||||
nix::sys::stat::Mode::empty(),
|
||||
)
|
||||
};
|
||||
let dirfd = match open() {
|
||||
Ok(dirfd) => dirfd,
|
||||
Err(e) => match e {
|
||||
nix::errno::Errno::ENOENT => {
|
||||
utils::crashsafe::create_dir_all(&tenants_path).with_context(|| {
|
||||
format!("Failed to create tenants root dir at '{tenants_path}'")
|
||||
})?;
|
||||
open().context("open tenants dir after creating it")?
|
||||
}
|
||||
e => anyhow::bail!(e),
|
||||
},
|
||||
};
|
||||
|
||||
let started = Instant::now();
|
||||
// Linux guarantees durability for syncfs.
|
||||
// POSIX doesn't have syncfs, and further does not actually guarantee durability of sync().
|
||||
#[cfg(target_os = "linux")]
|
||||
{
|
||||
use std::os::fd::AsRawFd;
|
||||
nix::unistd::syncfs(dirfd.as_raw_fd()).context("syncfs")?;
|
||||
}
|
||||
#[cfg(target_os = "macos")]
|
||||
{
|
||||
// macOS is not a production platform for Neon, don't even bother.
|
||||
drop(dirfd);
|
||||
}
|
||||
#[cfg(not(any(target_os = "linux", target_os = "macos")))]
|
||||
{
|
||||
compile_error!("Unsupported OS");
|
||||
}
|
||||
|
||||
let elapsed = started.elapsed();
|
||||
info!(
|
||||
elapsed_ms = elapsed.as_millis(),
|
||||
"made tenant directory contents durable"
|
||||
);
|
||||
}
|
||||
|
||||
// Initialize up failpoints support
|
||||
let scenario = failpoint_support::init();
|
||||
|
||||
// Basic initialization of things that don't change after startup
|
||||
virtual_file::init(conf.max_file_descriptors, conf.virtual_file_io_engine);
|
||||
virtual_file::init(
|
||||
conf.max_file_descriptors,
|
||||
conf.virtual_file_io_engine,
|
||||
conf.io_buffer_alignment,
|
||||
);
|
||||
page_cache::init(conf.page_cache_size);
|
||||
|
||||
start_pageserver(launch_ts, conf).context("Failed to start pageserver")?;
|
||||
|
||||
@@ -31,6 +31,7 @@ use utils::{
|
||||
|
||||
use crate::l0_flush::L0FlushConfig;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::storage_layer::inmemory_layer::IndexEntry;
|
||||
use crate::tenant::timeline::compaction::CompactL0Phase1ValueAccess;
|
||||
use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
|
||||
use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
|
||||
@@ -50,7 +51,6 @@ pub mod defaults {
|
||||
DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR,
|
||||
DEFAULT_PG_LISTEN_PORT,
|
||||
};
|
||||
use pageserver_api::models::ImageCompressionAlgorithm;
|
||||
pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT;
|
||||
|
||||
pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s";
|
||||
@@ -90,13 +90,14 @@ pub mod defaults {
|
||||
|
||||
pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB
|
||||
|
||||
pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
|
||||
ImageCompressionAlgorithm::Disabled;
|
||||
pub const DEFAULT_IMAGE_COMPRESSION: &str = "zstd(1)";
|
||||
|
||||
pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false;
|
||||
|
||||
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
|
||||
|
||||
pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512;
|
||||
|
||||
///
|
||||
/// Default built-in configuration file.
|
||||
///
|
||||
@@ -291,6 +292,8 @@ pub struct PageServerConf {
|
||||
|
||||
/// Direct IO settings
|
||||
pub virtual_file_direct_io: virtual_file::DirectIoMode,
|
||||
|
||||
pub io_buffer_alignment: usize,
|
||||
}
|
||||
|
||||
/// We do not want to store this in a PageServerConf because the latter may be logged
|
||||
@@ -395,6 +398,8 @@ struct PageServerConfigBuilder {
|
||||
compact_level0_phase1_value_access: BuilderValue<CompactL0Phase1ValueAccess>,
|
||||
|
||||
virtual_file_direct_io: BuilderValue<virtual_file::DirectIoMode>,
|
||||
|
||||
io_buffer_alignment: BuilderValue<usize>,
|
||||
}
|
||||
|
||||
impl PageServerConfigBuilder {
|
||||
@@ -478,11 +483,12 @@ impl PageServerConfigBuilder {
|
||||
max_vectored_read_bytes: Set(MaxVectoredReadBytes(
|
||||
NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
|
||||
)),
|
||||
image_compression: Set(DEFAULT_IMAGE_COMPRESSION),
|
||||
image_compression: Set(DEFAULT_IMAGE_COMPRESSION.parse().unwrap()),
|
||||
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
|
||||
l0_flush: Set(L0FlushConfig::default()),
|
||||
compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()),
|
||||
virtual_file_direct_io: Set(virtual_file::DirectIoMode::default()),
|
||||
io_buffer_alignment: Set(DEFAULT_IO_BUFFER_ALIGNMENT),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -662,6 +668,10 @@ impl PageServerConfigBuilder {
|
||||
self.virtual_file_direct_io = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn io_buffer_alignment(&mut self, value: usize) {
|
||||
self.io_buffer_alignment = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn build(self, id: NodeId) -> anyhow::Result<PageServerConf> {
|
||||
let default = Self::default_values();
|
||||
|
||||
@@ -718,6 +728,7 @@ impl PageServerConfigBuilder {
|
||||
l0_flush,
|
||||
compact_level0_phase1_value_access,
|
||||
virtual_file_direct_io,
|
||||
io_buffer_alignment,
|
||||
}
|
||||
CUSTOM LOGIC
|
||||
{
|
||||
@@ -987,6 +998,9 @@ impl PageServerConf {
|
||||
"virtual_file_direct_io" => {
|
||||
builder.virtual_file_direct_io(utils::toml_edit_ext::deserialize_item(item).context("virtual_file_direct_io")?)
|
||||
}
|
||||
"io_buffer_alignment" => {
|
||||
builder.io_buffer_alignment(parse_toml_u64("io_buffer_alignment", item)? as usize)
|
||||
}
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
}
|
||||
}
|
||||
@@ -1007,6 +1021,15 @@ impl PageServerConf {
|
||||
|
||||
conf.default_tenant_conf = t_conf.merge(TenantConf::default());
|
||||
|
||||
IndexEntry::validate_checkpoint_distance(conf.default_tenant_conf.checkpoint_distance)
|
||||
.map_err(|msg| anyhow::anyhow!("{msg}"))
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"effective checkpoint distance is unsupported: {}",
|
||||
conf.default_tenant_conf.checkpoint_distance
|
||||
)
|
||||
})?;
|
||||
|
||||
Ok(conf)
|
||||
}
|
||||
|
||||
@@ -1065,11 +1088,12 @@ impl PageServerConf {
|
||||
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
|
||||
.expect("Invalid default constant"),
|
||||
),
|
||||
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
|
||||
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
l0_flush: L0FlushConfig::default(),
|
||||
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
|
||||
virtual_file_direct_io: virtual_file::DirectIoMode::default(),
|
||||
io_buffer_alignment: defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1305,11 +1329,12 @@ background_task_maximum_delay = '334 s'
|
||||
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
|
||||
.expect("Invalid default constant")
|
||||
),
|
||||
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
|
||||
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
l0_flush: L0FlushConfig::default(),
|
||||
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
|
||||
virtual_file_direct_io: virtual_file::DirectIoMode::default(),
|
||||
io_buffer_alignment: defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
|
||||
},
|
||||
"Correct defaults should be used when no config values are provided"
|
||||
);
|
||||
@@ -1378,11 +1403,12 @@ background_task_maximum_delay = '334 s'
|
||||
NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
|
||||
.expect("Invalid default constant")
|
||||
),
|
||||
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
|
||||
image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(),
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
l0_flush: L0FlushConfig::default(),
|
||||
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
|
||||
virtual_file_direct_io: virtual_file::DirectIoMode::default(),
|
||||
io_buffer_alignment: defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
|
||||
},
|
||||
"Should be able to parse all basic config values correctly"
|
||||
);
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
//! Periodically collect consumption metrics for all active tenants
|
||||
//! and push them to a HTTP endpoint.
|
||||
use crate::config::PageServerConf;
|
||||
use crate::consumption_metrics::metrics::MetricsKey;
|
||||
use crate::consumption_metrics::upload::KeyGen as _;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
|
||||
use crate::tenant::size::CalculateSyntheticSizeError;
|
||||
@@ -8,6 +10,7 @@ use crate::tenant::tasks::BackgroundLoopKind;
|
||||
use crate::tenant::{mgr::TenantManager, LogicalSizeCalculationCause, Tenant};
|
||||
use camino::Utf8PathBuf;
|
||||
use consumption_metrics::EventType;
|
||||
use itertools::Itertools as _;
|
||||
use pageserver_api::models::TenantState;
|
||||
use remote_storage::{GenericRemoteStorage, RemoteStorageConfig};
|
||||
use reqwest::Url;
|
||||
@@ -19,9 +22,8 @@ use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::id::NodeId;
|
||||
|
||||
mod metrics;
|
||||
use crate::consumption_metrics::metrics::MetricsKey;
|
||||
mod disk_cache;
|
||||
mod metrics;
|
||||
mod upload;
|
||||
|
||||
const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
|
||||
@@ -143,6 +145,12 @@ async fn collect_metrics(
|
||||
// these are point in time, with variable "now"
|
||||
let metrics = metrics::collect_all_metrics(&tenant_manager, &cached_metrics, &ctx).await;
|
||||
|
||||
// Pre-generate event idempotency keys, to reuse them across the bucket
|
||||
// and HTTP sinks.
|
||||
let idempotency_keys = std::iter::repeat_with(|| node_id.as_str().generate())
|
||||
.take(metrics.len())
|
||||
.collect_vec();
|
||||
|
||||
let metrics = Arc::new(metrics);
|
||||
|
||||
// why not race cancellation here? because we are one of the last tasks, and if we are
|
||||
@@ -161,8 +169,14 @@ async fn collect_metrics(
|
||||
}
|
||||
|
||||
if let Some(bucket_client) = &bucket_client {
|
||||
let res =
|
||||
upload::upload_metrics_bucket(bucket_client, &cancel, &node_id, &metrics).await;
|
||||
let res = upload::upload_metrics_bucket(
|
||||
bucket_client,
|
||||
&cancel,
|
||||
&node_id,
|
||||
&metrics,
|
||||
&idempotency_keys,
|
||||
)
|
||||
.await;
|
||||
if let Err(e) = res {
|
||||
tracing::error!("failed to upload to S3: {e:#}");
|
||||
}
|
||||
@@ -174,9 +188,9 @@ async fn collect_metrics(
|
||||
&client,
|
||||
metric_collection_endpoint,
|
||||
&cancel,
|
||||
&node_id,
|
||||
&metrics,
|
||||
&mut cached_metrics,
|
||||
&idempotency_keys,
|
||||
)
|
||||
.await;
|
||||
if let Err(e) = res {
|
||||
|
||||
@@ -24,16 +24,16 @@ pub(super) async fn upload_metrics_http(
|
||||
client: &reqwest::Client,
|
||||
metric_collection_endpoint: &reqwest::Url,
|
||||
cancel: &CancellationToken,
|
||||
node_id: &str,
|
||||
metrics: &[RawMetric],
|
||||
cached_metrics: &mut Cache,
|
||||
idempotency_keys: &[IdempotencyKey<'_>],
|
||||
) -> anyhow::Result<()> {
|
||||
let mut uploaded = 0;
|
||||
let mut failed = 0;
|
||||
|
||||
let started_at = std::time::Instant::now();
|
||||
|
||||
let mut iter = serialize_in_chunks(CHUNK_SIZE, metrics, node_id);
|
||||
let mut iter = serialize_in_chunks(CHUNK_SIZE, metrics, idempotency_keys);
|
||||
|
||||
while let Some(res) = iter.next() {
|
||||
let (chunk, body) = res?;
|
||||
@@ -87,6 +87,7 @@ pub(super) async fn upload_metrics_bucket(
|
||||
cancel: &CancellationToken,
|
||||
node_id: &str,
|
||||
metrics: &[RawMetric],
|
||||
idempotency_keys: &[IdempotencyKey<'_>],
|
||||
) -> anyhow::Result<()> {
|
||||
if metrics.is_empty() {
|
||||
// Skip uploads if we have no metrics, so that readers don't have to handle the edge case
|
||||
@@ -106,7 +107,7 @@ pub(super) async fn upload_metrics_bucket(
|
||||
|
||||
// Serialize and write into compressed buffer
|
||||
let started_at = std::time::Instant::now();
|
||||
for res in serialize_in_chunks(CHUNK_SIZE, metrics, node_id) {
|
||||
for res in serialize_in_chunks(CHUNK_SIZE, metrics, idempotency_keys) {
|
||||
let (_chunk, body) = res?;
|
||||
gzip_writer.write_all(&body).await?;
|
||||
}
|
||||
@@ -134,29 +135,31 @@ pub(super) async fn upload_metrics_bucket(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// The return type is quite ugly, but we gain testability in isolation
|
||||
fn serialize_in_chunks<'a, F>(
|
||||
/// Serializes the input metrics as JSON in chunks of chunk_size. The provided
|
||||
/// idempotency keys are injected into the corresponding metric events (reused
|
||||
/// across different metrics sinks), and must have the same length as input.
|
||||
fn serialize_in_chunks<'a>(
|
||||
chunk_size: usize,
|
||||
input: &'a [RawMetric],
|
||||
factory: F,
|
||||
idempotency_keys: &'a [IdempotencyKey<'a>],
|
||||
) -> impl ExactSizeIterator<Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>> + 'a
|
||||
where
|
||||
F: KeyGen<'a> + 'a,
|
||||
{
|
||||
use bytes::BufMut;
|
||||
|
||||
struct Iter<'a, F> {
|
||||
assert_eq!(input.len(), idempotency_keys.len());
|
||||
|
||||
struct Iter<'a> {
|
||||
inner: std::slice::Chunks<'a, RawMetric>,
|
||||
idempotency_keys: std::slice::Iter<'a, IdempotencyKey<'a>>,
|
||||
chunk_size: usize,
|
||||
|
||||
// write to a BytesMut so that we can cheaply clone the frozen Bytes for retries
|
||||
buffer: bytes::BytesMut,
|
||||
// chunk amount of events are reused to produce the serialized document
|
||||
scratch: Vec<Event<Ids, Name>>,
|
||||
factory: F,
|
||||
}
|
||||
|
||||
impl<'a, F: KeyGen<'a>> Iterator for Iter<'a, F> {
|
||||
impl<'a> Iterator for Iter<'a> {
|
||||
type Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
@@ -167,17 +170,14 @@ where
|
||||
self.scratch.extend(
|
||||
chunk
|
||||
.iter()
|
||||
.map(|raw_metric| raw_metric.as_event(&self.factory.generate())),
|
||||
.zip(&mut self.idempotency_keys)
|
||||
.map(|(raw_metric, key)| raw_metric.as_event(key)),
|
||||
);
|
||||
} else {
|
||||
// next rounds: update_in_place to reuse allocations
|
||||
assert_eq!(self.scratch.len(), self.chunk_size);
|
||||
self.scratch
|
||||
.iter_mut()
|
||||
.zip(chunk.iter())
|
||||
.for_each(|(slot, raw_metric)| {
|
||||
raw_metric.update_in_place(slot, &self.factory.generate())
|
||||
});
|
||||
itertools::izip!(self.scratch.iter_mut(), chunk, &mut self.idempotency_keys)
|
||||
.for_each(|(slot, raw_metric, key)| raw_metric.update_in_place(slot, key));
|
||||
}
|
||||
|
||||
let res = serde_json::to_writer(
|
||||
@@ -198,18 +198,19 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, F: KeyGen<'a>> ExactSizeIterator for Iter<'a, F> {}
|
||||
impl<'a> ExactSizeIterator for Iter<'a> {}
|
||||
|
||||
let buffer = bytes::BytesMut::new();
|
||||
let inner = input.chunks(chunk_size);
|
||||
let idempotency_keys = idempotency_keys.iter();
|
||||
let scratch = Vec::new();
|
||||
|
||||
Iter {
|
||||
inner,
|
||||
idempotency_keys,
|
||||
chunk_size,
|
||||
buffer,
|
||||
scratch,
|
||||
factory,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -268,7 +269,7 @@ impl RawMetricExt for RawMetric {
|
||||
}
|
||||
}
|
||||
|
||||
trait KeyGen<'a>: Copy {
|
||||
pub(crate) trait KeyGen<'a> {
|
||||
fn generate(&self) -> IdempotencyKey<'a>;
|
||||
}
|
||||
|
||||
@@ -389,7 +390,10 @@ mod tests {
|
||||
let examples = metric_samples();
|
||||
assert!(examples.len() > 1);
|
||||
|
||||
let factory = FixedGen::new(Utc::now(), "1", 42);
|
||||
let now = Utc::now();
|
||||
let idempotency_keys = (0..examples.len())
|
||||
.map(|i| FixedGen::new(now, "1", i as u16).generate())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
// need to use Event here because serde_json::Value uses default hashmap, not linked
|
||||
// hashmap
|
||||
@@ -398,13 +402,13 @@ mod tests {
|
||||
events: Vec<Event<Ids, Name>>,
|
||||
}
|
||||
|
||||
let correct = serialize_in_chunks(examples.len(), &examples, factory)
|
||||
let correct = serialize_in_chunks(examples.len(), &examples, &idempotency_keys)
|
||||
.map(|res| res.unwrap().1)
|
||||
.flat_map(|body| serde_json::from_slice::<EventChunk>(&body).unwrap().events)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
for chunk_size in 1..examples.len() {
|
||||
let actual = serialize_in_chunks(chunk_size, &examples, factory)
|
||||
let actual = serialize_in_chunks(chunk_size, &examples, &idempotency_keys)
|
||||
.map(|res| res.unwrap().1)
|
||||
.flat_map(|body| serde_json::from_slice::<EventChunk>(&body).unwrap().events)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
@@ -105,8 +105,10 @@ pub struct RequestContext {
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
|
||||
pub enum PageContentKind {
|
||||
Unknown,
|
||||
DeltaLayerSummary,
|
||||
DeltaLayerBtreeNode,
|
||||
DeltaLayerValue,
|
||||
ImageLayerSummary,
|
||||
ImageLayerBtreeNode,
|
||||
ImageLayerValue,
|
||||
InMemoryLayer,
|
||||
|
||||
@@ -141,12 +141,18 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
|
||||
m.other
|
||||
);
|
||||
|
||||
let az_id = m
|
||||
.other
|
||||
.get("availability_zone_id")
|
||||
.and_then(|jv| jv.as_str().map(|str| str.to_owned()));
|
||||
|
||||
Some(NodeRegisterRequest {
|
||||
node_id: conf.id,
|
||||
listen_pg_addr: m.postgres_host,
|
||||
listen_pg_port: m.postgres_port,
|
||||
listen_http_addr: m.http_host,
|
||||
listen_http_port: m.http_port,
|
||||
availability_zone_id: az_id,
|
||||
})
|
||||
}
|
||||
Err(e) => {
|
||||
|
||||
@@ -318,6 +318,27 @@ impl From<crate::tenant::DeleteTimelineError> for ApiError {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<crate::tenant::TimelineArchivalError> for ApiError {
|
||||
fn from(value: crate::tenant::TimelineArchivalError) -> Self {
|
||||
use crate::tenant::TimelineArchivalError::*;
|
||||
match value {
|
||||
NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()),
|
||||
Timeout => ApiError::Timeout("hit pageserver internal timeout".into()),
|
||||
e @ HasArchivedParent(_) => {
|
||||
ApiError::PreconditionFailed(e.to_string().into_boxed_str())
|
||||
}
|
||||
HasUnarchivedChildren(children) => ApiError::PreconditionFailed(
|
||||
format!(
|
||||
"Cannot archive timeline which has non-archived child timelines: {children:?}"
|
||||
)
|
||||
.into_boxed_str(),
|
||||
),
|
||||
a @ AlreadyInProgress => ApiError::Conflict(a.to_string()),
|
||||
Other(e) => ApiError::InternalServerError(e),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
|
||||
fn from(value: crate::tenant::mgr::DeleteTimelineError) -> Self {
|
||||
use crate::tenant::mgr::DeleteTimelineError::*;
|
||||
@@ -405,6 +426,8 @@ async fn build_timeline_info_common(
|
||||
let current_logical_size = timeline.get_current_logical_size(logical_size_task_priority, ctx);
|
||||
let current_physical_size = Some(timeline.layer_size_sum().await);
|
||||
let state = timeline.current_state();
|
||||
// Report is_archived = false if the timeline is still loading
|
||||
let is_archived = timeline.is_archived().unwrap_or(false);
|
||||
let remote_consistent_lsn_projected = timeline
|
||||
.get_remote_consistent_lsn_projected()
|
||||
.unwrap_or(Lsn(0));
|
||||
@@ -445,6 +468,7 @@ async fn build_timeline_info_common(
|
||||
pg_version: timeline.pg_version,
|
||||
|
||||
state,
|
||||
is_archived,
|
||||
|
||||
walreceiver_status,
|
||||
|
||||
@@ -686,9 +710,7 @@ async fn timeline_archival_config_handler(
|
||||
|
||||
tenant
|
||||
.apply_timeline_archival_config(timeline_id, request_data.state)
|
||||
.await
|
||||
.context("applying archival config")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
.await?;
|
||||
Ok::<_, ApiError>(())
|
||||
}
|
||||
.instrument(info_span!("timeline_archival_config",
|
||||
@@ -852,7 +874,10 @@ async fn get_timestamp_of_lsn_handler(
|
||||
|
||||
match result {
|
||||
Some(time) => {
|
||||
let time = format_rfc3339(postgres_ffi::from_pg_timestamp(time)).to_string();
|
||||
let time = format_rfc3339(
|
||||
postgres_ffi::try_from_pg_timestamp(time).map_err(ApiError::InternalServerError)?,
|
||||
)
|
||||
.to_string();
|
||||
json_response(StatusCode::OK, time)
|
||||
}
|
||||
None => Err(ApiError::NotFound(
|
||||
@@ -1706,13 +1731,12 @@ async fn timeline_compact_handler(
|
||||
flags |= CompactFlags::ForceImageLayerCreation;
|
||||
}
|
||||
if Some(true) == parse_query_param::<_, bool>(&request, "enhanced_gc_bottom_most_compaction")? {
|
||||
if !cfg!(feature = "testing") {
|
||||
return Err(ApiError::InternalServerError(anyhow!(
|
||||
"enhanced_gc_bottom_most_compaction is only available in testing mode"
|
||||
)));
|
||||
}
|
||||
flags |= CompactFlags::EnhancedGcBottomMostCompaction;
|
||||
}
|
||||
if Some(true) == parse_query_param::<_, bool>(&request, "dry_run")? {
|
||||
flags |= CompactFlags::DryRun;
|
||||
}
|
||||
|
||||
let wait_until_uploaded =
|
||||
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
|
||||
|
||||
@@ -2330,6 +2354,20 @@ async fn put_io_engine_handler(
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn put_io_alignment_handler(
|
||||
mut r: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&r, None)?;
|
||||
let align: usize = json_request(&mut r).await?;
|
||||
crate::virtual_file::set_io_buffer_alignment(align).map_err(|align| {
|
||||
ApiError::PreconditionFailed(
|
||||
format!("Requested io alignment ({align}) is not a power of two").into(),
|
||||
)
|
||||
})?;
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
/// Polled by control plane.
|
||||
///
|
||||
/// See [`crate::utilization`].
|
||||
@@ -2942,7 +2980,7 @@ pub fn make_router(
|
||||
)
|
||||
.put(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
|
||||
|r| testing_api_handler("run timeline compaction", r, timeline_compact_handler),
|
||||
|r| api_handler(r, timeline_compact_handler),
|
||||
)
|
||||
.put(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint",
|
||||
@@ -3017,6 +3055,9 @@ pub fn make_router(
|
||||
|r| api_handler(r, timeline_collect_keyspace),
|
||||
)
|
||||
.put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
|
||||
.put("/v1/io_alignment", |r| {
|
||||
api_handler(r, put_io_alignment_handler)
|
||||
})
|
||||
.put(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch",
|
||||
|r| api_handler(r, force_aux_policy_switch_handler),
|
||||
|
||||
@@ -1,15 +1,10 @@
|
||||
use std::{num::NonZeroUsize, sync::Arc};
|
||||
|
||||
use crate::tenant::ephemeral_file;
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)]
|
||||
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
|
||||
pub enum L0FlushConfig {
|
||||
PageCached,
|
||||
#[serde(rename_all = "snake_case")]
|
||||
Direct {
|
||||
max_concurrency: NonZeroUsize,
|
||||
},
|
||||
Direct { max_concurrency: NonZeroUsize },
|
||||
}
|
||||
|
||||
impl Default for L0FlushConfig {
|
||||
@@ -25,14 +20,12 @@ impl Default for L0FlushConfig {
|
||||
pub struct L0FlushGlobalState(Arc<Inner>);
|
||||
|
||||
pub enum Inner {
|
||||
PageCached,
|
||||
Direct { semaphore: tokio::sync::Semaphore },
|
||||
}
|
||||
|
||||
impl L0FlushGlobalState {
|
||||
pub fn new(config: L0FlushConfig) -> Self {
|
||||
match config {
|
||||
L0FlushConfig::PageCached => Self(Arc::new(Inner::PageCached)),
|
||||
L0FlushConfig::Direct { max_concurrency } => {
|
||||
let semaphore = tokio::sync::Semaphore::new(max_concurrency.get());
|
||||
Self(Arc::new(Inner::Direct { semaphore }))
|
||||
@@ -44,13 +37,3 @@ impl L0FlushGlobalState {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl L0FlushConfig {
|
||||
pub(crate) fn prewarm_on_write(&self) -> ephemeral_file::PrewarmPageCacheOnWrite {
|
||||
use L0FlushConfig::*;
|
||||
match self {
|
||||
PageCached => ephemeral_file::PrewarmPageCacheOnWrite::Yes,
|
||||
Direct { .. } => ephemeral_file::PrewarmPageCacheOnWrite::No,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,6 +16,7 @@ pub mod l0_flush;
|
||||
use futures::{stream::FuturesUnordered, StreamExt};
|
||||
pub use pageserver_api::keyspace;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
mod assert_u64_eq_usize;
|
||||
pub mod aux_file;
|
||||
pub mod metrics;
|
||||
pub mod page_cache;
|
||||
@@ -31,6 +32,7 @@ pub mod virtual_file;
|
||||
pub mod walingest;
|
||||
pub mod walrecord;
|
||||
pub mod walredo;
|
||||
pub mod pg_import;
|
||||
|
||||
use camino::Utf8Path;
|
||||
use deletion_queue::DeletionQueue;
|
||||
@@ -49,7 +51,7 @@ use tracing::{info, info_span};
|
||||
/// backwards-compatible changes to the metadata format.
|
||||
pub const STORAGE_FORMAT_VERSION: u16 = 3;
|
||||
|
||||
pub const DEFAULT_PG_VERSION: u32 = 15;
|
||||
pub const DEFAULT_PG_VERSION: u32 = 16;
|
||||
|
||||
// Magic constants used to identify different kinds of files
|
||||
pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
|
||||
@@ -88,6 +90,8 @@ pub async fn shutdown_pageserver(
|
||||
) {
|
||||
use std::time::Duration;
|
||||
|
||||
let started_at = std::time::Instant::now();
|
||||
|
||||
// If the orderly shutdown below takes too long, we still want to make
|
||||
// sure that all walredo processes are killed and wait()ed on by us, not systemd.
|
||||
//
|
||||
@@ -241,7 +245,10 @@ pub async fn shutdown_pageserver(
|
||||
walredo_extraordinary_shutdown_thread.join().unwrap();
|
||||
info!("walredo_extraordinary_shutdown_thread done");
|
||||
|
||||
info!("Shut down successfully completed");
|
||||
info!(
|
||||
elapsed_ms = started_at.elapsed().as_millis(),
|
||||
"Shut down successfully completed"
|
||||
);
|
||||
std::process::exit(exit_code);
|
||||
}
|
||||
|
||||
|
||||
@@ -1552,7 +1552,6 @@ pub(crate) static LIVE_CONNECTIONS: Lazy<IntCounterPairVec> = Lazy::new(|| {
|
||||
#[derive(Clone, Copy, enum_map::Enum, IntoStaticStr)]
|
||||
pub(crate) enum ComputeCommandKind {
|
||||
PageStreamV2,
|
||||
PageStream,
|
||||
Basebackup,
|
||||
Fullbackup,
|
||||
LeaseLsn,
|
||||
@@ -1803,6 +1802,23 @@ pub(crate) static SECONDARY_RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::n
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static NODE_UTILIZATION_SCORE: Lazy<UIntGauge> = Lazy::new(|| {
|
||||
register_uint_gauge!(
|
||||
"pageserver_utilization_score",
|
||||
"The utilization score we report to the storage controller for scheduling, where 0 is empty, 1000000 is full, and anything above is considered overloaded",
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static SECONDARY_HEATMAP_TOTAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_secondary_heatmap_total_size",
|
||||
"The total size in bytes of all layers in the most recently downloaded heatmap.",
|
||||
&["tenant_id", "shard_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub enum RemoteOpKind {
|
||||
Upload,
|
||||
@@ -1853,16 +1869,64 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
.expect("Failed to register tenant_task_events metric")
|
||||
});
|
||||
|
||||
pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
|
||||
register_int_counter_pair_vec!(
|
||||
"pageserver_background_loop_semaphore_wait_start_count",
|
||||
"Counter for background loop concurrency-limiting semaphore acquire calls started",
|
||||
"pageserver_background_loop_semaphore_wait_finish_count",
|
||||
"Counter for background loop concurrency-limiting semaphore acquire calls finished",
|
||||
&["task"],
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
pub struct BackgroundLoopSemaphoreMetrics {
|
||||
counters: EnumMap<BackgroundLoopKind, IntCounterPair>,
|
||||
durations: EnumMap<BackgroundLoopKind, Counter>,
|
||||
}
|
||||
|
||||
pub(crate) static BACKGROUND_LOOP_SEMAPHORE: Lazy<BackgroundLoopSemaphoreMetrics> = Lazy::new(
|
||||
|| {
|
||||
let counters = register_int_counter_pair_vec!(
|
||||
"pageserver_background_loop_semaphore_wait_start_count",
|
||||
"Counter for background loop concurrency-limiting semaphore acquire calls started",
|
||||
"pageserver_background_loop_semaphore_wait_finish_count",
|
||||
"Counter for background loop concurrency-limiting semaphore acquire calls finished",
|
||||
&["task"],
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let durations = register_counter_vec!(
|
||||
"pageserver_background_loop_semaphore_wait_duration_seconds",
|
||||
"Sum of wall clock time spent waiting on the background loop concurrency-limiting semaphore acquire calls",
|
||||
&["task"],
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
BackgroundLoopSemaphoreMetrics {
|
||||
counters: enum_map::EnumMap::from_array(std::array::from_fn(|i| {
|
||||
let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
|
||||
counters.with_label_values(&[kind.into()])
|
||||
})),
|
||||
durations: enum_map::EnumMap::from_array(std::array::from_fn(|i| {
|
||||
let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
|
||||
durations.with_label_values(&[kind.into()])
|
||||
})),
|
||||
}
|
||||
},
|
||||
);
|
||||
|
||||
impl BackgroundLoopSemaphoreMetrics {
|
||||
pub(crate) fn measure_acquisition(&self, task: BackgroundLoopKind) -> impl Drop + '_ {
|
||||
struct Record<'a> {
|
||||
metrics: &'a BackgroundLoopSemaphoreMetrics,
|
||||
task: BackgroundLoopKind,
|
||||
_counter_guard: metrics::IntCounterPairGuard,
|
||||
start: Instant,
|
||||
}
|
||||
impl Drop for Record<'_> {
|
||||
fn drop(&mut self) {
|
||||
let elapsed = self.start.elapsed().as_secs_f64();
|
||||
self.metrics.durations[self.task].inc_by(elapsed);
|
||||
}
|
||||
}
|
||||
Record {
|
||||
metrics: self,
|
||||
task,
|
||||
_counter_guard: self.counters[task].guard(),
|
||||
start: Instant::now(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
@@ -2544,6 +2608,7 @@ use std::time::{Duration, Instant};
|
||||
use crate::context::{PageContentKind, RequestContext};
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::mgr::TenantSlot;
|
||||
use crate::tenant::tasks::BackgroundLoopKind;
|
||||
|
||||
/// Maintain a per timeline gauge in addition to the global gauge.
|
||||
pub(crate) struct PerTimelineRemotePhysicalSizeGauge {
|
||||
|
||||
@@ -557,7 +557,7 @@ impl PageServerHandler {
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
protocol_version: PagestreamProtocolVersion,
|
||||
_protocol_version: PagestreamProtocolVersion,
|
||||
ctx: RequestContext,
|
||||
) -> Result<(), QueryError>
|
||||
where
|
||||
@@ -601,8 +601,7 @@ impl PageServerHandler {
|
||||
fail::fail_point!("ps::handle-pagerequest-message");
|
||||
|
||||
// parse request
|
||||
let neon_fe_msg =
|
||||
PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?;
|
||||
let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;
|
||||
|
||||
// invoke handler function
|
||||
let (handler_result, span) = match neon_fe_msg {
|
||||
@@ -754,16 +753,21 @@ impl PageServerHandler {
|
||||
}
|
||||
|
||||
if request_lsn < **latest_gc_cutoff_lsn {
|
||||
// Check explicitly for INVALID just to get a less scary error message if the
|
||||
// request is obviously bogus
|
||||
return Err(if request_lsn == Lsn::INVALID {
|
||||
PageStreamError::BadRequest("invalid LSN(0) in request".into())
|
||||
} else {
|
||||
PageStreamError::BadRequest(format!(
|
||||
let gc_info = &timeline.gc_info.read().unwrap();
|
||||
if !gc_info.leases.contains_key(&request_lsn) {
|
||||
// The requested LSN is below gc cutoff and is not guarded by a lease.
|
||||
|
||||
// Check explicitly for INVALID just to get a less scary error message if the
|
||||
// request is obviously bogus
|
||||
return Err(if request_lsn == Lsn::INVALID {
|
||||
PageStreamError::BadRequest("invalid LSN(0) in request".into())
|
||||
} else {
|
||||
PageStreamError::BadRequest(format!(
|
||||
"tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
|
||||
request_lsn, **latest_gc_cutoff_lsn
|
||||
).into())
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Wait for WAL up to 'not_modified_since' to arrive, if necessary
|
||||
@@ -790,6 +794,8 @@ impl PageServerHandler {
|
||||
}
|
||||
}
|
||||
|
||||
/// Handles the lsn lease request.
|
||||
/// If a lease cannot be obtained, the client will receive NULL.
|
||||
#[instrument(skip_all, fields(shard_id, %lsn))]
|
||||
async fn handle_make_lsn_lease<IO>(
|
||||
&mut self,
|
||||
@@ -812,19 +818,25 @@ impl PageServerHandler {
|
||||
.await?;
|
||||
set_tracing_field_shard_id(&timeline);
|
||||
|
||||
let lease = timeline.make_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx)?;
|
||||
let valid_until = lease
|
||||
.valid_until
|
||||
.duration_since(SystemTime::UNIX_EPOCH)
|
||||
.map_err(|e| QueryError::Other(e.into()))?;
|
||||
let lease = timeline
|
||||
.make_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx)
|
||||
.inspect_err(|e| {
|
||||
warn!("{e}");
|
||||
})
|
||||
.ok();
|
||||
let valid_until_str = lease.map(|l| {
|
||||
l.valid_until
|
||||
.duration_since(SystemTime::UNIX_EPOCH)
|
||||
.expect("valid_until is earlier than UNIX_EPOCH")
|
||||
.as_millis()
|
||||
.to_string()
|
||||
});
|
||||
let bytes = valid_until_str.as_ref().map(|x| x.as_bytes());
|
||||
|
||||
pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col(
|
||||
b"valid_until",
|
||||
)]))?
|
||||
.write_message_noflush(&BeMessage::DataRow(&[Some(
|
||||
&valid_until.as_millis().to_be_bytes(),
|
||||
)]))?
|
||||
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
.write_message_noflush(&BeMessage::DataRow(&[bytes]))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1275,35 +1287,6 @@ where
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
} else if let Some(params) = parts.strip_prefix(&["pagestream"]) {
|
||||
if params.len() != 2 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for pagestream command"
|
||||
)));
|
||||
}
|
||||
let tenant_id = TenantId::from_str(params[0])
|
||||
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
|
||||
let timeline_id = TimelineId::from_str(params[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
||||
|
||||
tracing::Span::current()
|
||||
.record("tenant_id", field::display(tenant_id))
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::PageStream)
|
||||
.inc();
|
||||
|
||||
self.handle_pagerequests(
|
||||
pgb,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
PagestreamProtocolVersion::V1,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
} else if let Some(params) = parts.strip_prefix(&["basebackup"]) {
|
||||
if params.len() < 2 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
|
||||
650
pageserver/src/pg_import.rs
Normal file
650
pageserver/src/pg_import.rs
Normal file
@@ -0,0 +1,650 @@
|
||||
use std::fs::metadata;
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use bytes::Bytes;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::{key::{rel_block_to_key, rel_dir_to_key, rel_size_to_key, relmap_file_key, DBDIR_KEY}, reltag::RelTag};
|
||||
use postgres_ffi::{pg_constants, relfile_utils::parse_relfilename, ControlFileData, BLCKSZ};
|
||||
use tokio::{io::AsyncRead, task::{self, JoinHandle}};
|
||||
use tracing::debug;
|
||||
use utils::{id::{NodeId, TenantId, TimelineId}, shard::{ShardCount, ShardNumber, TenantShardId}};
|
||||
use walkdir::WalkDir;
|
||||
|
||||
use crate::{context::{DownloadBehavior, RequestContext}, pgdatadir_mapping::{DbDirectory, RelDirectory}, task_mgr::TaskKind, tenant::storage_layer::ImageLayerWriter};
|
||||
use crate::pgdatadir_mapping::{SlruSegmentDirectory, TwoPhaseDirectory};
|
||||
use crate::config::PageServerConf;
|
||||
use tokio::io::AsyncReadExt;
|
||||
|
||||
use crate::tenant::storage_layer::PersistentLayerDesc;
|
||||
use utils::generation::Generation;
|
||||
use utils::lsn::Lsn;
|
||||
use crate::tenant::IndexPart;
|
||||
use crate::tenant::metadata::TimelineMetadata;
|
||||
use crate::tenant::remote_timeline_client;
|
||||
use crate::tenant::remote_timeline_client::LayerFileMetadata;
|
||||
use pageserver_api::shard::ShardIndex;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::keyspace::{is_contiguous_range, contiguous_range_len};
|
||||
use pageserver_api::keyspace::singleton_range;
|
||||
use pageserver_api::reltag::SlruKind;
|
||||
use pageserver_api::key::{slru_block_to_key, slru_dir_to_key, slru_segment_size_to_key, TWOPHASEDIR_KEY, CONTROLFILE_KEY, CHECKPOINT_KEY};
|
||||
use utils::bin_ser::BeSer;
|
||||
|
||||
use std::collections::HashSet;
|
||||
use std::ops::Range;
|
||||
|
||||
pub struct PgImportEnv {
|
||||
conf: &'static PageServerConf,
|
||||
tli: TimelineId,
|
||||
tsi: TenantShardId,
|
||||
|
||||
pgdata_lsn: Lsn,
|
||||
|
||||
tasks: Vec<AnyImportTask>,
|
||||
|
||||
layers: Vec<PersistentLayerDesc>,
|
||||
}
|
||||
|
||||
impl PgImportEnv {
|
||||
|
||||
pub async fn init(dstdir: &Utf8Path, tenant_id: TenantId, timeline_id: TimelineId) -> anyhow::Result<PgImportEnv> {
|
||||
let config = toml_edit::Document::new();
|
||||
let conf = PageServerConf::parse_and_validate(
|
||||
NodeId(42),
|
||||
&config,
|
||||
dstdir
|
||||
)?;
|
||||
let conf = Box::leak(Box::new(conf));
|
||||
|
||||
let tsi = TenantShardId {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber(0),
|
||||
shard_count: ShardCount(0),
|
||||
};
|
||||
|
||||
Ok(PgImportEnv {
|
||||
conf,
|
||||
tli: timeline_id,
|
||||
tsi,
|
||||
pgdata_lsn: Lsn(0), // Will be filled in later, when the control file is imported
|
||||
|
||||
tasks: Vec::new(),
|
||||
layers: Vec::new(),
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn import_datadir(&mut self, pgdata_path: &Utf8PathBuf) -> anyhow::Result<()> {
|
||||
// Read control file
|
||||
let controlfile_path = pgdata_path.join("global").join("pg_control");
|
||||
let controlfile_buf = std::fs::read(&controlfile_path)
|
||||
.with_context(|| format!("reading controlfile: {controlfile_path}"))?;
|
||||
let control_file = ControlFileData::decode(&controlfile_buf)?;
|
||||
|
||||
let pgdata_lsn = Lsn(control_file.checkPoint).align();
|
||||
let timeline_path = self.conf.timeline_path(&self.tsi, &self.tli);
|
||||
|
||||
println!("Importing {pgdata_path} to {timeline_path} as lsn {pgdata_lsn}...");
|
||||
self.pgdata_lsn = pgdata_lsn;
|
||||
|
||||
let datadir = PgDataDir::new(pgdata_path);
|
||||
|
||||
// Import dbdir (00:00:00 keyspace)
|
||||
// This is just constructed here, but will be written to the image layer in the first call to import_db()
|
||||
let dbdir_buf = Bytes::from(DbDirectory::ser(&DbDirectory {
|
||||
dbdirs: datadir.dbs.iter().map(|db| ((db.spcnode, db.dboid), true)).collect(),
|
||||
})?);
|
||||
self.tasks.push(ImportSingleKeyTask::new(DBDIR_KEY, dbdir_buf).into());
|
||||
|
||||
// Import databases (00:spcnode:dbnode keyspace for each db)
|
||||
for db in datadir.dbs {
|
||||
self.import_db(&db).await?;
|
||||
}
|
||||
|
||||
// Import SLRUs
|
||||
|
||||
// pg_xact (01:00 keyspace)
|
||||
self.import_slru(SlruKind::Clog, &pgdata_path.join("pg_xact")).await?;
|
||||
// pg_multixact/members (01:01 keyspace)
|
||||
self.import_slru(SlruKind::MultiXactMembers, &pgdata_path.join("pg_multixact/members")).await?;
|
||||
// pg_multixact/offsets (01:02 keyspace)
|
||||
self.import_slru(SlruKind::MultiXactOffsets, &pgdata_path.join("pg_multixact/offsets")).await?;
|
||||
|
||||
// Import pg_twophase.
|
||||
// TODO: as empty
|
||||
let twophasedir_buf = TwoPhaseDirectory::ser(
|
||||
&TwoPhaseDirectory { xids: HashSet::new() }
|
||||
)?;
|
||||
self.tasks.push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(TWOPHASEDIR_KEY, Bytes::from(twophasedir_buf))));
|
||||
|
||||
// Controlfile, checkpoint
|
||||
self.tasks.push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(CONTROLFILE_KEY, Bytes::from(controlfile_buf))));
|
||||
|
||||
let checkpoint_buf = control_file.checkPointCopy.encode()?;
|
||||
self.tasks.push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(CHECKPOINT_KEY, checkpoint_buf)));
|
||||
|
||||
// Assigns parts of key space to later parallel jobs
|
||||
let mut last_end_key = Key::MIN;
|
||||
let mut current_chunk = Vec::new();
|
||||
let mut current_chunk_size: usize = 0;
|
||||
let mut parallel_jobs = Vec::new();
|
||||
for task in std::mem::take(&mut self.tasks).into_iter() {
|
||||
if current_chunk_size + task.total_size() > 1024*1024*1024 {
|
||||
let key_range = last_end_key..task.key_range().start;
|
||||
parallel_jobs.push(ChunkProcessingJob::new(
|
||||
key_range.clone(),
|
||||
std::mem::take(&mut current_chunk),
|
||||
self
|
||||
));
|
||||
last_end_key = key_range.end;
|
||||
current_chunk_size = 0;
|
||||
}
|
||||
current_chunk_size += task.total_size();
|
||||
current_chunk.push(task);
|
||||
}
|
||||
parallel_jobs.push(ChunkProcessingJob::new(
|
||||
last_end_key..Key::NON_L0_MAX,
|
||||
current_chunk,
|
||||
self
|
||||
));
|
||||
|
||||
// Start all jobs simultaneosly
|
||||
// TODO: semaphore?
|
||||
let mut handles = vec![];
|
||||
for job in parallel_jobs {
|
||||
let handle: JoinHandle<anyhow::Result<PersistentLayerDesc>> = task::spawn(async move {
|
||||
let layerdesc = job.run().await?;
|
||||
Ok(layerdesc)
|
||||
});
|
||||
handles.push(handle);
|
||||
}
|
||||
|
||||
// Wait for all jobs to complete
|
||||
for handle in handles {
|
||||
let layerdesc = handle.await??;
|
||||
self.layers.push(layerdesc);
|
||||
}
|
||||
|
||||
// Create index_part.json file
|
||||
self.create_index_part(&control_file).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn import_db(
|
||||
&mut self,
|
||||
db: &PgDataDirDb,
|
||||
) -> anyhow::Result<()> {
|
||||
debug!(
|
||||
"Importing database (path={}, tablespace={}, dboid={})",
|
||||
db.path, db.spcnode, db.dboid
|
||||
);
|
||||
|
||||
// Import relmap (00:spcnode:dbnode:00:*:00)
|
||||
let relmap_key = relmap_file_key(db.spcnode, db.dboid);
|
||||
debug!("Constructing relmap entry, key {relmap_key}");
|
||||
let mut relmap_file = tokio::fs::File::open(&db.path.join("pg_filenode.map")).await?;
|
||||
let relmap_buf = read_all_bytes(&mut relmap_file).await?;
|
||||
self.tasks.push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(relmap_key, relmap_buf)));
|
||||
|
||||
// Import reldir (00:spcnode:dbnode:00:*:01)
|
||||
let reldir_key = rel_dir_to_key(db.spcnode, db.dboid);
|
||||
debug!("Constructing reldirs entry, key {reldir_key}");
|
||||
let reldir_buf = RelDirectory::ser(&RelDirectory {
|
||||
rels: db.files.iter().map(|f| (f.rel_tag.relnode, f.rel_tag.forknum)).collect(),
|
||||
})?;
|
||||
self.tasks.push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(reldir_key, Bytes::from(reldir_buf))));
|
||||
|
||||
// Import data (00:spcnode:dbnode:reloid:fork:blk) and set sizes for each last
|
||||
// segment in a given relation (00:spcnode:dbnode:reloid:fork:ff)
|
||||
for file in &db.files {
|
||||
let len = metadata(&file.path)?.len() as usize;
|
||||
ensure!(len % 8192 == 0);
|
||||
let start_blk: u32 = file.segno * (1024 * 1024 * 1024 / 8192);
|
||||
let start_key = rel_block_to_key(file.rel_tag, start_blk);
|
||||
let end_key = rel_block_to_key(file.rel_tag, start_blk + (len / 8192) as u32);
|
||||
self.tasks.push(AnyImportTask::RelBlocks(ImportRelBlocksTask::new(start_key..end_key, &file.path)));
|
||||
|
||||
// Set relsize for the last segment (00:spcnode:dbnode:reloid:fork:ff)
|
||||
if let Some(nblocks) = file.nblocks {
|
||||
let size_key = rel_size_to_key(file.rel_tag);
|
||||
//debug!("Setting relation size (path={path}, rel_tag={rel_tag}, segno={segno}) to {nblocks}, key {size_key}");
|
||||
let buf = nblocks.to_le_bytes();
|
||||
self.tasks.push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(size_key, Bytes::from(buf.to_vec()))));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn import_slru(
|
||||
&mut self,
|
||||
kind: SlruKind,
|
||||
path: &Utf8PathBuf,
|
||||
) -> anyhow::Result<()> {
|
||||
let segments: Vec<(String, u32)> = WalkDir::new(path)
|
||||
.max_depth(1)
|
||||
.into_iter()
|
||||
.filter_map(|entry| {
|
||||
let entry = entry.ok()?;
|
||||
let filename = entry.file_name();
|
||||
let filename = filename.to_string_lossy();
|
||||
let segno = u32::from_str_radix(&filename, 16).ok()?;
|
||||
Some((filename.to_string(), segno))
|
||||
}).collect();
|
||||
|
||||
// Write SlruDir
|
||||
let slrudir_key = slru_dir_to_key(kind);
|
||||
let segnos: HashSet<u32> = segments.iter().map(|(_path, segno)| { *segno }).collect();
|
||||
let slrudir = SlruSegmentDirectory {
|
||||
segments: segnos,
|
||||
};
|
||||
let slrudir_buf = SlruSegmentDirectory::ser(&slrudir)?;
|
||||
self.tasks.push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(slrudir_key, Bytes::from(slrudir_buf))));
|
||||
|
||||
for (segpath, segno) in segments {
|
||||
// SlruSegBlocks for each segment
|
||||
let p = path.join(Utf8PathBuf::from(segpath));
|
||||
let file_size = std::fs::metadata(&p)?.len();
|
||||
ensure!(file_size % 8192 == 0);
|
||||
let nblocks = u32::try_from(file_size / 8192)?;
|
||||
let start_key = slru_block_to_key(kind, segno, 0);
|
||||
let end_key = slru_block_to_key(kind, segno, nblocks);
|
||||
self.tasks.push(AnyImportTask::SlruBlocks(ImportSlruBlocksTask::new(start_key..end_key, &p)));
|
||||
|
||||
// Followed by SlruSegSize
|
||||
let segsize_key = slru_segment_size_to_key(kind, segno);
|
||||
let segsize_buf = nblocks.to_le_bytes();
|
||||
self.tasks.push(AnyImportTask::SingleKey(ImportSingleKeyTask::new(segsize_key, Bytes::copy_from_slice(&segsize_buf))));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn create_index_part(&mut self, control_file: &ControlFileData) -> anyhow::Result<()> {
|
||||
let dstdir = &self.conf.workdir;
|
||||
|
||||
let pg_version = match control_file.catalog_version_no {
|
||||
// thesea are from catversion.h
|
||||
202107181 => 14,
|
||||
202209061 => 15,
|
||||
202307071 => 16,
|
||||
catversion => { bail!("unrecognized catalog version {catversion}")},
|
||||
};
|
||||
|
||||
let metadata = TimelineMetadata::new(
|
||||
// FIXME: The 'disk_consistent_lsn' should be the LSN at the *end* of the
|
||||
// checkpoint record, and prev_record_lsn should point to its beginning.
|
||||
// We should read the real end of the record from the WAL, but here we
|
||||
// just fake it.
|
||||
Lsn(self.pgdata_lsn.0 + 8),
|
||||
Some(self.pgdata_lsn),
|
||||
None, // no ancestor
|
||||
Lsn(0),
|
||||
self.pgdata_lsn, // latest_gc_cutoff_lsn
|
||||
self.pgdata_lsn, // initdb_lsn
|
||||
pg_version,
|
||||
);
|
||||
let generation = Generation::none();
|
||||
let mut index_part = IndexPart::empty(metadata);
|
||||
|
||||
for l in self.layers.iter() {
|
||||
let name = l.layer_name();
|
||||
let metadata = LayerFileMetadata::new(l.file_size, generation, ShardIndex::unsharded());
|
||||
if let Some(_) = index_part.layer_metadata.insert(name.clone(), metadata) {
|
||||
bail!("duplicate layer filename {name}");
|
||||
}
|
||||
}
|
||||
|
||||
let data = index_part.to_s3_bytes()?;
|
||||
let path = remote_timeline_client::remote_index_path(&self.tsi, &self.tli, generation);
|
||||
let path = dstdir.join(path.get_path());
|
||||
std::fs::write(&path, data)
|
||||
.context("could not write {path}")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// dbdir iteration tools
|
||||
//
|
||||
|
||||
struct PgDataDir {
|
||||
pub dbs: Vec<PgDataDirDb> // spcnode, dboid, path
|
||||
}
|
||||
|
||||
struct PgDataDirDb {
|
||||
pub spcnode: u32,
|
||||
pub dboid: u32,
|
||||
pub path: Utf8PathBuf,
|
||||
pub files: Vec<PgDataDirDbFile>
|
||||
}
|
||||
|
||||
struct PgDataDirDbFile {
|
||||
pub path: Utf8PathBuf,
|
||||
pub rel_tag: RelTag,
|
||||
pub segno: u32,
|
||||
|
||||
// Cummulative size of the given fork, set only for the last segment of that fork
|
||||
pub nblocks: Option<usize>,
|
||||
}
|
||||
|
||||
impl PgDataDir {
|
||||
fn new(datadir_path: &Utf8PathBuf) -> Self {
|
||||
// Import ordinary databases, DEFAULTTABLESPACE_OID is smaller than GLOBALTABLESPACE_OID, so import them first
|
||||
// Traverse database in increasing oid order
|
||||
let mut databases = WalkDir::new(datadir_path.join("base"))
|
||||
.max_depth(1)
|
||||
.into_iter()
|
||||
.filter_map(|entry| {
|
||||
entry.ok().and_then(|path| {
|
||||
path.file_name().to_string_lossy().parse::<u32>().ok()
|
||||
})
|
||||
})
|
||||
.sorted()
|
||||
.map(|dboid| {
|
||||
PgDataDirDb::new(
|
||||
datadir_path.join("base").join(dboid.to_string()),
|
||||
pg_constants::DEFAULTTABLESPACE_OID,
|
||||
dboid,
|
||||
datadir_path
|
||||
)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
// special case for global catalogs
|
||||
databases.push(PgDataDirDb::new(
|
||||
datadir_path.join("global"),
|
||||
postgres_ffi::pg_constants::GLOBALTABLESPACE_OID,
|
||||
0,
|
||||
datadir_path,
|
||||
));
|
||||
|
||||
databases.sort_by_key(|db| (db.spcnode, db.dboid));
|
||||
|
||||
Self {
|
||||
dbs: databases
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PgDataDirDb {
|
||||
fn new(db_path: Utf8PathBuf, spcnode: u32, dboid: u32, datadir_path: &Utf8PathBuf) -> Self {
|
||||
let mut files: Vec<PgDataDirDbFile> = WalkDir::new(&db_path)
|
||||
.min_depth(1)
|
||||
.max_depth(2)
|
||||
.into_iter()
|
||||
.filter_map(|entry| {
|
||||
entry.ok().and_then(|path| {
|
||||
let relfile = path.file_name().to_string_lossy();
|
||||
// returns (relnode, forknum, segno)
|
||||
parse_relfilename(&relfile).ok()
|
||||
})
|
||||
})
|
||||
.sorted()
|
||||
.map(|(relnode, forknum, segno)| {
|
||||
let rel_tag = RelTag {
|
||||
spcnode,
|
||||
dbnode: dboid,
|
||||
relnode,
|
||||
forknum,
|
||||
};
|
||||
|
||||
let path = datadir_path.join(rel_tag.to_segfile_name(segno));
|
||||
let len = metadata(&path).unwrap().len() as usize;
|
||||
assert!(len % BLCKSZ as usize == 0);
|
||||
let nblocks = len / BLCKSZ as usize;
|
||||
|
||||
PgDataDirDbFile {
|
||||
path,
|
||||
rel_tag,
|
||||
segno,
|
||||
nblocks: Some(nblocks), // first non-cummulative sizes
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Set cummulative sizes. Do all of that math here, so that later we could easier
|
||||
// parallelize over segments and know with which segments we need to write relsize
|
||||
// entry.
|
||||
let mut cumulative_nblocks: usize= 0;
|
||||
let mut prev_rel_tag: Option<RelTag> = None;
|
||||
for i in 0..files.len() {
|
||||
if prev_rel_tag == Some(files[i].rel_tag) {
|
||||
cumulative_nblocks += files[i].nblocks.unwrap();
|
||||
} else {
|
||||
cumulative_nblocks = files[i].nblocks.unwrap();
|
||||
}
|
||||
|
||||
files[i].nblocks = if i == files.len() - 1 || files[i+1].rel_tag != files[i].rel_tag {
|
||||
Some(cumulative_nblocks)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
prev_rel_tag = Some(files[i].rel_tag);
|
||||
}
|
||||
|
||||
|
||||
PgDataDirDb {
|
||||
files,
|
||||
path: db_path,
|
||||
spcnode,
|
||||
dboid,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> anyhow::Result<Bytes> {
|
||||
let mut buf: Vec<u8> = vec![];
|
||||
reader.read_to_end(&mut buf).await?;
|
||||
Ok(Bytes::from(buf))
|
||||
}
|
||||
|
||||
trait ImportTask {
|
||||
fn key_range(&self) -> Range<Key>;
|
||||
|
||||
fn total_size(&self) -> usize {
|
||||
if is_contiguous_range(&self.key_range()) {
|
||||
contiguous_range_len(&self.key_range()) as usize * 8192
|
||||
} else {
|
||||
u32::MAX as usize
|
||||
}
|
||||
}
|
||||
|
||||
async fn doit(self, layer_writer: &mut ImageLayerWriter, ctx: &RequestContext) -> anyhow::Result<()>;
|
||||
}
|
||||
|
||||
struct ImportSingleKeyTask {
|
||||
key: Key,
|
||||
buf: Bytes,
|
||||
}
|
||||
|
||||
impl ImportSingleKeyTask {
|
||||
fn new(key: Key, buf: Bytes) -> Self {
|
||||
ImportSingleKeyTask { key, buf }
|
||||
}
|
||||
}
|
||||
|
||||
impl ImportTask for ImportSingleKeyTask {
|
||||
fn key_range(&self) -> Range<Key> {
|
||||
singleton_range(self.key)
|
||||
}
|
||||
|
||||
async fn doit(self, layer_writer: &mut ImageLayerWriter, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
layer_writer.put_image(self.key, self.buf, ctx).await?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
struct ImportRelBlocksTask {
|
||||
key_range: Range<Key>,
|
||||
path: Utf8PathBuf,
|
||||
}
|
||||
|
||||
impl ImportRelBlocksTask {
|
||||
fn new(key_range: Range<Key>, path: &Utf8Path) -> Self {
|
||||
ImportRelBlocksTask {
|
||||
key_range,
|
||||
path: path.into()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ImportTask for ImportRelBlocksTask {
|
||||
fn key_range(&self) -> Range<Key> {
|
||||
self.key_range.clone()
|
||||
}
|
||||
|
||||
async fn doit(self, layer_writer: &mut ImageLayerWriter, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
debug!("Importing relation file {}", self.path);
|
||||
let mut reader = tokio::fs::File::open(&self.path).await?;
|
||||
let mut buf: [u8; 8192] = [0u8; 8192];
|
||||
|
||||
let (rel_tag, start_blk) = self.key_range.start.to_rel_block()?;
|
||||
let (_rel_tag, end_blk) = self.key_range.end.to_rel_block()?;
|
||||
let mut blknum = start_blk;
|
||||
while blknum < end_blk {
|
||||
reader.read_exact(&mut buf).await?;
|
||||
let key = rel_block_to_key(rel_tag.clone(), blknum);
|
||||
layer_writer.put_image(key, Bytes::copy_from_slice(&buf), ctx).await?;
|
||||
blknum += 1;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
struct ImportSlruBlocksTask {
|
||||
key_range: Range<Key>,
|
||||
path: Utf8PathBuf,
|
||||
}
|
||||
|
||||
impl ImportSlruBlocksTask {
|
||||
fn new(key_range: Range<Key>, path: &Utf8Path) -> Self {
|
||||
ImportSlruBlocksTask {
|
||||
key_range,
|
||||
path: path.into()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ImportTask for ImportSlruBlocksTask {
|
||||
fn key_range(&self) -> Range<Key> {
|
||||
self.key_range.clone()
|
||||
}
|
||||
|
||||
async fn doit(self, layer_writer: &mut ImageLayerWriter, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
debug!("Importing SLRU segment file {}", self.path);
|
||||
let mut reader = tokio::fs::File::open(&self.path).await
|
||||
.context(format!("opening {}", &self.path))?;
|
||||
let mut buf: [u8; 8192] = [0u8; 8192];
|
||||
|
||||
let (kind, segno, start_blk) = self.key_range.start.to_slru_block()?;
|
||||
let (_kind, _segno, end_blk) = self.key_range.end.to_slru_block()?;
|
||||
let mut blknum = start_blk;
|
||||
while blknum < end_blk {
|
||||
reader.read_exact(&mut buf).await?;
|
||||
let key = slru_block_to_key(kind, segno, blknum);
|
||||
layer_writer.put_image(key, Bytes::copy_from_slice(&buf), ctx).await?;
|
||||
blknum += 1;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
enum AnyImportTask {
|
||||
SingleKey(ImportSingleKeyTask),
|
||||
RelBlocks(ImportRelBlocksTask),
|
||||
SlruBlocks(ImportSlruBlocksTask),
|
||||
}
|
||||
|
||||
impl ImportTask for AnyImportTask {
|
||||
fn key_range(&self) -> Range<Key> {
|
||||
match self {
|
||||
Self::SingleKey(t) => t.key_range(),
|
||||
Self::RelBlocks(t) => t.key_range(),
|
||||
Self::SlruBlocks(t) => t.key_range()
|
||||
}
|
||||
}
|
||||
async fn doit(self, layer_writer: &mut ImageLayerWriter, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
match self {
|
||||
Self::SingleKey(t) => t.doit(layer_writer, ctx).await,
|
||||
Self::RelBlocks(t) => t.doit(layer_writer, ctx).await,
|
||||
Self::SlruBlocks(t) => t.doit(layer_writer, ctx).await,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<ImportSingleKeyTask> for AnyImportTask {
|
||||
fn from(t: ImportSingleKeyTask) -> Self {
|
||||
Self::SingleKey(t)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<ImportRelBlocksTask> for AnyImportTask {
|
||||
fn from(t: ImportRelBlocksTask) -> Self {
|
||||
Self::RelBlocks(t)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<ImportSlruBlocksTask> for AnyImportTask {
|
||||
fn from(t: ImportSlruBlocksTask) -> Self {
|
||||
Self::SlruBlocks(t)
|
||||
}
|
||||
}
|
||||
|
||||
struct ChunkProcessingJob {
|
||||
range: Range<Key>,
|
||||
tasks: Vec<AnyImportTask>,
|
||||
|
||||
dstdir: Utf8PathBuf,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
pgdata_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl ChunkProcessingJob {
|
||||
fn new(range: Range<Key>, tasks: Vec<AnyImportTask>, env: &PgImportEnv) -> Self {
|
||||
assert!(env.pgdata_lsn.is_valid());
|
||||
Self {
|
||||
range,
|
||||
tasks,
|
||||
dstdir: env.conf.workdir.clone(),
|
||||
tenant_id: env.tsi.tenant_id,
|
||||
timeline_id: env.tli,
|
||||
pgdata_lsn: env.pgdata_lsn,
|
||||
}
|
||||
}
|
||||
|
||||
async fn run(self) -> anyhow::Result<PersistentLayerDesc> {
|
||||
let ctx: RequestContext = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||
let config = toml_edit::Document::new();
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(PageServerConf::parse_and_validate(
|
||||
NodeId(42),
|
||||
&config,
|
||||
&self.dstdir
|
||||
)?));
|
||||
let tsi = TenantShardId {
|
||||
tenant_id: self.tenant_id,
|
||||
shard_number: ShardNumber(0),
|
||||
shard_count: ShardCount(0),
|
||||
};
|
||||
|
||||
let mut layer = ImageLayerWriter::new(
|
||||
&conf,
|
||||
self.timeline_id,
|
||||
tsi,
|
||||
&self.range,
|
||||
self.pgdata_lsn,
|
||||
&ctx,
|
||||
).await?;
|
||||
|
||||
for task in self.tasks {
|
||||
task.doit(&mut layer, &ctx).await?;
|
||||
}
|
||||
|
||||
let layerdesc = layer.finish_raw(&ctx).await?;
|
||||
Ok(layerdesc)
|
||||
}
|
||||
}
|
||||
@@ -12,15 +12,14 @@ use crate::keyspace::{KeySpace, KeySpaceAccum};
|
||||
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use crate::{aux_file, repository::*};
|
||||
use anyhow::{ensure, Context};
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
use enum_map::Enum;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::{
|
||||
dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
|
||||
relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
|
||||
slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
|
||||
AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
|
||||
CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
|
||||
};
|
||||
use pageserver_api::keyspace::SparseKeySpace;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
@@ -37,7 +36,6 @@ use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, info, trace, warn};
|
||||
use utils::bin_ser::DeserializeError;
|
||||
use utils::pausable_failpoint;
|
||||
use utils::vec_map::{VecMap, VecMapOrdering};
|
||||
use utils::{bin_ser::BeSer, lsn::Lsn};
|
||||
|
||||
/// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
|
||||
@@ -174,6 +172,7 @@ impl Timeline {
|
||||
pending_deletions: Vec::new(),
|
||||
pending_nblocks: 0,
|
||||
pending_directory_entries: Vec::new(),
|
||||
pending_bytes: 0,
|
||||
lsn,
|
||||
}
|
||||
}
|
||||
@@ -727,7 +726,17 @@ impl Timeline {
|
||||
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
|
||||
let current_policy = self.last_aux_file_policy.load();
|
||||
match current_policy {
|
||||
Some(AuxFilePolicy::V1) | None => self.list_aux_files_v1(lsn, ctx).await,
|
||||
Some(AuxFilePolicy::V1) => {
|
||||
warn!("this timeline is using deprecated aux file policy V1 (policy=V1)");
|
||||
self.list_aux_files_v1(lsn, ctx).await
|
||||
}
|
||||
None => {
|
||||
let res = self.list_aux_files_v1(lsn, ctx).await?;
|
||||
if !res.is_empty() {
|
||||
warn!("this timeline is using deprecated aux file policy V1 (policy=None)");
|
||||
}
|
||||
Ok(res)
|
||||
}
|
||||
Some(AuxFilePolicy::V2) => self.list_aux_files_v2(lsn, ctx).await,
|
||||
Some(AuxFilePolicy::CrossValidation) => {
|
||||
let v1_result = self.list_aux_files_v1(lsn, ctx).await;
|
||||
@@ -1022,21 +1031,33 @@ pub struct DatadirModification<'a> {
|
||||
// The put-functions add the modifications here, and they are flushed to the
|
||||
// underlying key-value store by the 'finish' function.
|
||||
pending_lsns: Vec<Lsn>,
|
||||
pending_updates: HashMap<Key, Vec<(Lsn, Value)>>,
|
||||
pending_updates: HashMap<Key, Vec<(Lsn, usize, Value)>>,
|
||||
pending_deletions: Vec<(Range<Key>, Lsn)>,
|
||||
pending_nblocks: i64,
|
||||
|
||||
/// For special "directory" keys that store key-value maps, track the size of the map
|
||||
/// if it was updated in this modification.
|
||||
pending_directory_entries: Vec<(DirectoryKind, usize)>,
|
||||
|
||||
/// An **approximation** of how large our EphemeralFile write will be when committed.
|
||||
pending_bytes: usize,
|
||||
}
|
||||
|
||||
impl<'a> DatadirModification<'a> {
|
||||
// When a DatadirModification is committed, we do a monolithic serialization of all its contents. WAL records can
|
||||
// contain multiple pages, so the pageserver's record-based batch size isn't sufficient to bound this allocation: we
|
||||
// additionally specify a limit on how much payload a DatadirModification may contain before it should be committed.
|
||||
pub(crate) const MAX_PENDING_BYTES: usize = 8 * 1024 * 1024;
|
||||
|
||||
/// Get the current lsn
|
||||
pub(crate) fn get_lsn(&self) -> Lsn {
|
||||
self.lsn
|
||||
}
|
||||
|
||||
pub(crate) fn approx_pending_bytes(&self) -> usize {
|
||||
self.pending_bytes
|
||||
}
|
||||
|
||||
/// Set the current lsn
|
||||
pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
|
||||
ensure!(
|
||||
@@ -1576,6 +1597,7 @@ impl<'a> DatadirModification<'a> {
|
||||
if aux_files_key_v1.is_empty() {
|
||||
None
|
||||
} else {
|
||||
warn!("this timeline is using deprecated aux file policy V1");
|
||||
self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?;
|
||||
Some(AuxFilePolicy::V1)
|
||||
}
|
||||
@@ -1769,21 +1791,30 @@ impl<'a> DatadirModification<'a> {
|
||||
// Flush relation and SLRU data blocks, keep metadata.
|
||||
let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
|
||||
for (key, values) in self.pending_updates.drain() {
|
||||
for (lsn, value) in values {
|
||||
if !key.is_valid_key_on_write_path() {
|
||||
bail!(
|
||||
"the request contains data not supported by pageserver at TimelineWriter::put: {}", key
|
||||
);
|
||||
}
|
||||
let mut write_batch = Vec::new();
|
||||
for (lsn, value_ser_size, value) in values {
|
||||
if key.is_rel_block_key() || key.is_slru_block_key() {
|
||||
// This bails out on first error without modifying pending_updates.
|
||||
// That's Ok, cf this function's doc comment.
|
||||
writer.put(key, lsn, &value, ctx).await?;
|
||||
write_batch.push((key.to_compact(), lsn, value_ser_size, value));
|
||||
} else {
|
||||
retained_pending_updates
|
||||
.entry(key)
|
||||
.or_default()
|
||||
.push((lsn, value));
|
||||
retained_pending_updates.entry(key).or_default().push((
|
||||
lsn,
|
||||
value_ser_size,
|
||||
value,
|
||||
));
|
||||
}
|
||||
}
|
||||
writer.put_batch(write_batch, ctx).await?;
|
||||
}
|
||||
|
||||
self.pending_updates = retained_pending_updates;
|
||||
self.pending_bytes = 0;
|
||||
|
||||
if pending_nblocks != 0 {
|
||||
writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
|
||||
@@ -1809,17 +1840,23 @@ impl<'a> DatadirModification<'a> {
|
||||
self.pending_nblocks = 0;
|
||||
|
||||
if !self.pending_updates.is_empty() {
|
||||
// The put_batch call below expects expects the inputs to be sorted by Lsn,
|
||||
// so we do that first.
|
||||
let lsn_ordered_batch: VecMap<Lsn, (Key, Value)> = VecMap::from_iter(
|
||||
self.pending_updates
|
||||
.drain()
|
||||
.map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (lsn, (key, val))))
|
||||
.kmerge_by(|lhs, rhs| lhs.0 < rhs.0),
|
||||
VecMapOrdering::GreaterOrEqual,
|
||||
);
|
||||
// Ordering: the items in this batch do not need to be in any global order, but values for
|
||||
// a particular Key must be in Lsn order relative to one another. InMemoryLayer relies on
|
||||
// this to do efficient updates to its index.
|
||||
let batch: Vec<(CompactKey, Lsn, usize, Value)> = self
|
||||
.pending_updates
|
||||
.drain()
|
||||
.flat_map(|(key, values)| {
|
||||
values.into_iter().map(move |(lsn, val_ser_size, value)| {
|
||||
if !key.is_valid_key_on_write_path() {
|
||||
bail!("the request contains data not supported by pageserver at TimelineWriter::put: {}", key);
|
||||
}
|
||||
Ok((key.to_compact(), lsn, val_ser_size, value))
|
||||
})
|
||||
})
|
||||
.collect::<anyhow::Result<Vec<_>>>()?;
|
||||
|
||||
writer.put_batch(lsn_ordered_batch, ctx).await?;
|
||||
writer.put_batch(batch, ctx).await?;
|
||||
}
|
||||
|
||||
if !self.pending_deletions.is_empty() {
|
||||
@@ -1844,6 +1881,8 @@ impl<'a> DatadirModification<'a> {
|
||||
writer.update_directory_entries_count(kind, count as u64);
|
||||
}
|
||||
|
||||
self.pending_bytes = 0;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1860,7 +1899,7 @@ impl<'a> DatadirModification<'a> {
|
||||
// Note: we don't check pending_deletions. It is an error to request a
|
||||
// value that has been removed, deletion only avoids leaking storage.
|
||||
if let Some(values) = self.pending_updates.get(&key) {
|
||||
if let Some((_, value)) = values.last() {
|
||||
if let Some((_, _, value)) = values.last() {
|
||||
return if let Value::Image(img) = value {
|
||||
Ok(img.clone())
|
||||
} else {
|
||||
@@ -1888,13 +1927,17 @@ impl<'a> DatadirModification<'a> {
|
||||
fn put(&mut self, key: Key, val: Value) {
|
||||
let values = self.pending_updates.entry(key).or_default();
|
||||
// Replace the previous value if it exists at the same lsn
|
||||
if let Some((last_lsn, last_value)) = values.last_mut() {
|
||||
if let Some((last_lsn, last_value_ser_size, last_value)) = values.last_mut() {
|
||||
if *last_lsn == self.lsn {
|
||||
*last_value_ser_size = val.serialized_size().unwrap() as usize;
|
||||
*last_value = val;
|
||||
return;
|
||||
}
|
||||
}
|
||||
values.push((self.lsn, val));
|
||||
|
||||
let val_serialized_size = val.serialized_size().unwrap() as usize;
|
||||
self.pending_bytes += val_serialized_size;
|
||||
values.push((self.lsn, val_serialized_size, val));
|
||||
}
|
||||
|
||||
fn delete(&mut self, key_range: Range<Key>) {
|
||||
@@ -1939,23 +1982,23 @@ impl<'a> Version<'a> {
|
||||
//--- Metadata structs stored in key-value pairs in the repository.
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
struct DbDirectory {
|
||||
pub struct DbDirectory {
|
||||
// (spcnode, dbnode) -> (do relmapper and PG_VERSION files exist)
|
||||
dbdirs: HashMap<(Oid, Oid), bool>,
|
||||
pub dbdirs: HashMap<(Oid, Oid), bool>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
struct TwoPhaseDirectory {
|
||||
xids: HashSet<TransactionId>,
|
||||
pub(crate) struct TwoPhaseDirectory {
|
||||
pub(crate) xids: HashSet<TransactionId>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Default)]
|
||||
struct RelDirectory {
|
||||
pub struct RelDirectory {
|
||||
// Set of relations that exist. (relfilenode, forknum)
|
||||
//
|
||||
// TODO: Store it as a btree or radix tree or something else that spans multiple
|
||||
// key-value pairs, if you have a lot of relations
|
||||
rels: HashSet<(Oid, u8)>,
|
||||
pub rels: HashSet<(Oid, u8)>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Default, PartialEq)]
|
||||
@@ -1979,9 +2022,9 @@ struct RelSizeEntry {
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Default)]
|
||||
struct SlruSegmentDirectory {
|
||||
pub(crate) struct SlruSegmentDirectory {
|
||||
// Set of SLRU segments that exist.
|
||||
segments: HashSet<u32>,
|
||||
pub(crate) segments: HashSet<u32>,
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, PartialEq, Eq, Debug, enum_map::Enum)]
|
||||
@@ -2024,7 +2067,7 @@ mod tests {
|
||||
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
let tline = tline.raw_timeline().unwrap();
|
||||
|
||||
|
||||
@@ -146,6 +146,12 @@ impl FromStr for TokioRuntimeMode {
|
||||
}
|
||||
}
|
||||
|
||||
static TOKIO_THREAD_STACK_SIZE: Lazy<NonZeroUsize> = Lazy::new(|| {
|
||||
env::var("NEON_PAGESERVER_TOKIO_THREAD_STACK_SIZE")
|
||||
// the default 2MiB are insufficent, especially in debug mode
|
||||
.unwrap_or_else(|| NonZeroUsize::new(4 * 1024 * 1024).unwrap())
|
||||
});
|
||||
|
||||
static ONE_RUNTIME: Lazy<Option<tokio::runtime::Runtime>> = Lazy::new(|| {
|
||||
let thread_name = "pageserver-tokio";
|
||||
let Some(mode) = env::var("NEON_PAGESERVER_USE_ONE_RUNTIME") else {
|
||||
@@ -164,6 +170,7 @@ static ONE_RUNTIME: Lazy<Option<tokio::runtime::Runtime>> = Lazy::new(|| {
|
||||
tokio::runtime::Builder::new_current_thread()
|
||||
.thread_name(thread_name)
|
||||
.enable_all()
|
||||
.thread_stack_size(TOKIO_THREAD_STACK_SIZE.get())
|
||||
.build()
|
||||
.expect("failed to create one single runtime")
|
||||
}
|
||||
@@ -173,6 +180,7 @@ static ONE_RUNTIME: Lazy<Option<tokio::runtime::Runtime>> = Lazy::new(|| {
|
||||
.thread_name(thread_name)
|
||||
.enable_all()
|
||||
.worker_threads(num_workers.get())
|
||||
.thread_stack_size(TOKIO_THREAD_STACK_SIZE.get())
|
||||
.build()
|
||||
.expect("failed to create one multi-threaded runtime")
|
||||
}
|
||||
@@ -199,6 +207,7 @@ macro_rules! pageserver_runtime {
|
||||
.thread_name($name)
|
||||
.worker_threads(TOKIO_WORKER_THREADS.get())
|
||||
.enable_all()
|
||||
.thread_stack_size(TOKIO_THREAD_STACK_SIZE.get())
|
||||
.build()
|
||||
.expect(std::concat!("Failed to create runtime ", $name))
|
||||
});
|
||||
@@ -393,7 +402,7 @@ struct PageServerTask {
|
||||
|
||||
/// Tasks may optionally be launched for a particular tenant/timeline, enabling
|
||||
/// later cancelling tasks for that tenant/timeline in [`shutdown_tasks`]
|
||||
tenant_shard_id: Option<TenantShardId>,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: Option<TimelineId>,
|
||||
|
||||
mutable: Mutex<MutableTaskState>,
|
||||
@@ -405,7 +414,7 @@ struct PageServerTask {
|
||||
pub fn spawn<F>(
|
||||
runtime: &tokio::runtime::Handle,
|
||||
kind: TaskKind,
|
||||
tenant_shard_id: Option<TenantShardId>,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: Option<TimelineId>,
|
||||
name: &str,
|
||||
future: F,
|
||||
@@ -550,7 +559,7 @@ pub async fn shutdown_tasks(
|
||||
let tasks = TASKS.lock().unwrap();
|
||||
for task in tasks.values() {
|
||||
if (kind.is_none() || Some(task.kind) == kind)
|
||||
&& (tenant_shard_id.is_none() || task.tenant_shard_id == tenant_shard_id)
|
||||
&& (tenant_shard_id.is_none() || Some(task.tenant_shard_id) == tenant_shard_id)
|
||||
&& (timeline_id.is_none() || task.timeline_id == timeline_id)
|
||||
{
|
||||
task.cancel.cancel();
|
||||
@@ -573,13 +582,8 @@ pub async fn shutdown_tasks(
|
||||
};
|
||||
if let Some(mut join_handle) = join_handle {
|
||||
if log_all {
|
||||
if tenant_shard_id.is_none() {
|
||||
// there are quite few of these
|
||||
info!(name = task.name, kind = ?task_kind, "stopping global task");
|
||||
} else {
|
||||
// warn to catch these in tests; there shouldn't be any
|
||||
warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
|
||||
}
|
||||
// warn to catch these in tests; there shouldn't be any
|
||||
warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
|
||||
}
|
||||
if tokio::time::timeout(std::time::Duration::from_secs(1), &mut join_handle)
|
||||
.await
|
||||
|
||||
@@ -501,6 +501,42 @@ impl Debug for DeleteTimelineError {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error)]
|
||||
pub enum TimelineArchivalError {
|
||||
#[error("NotFound")]
|
||||
NotFound,
|
||||
|
||||
#[error("Timeout")]
|
||||
Timeout,
|
||||
|
||||
#[error("ancestor is archived: {}", .0)]
|
||||
HasArchivedParent(TimelineId),
|
||||
|
||||
#[error("HasUnarchivedChildren")]
|
||||
HasUnarchivedChildren(Vec<TimelineId>),
|
||||
|
||||
#[error("Timeline archival is already in progress")]
|
||||
AlreadyInProgress,
|
||||
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
impl Debug for TimelineArchivalError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
Self::NotFound => write!(f, "NotFound"),
|
||||
Self::Timeout => write!(f, "Timeout"),
|
||||
Self::HasArchivedParent(p) => f.debug_tuple("HasArchivedParent").field(p).finish(),
|
||||
Self::HasUnarchivedChildren(c) => {
|
||||
f.debug_tuple("HasUnarchivedChildren").field(c).finish()
|
||||
}
|
||||
Self::AlreadyInProgress => f.debug_tuple("AlreadyInProgress").finish(),
|
||||
Self::Other(e) => f.debug_tuple("Other").field(e).finish(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub enum SetStoppingError {
|
||||
AlreadyStopping(completion::Barrier),
|
||||
Broken,
|
||||
@@ -798,7 +834,7 @@ impl Tenant {
|
||||
task_mgr::spawn(
|
||||
&tokio::runtime::Handle::current(),
|
||||
TaskKind::Attach,
|
||||
Some(tenant_shard_id),
|
||||
tenant_shard_id,
|
||||
None,
|
||||
"attach tenant",
|
||||
async move {
|
||||
@@ -845,6 +881,12 @@ impl Tenant {
|
||||
});
|
||||
};
|
||||
|
||||
// TODO: should also be rejecting tenant conf changes that violate this check.
|
||||
if let Err(e) = crate::tenant::storage_layer::inmemory_layer::IndexEntry::validate_checkpoint_distance(tenant_clone.get_checkpoint_distance()) {
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let mut init_order = init_order;
|
||||
// take the completion because initial tenant loading will complete when all of
|
||||
// these tasks complete.
|
||||
@@ -1326,24 +1368,59 @@ impl Tenant {
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
state: TimelineArchivalState,
|
||||
) -> anyhow::Result<()> {
|
||||
let timeline = self
|
||||
.get_timeline(timeline_id, false)
|
||||
.context("Cannot apply timeline archival config to inexistent timeline")?;
|
||||
) -> Result<(), TimelineArchivalError> {
|
||||
info!("setting timeline archival config");
|
||||
let timeline = {
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
|
||||
let Some(timeline) = timelines.get(&timeline_id) else {
|
||||
return Err(TimelineArchivalError::NotFound);
|
||||
};
|
||||
|
||||
if state == TimelineArchivalState::Unarchived {
|
||||
if let Some(ancestor_timeline) = timeline.ancestor_timeline() {
|
||||
if ancestor_timeline.is_archived() == Some(true) {
|
||||
return Err(TimelineArchivalError::HasArchivedParent(
|
||||
ancestor_timeline.timeline_id,
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Ensure that there are no non-archived child timelines
|
||||
let children: Vec<TimelineId> = timelines
|
||||
.iter()
|
||||
.filter_map(|(id, entry)| {
|
||||
if entry.get_ancestor_timeline_id() != Some(timeline_id) {
|
||||
return None;
|
||||
}
|
||||
if entry.is_archived() == Some(true) {
|
||||
return None;
|
||||
}
|
||||
Some(*id)
|
||||
})
|
||||
.collect();
|
||||
|
||||
if !children.is_empty() && state == TimelineArchivalState::Archived {
|
||||
return Err(TimelineArchivalError::HasUnarchivedChildren(children));
|
||||
}
|
||||
Arc::clone(timeline)
|
||||
};
|
||||
|
||||
let upload_needed = timeline
|
||||
.remote_client
|
||||
.schedule_index_upload_for_timeline_archival_state(state)?;
|
||||
|
||||
if upload_needed {
|
||||
info!("Uploading new state");
|
||||
const MAX_WAIT: Duration = Duration::from_secs(10);
|
||||
let Ok(v) =
|
||||
tokio::time::timeout(MAX_WAIT, timeline.remote_client.wait_completion()).await
|
||||
else {
|
||||
tracing::warn!("reached timeout for waiting on upload queue");
|
||||
bail!("reached timeout for upload queue flush");
|
||||
return Err(TimelineArchivalError::Timeout);
|
||||
};
|
||||
v?;
|
||||
v.map_err(|e| TimelineArchivalError::Other(anyhow::anyhow!(e)))?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -3741,13 +3818,21 @@ impl Tenant {
|
||||
/// less than this (via eviction and on-demand downloads), but this function enables
|
||||
/// the Tenant to advertise how much storage it would prefer to have to provide fast I/O
|
||||
/// by keeping important things on local disk.
|
||||
///
|
||||
/// This is a heuristic, not a guarantee: tenants that are long-idle will actually use less
|
||||
/// than they report here, due to layer eviction. Tenants with many active branches may
|
||||
/// actually use more than they report here.
|
||||
pub(crate) fn local_storage_wanted(&self) -> u64 {
|
||||
let mut wanted = 0;
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
for timeline in timelines.values() {
|
||||
wanted += timeline.metrics.visible_physical_size_gauge.get();
|
||||
}
|
||||
wanted
|
||||
|
||||
// Heuristic: we use the max() of the timelines' visible sizes, rather than the sum. This
|
||||
// reflects the observation that on tenants with multiple large branches, typically only one
|
||||
// of them is used actively enough to occupy space on disk.
|
||||
timelines
|
||||
.values()
|
||||
.map(|t| t.metrics.visible_physical_size_gauge.get())
|
||||
.max()
|
||||
.unwrap_or(0)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5932,10 +6017,10 @@ mod tests {
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// the default aux file policy to switch is v1 if not set by the admins
|
||||
// the default aux file policy to switch is v2 if not set by the admins
|
||||
assert_eq!(
|
||||
harness.tenant_conf.switch_aux_file_policy,
|
||||
AuxFilePolicy::V1
|
||||
AuxFilePolicy::default_tenant_config()
|
||||
);
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
@@ -5979,8 +6064,8 @@ mod tests {
|
||||
);
|
||||
assert_eq!(
|
||||
tline.last_aux_file_policy.load(),
|
||||
Some(AuxFilePolicy::V1),
|
||||
"aux file is written with switch_aux_file_policy unset (which is v1), so we should keep v1"
|
||||
Some(AuxFilePolicy::V2),
|
||||
"aux file is written with switch_aux_file_policy unset (which is v2), so we should use v2 there"
|
||||
);
|
||||
|
||||
// we can read everything from the storage
|
||||
@@ -6002,8 +6087,8 @@ mod tests {
|
||||
|
||||
assert_eq!(
|
||||
tline.last_aux_file_policy.load(),
|
||||
Some(AuxFilePolicy::V1),
|
||||
"keep v1 storage format when new files are written"
|
||||
Some(AuxFilePolicy::V2),
|
||||
"keep v2 storage format when new files are written"
|
||||
);
|
||||
|
||||
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
|
||||
@@ -6019,7 +6104,7 @@ mod tests {
|
||||
|
||||
// child copies the last flag even if that is not on remote storage yet
|
||||
assert_eq!(child.get_switch_aux_file_policy(), AuxFilePolicy::V2);
|
||||
assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V1));
|
||||
assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V2));
|
||||
|
||||
let files = child.list_aux_files(lsn, &ctx).await.unwrap();
|
||||
assert_eq!(files.get("pg_logical/mappings/test1"), None);
|
||||
@@ -7005,18 +7090,14 @@ mod tests {
|
||||
vec![
|
||||
// Image layer at GC horizon
|
||||
PersistentLayerKey {
|
||||
key_range: {
|
||||
let mut key = Key::MAX;
|
||||
key.field6 -= 1;
|
||||
Key::MIN..key
|
||||
},
|
||||
key_range: Key::MIN..Key::NON_L0_MAX,
|
||||
lsn_range: Lsn(0x30)..Lsn(0x31),
|
||||
is_delta: false
|
||||
},
|
||||
// The delta layer that is cut in the middle
|
||||
// The delta layer covers the full range (with the layer key hack to avoid being recognized as L0)
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(3)..get_key(4),
|
||||
lsn_range: Lsn(0x30)..Lsn(0x41),
|
||||
key_range: Key::MIN..Key::NON_L0_MAX,
|
||||
lsn_range: Lsn(0x30)..Lsn(0x48),
|
||||
is_delta: true
|
||||
},
|
||||
// The delta3 layer that should not be picked for the compaction
|
||||
@@ -7996,6 +8077,214 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_simple_bottom_most_compaction_with_retain_lsns_single_key() -> anyhow::Result<()>
|
||||
{
|
||||
let harness =
|
||||
TenantHarness::create("test_simple_bottom_most_compaction_with_retain_lsns_single_key")
|
||||
.await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
fn get_key(id: u32) -> Key {
|
||||
// using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
|
||||
let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
|
||||
key.field6 = id;
|
||||
key
|
||||
}
|
||||
|
||||
let img_layer = (0..10)
|
||||
.map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
|
||||
.collect_vec();
|
||||
|
||||
let delta1 = vec![
|
||||
(
|
||||
get_key(1),
|
||||
Lsn(0x20),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
|
||||
),
|
||||
(
|
||||
get_key(1),
|
||||
Lsn(0x28),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x28")),
|
||||
),
|
||||
];
|
||||
let delta2 = vec![
|
||||
(
|
||||
get_key(1),
|
||||
Lsn(0x30),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
|
||||
),
|
||||
(
|
||||
get_key(1),
|
||||
Lsn(0x38),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x38")),
|
||||
),
|
||||
];
|
||||
let delta3 = vec![
|
||||
(
|
||||
get_key(8),
|
||||
Lsn(0x48),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
|
||||
),
|
||||
(
|
||||
get_key(9),
|
||||
Lsn(0x48),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
|
||||
),
|
||||
];
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline_with_layers(
|
||||
TIMELINE_ID,
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
vec![
|
||||
// delta1 and delta 2 only contain a single key but multiple updates
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x30), delta1),
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta2),
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x50), delta3),
|
||||
], // delta layers
|
||||
vec![(Lsn(0x10), img_layer)], // image layers
|
||||
Lsn(0x50),
|
||||
)
|
||||
.await?;
|
||||
{
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
*guard = GcInfo {
|
||||
retain_lsns: vec![
|
||||
(Lsn(0x10), tline.timeline_id),
|
||||
(Lsn(0x20), tline.timeline_id),
|
||||
],
|
||||
cutoffs: GcCutoffs {
|
||||
time: Lsn(0x30),
|
||||
space: Lsn(0x30),
|
||||
},
|
||||
leases: Default::default(),
|
||||
within_ancestor_pitr: false,
|
||||
};
|
||||
}
|
||||
|
||||
let expected_result = [
|
||||
Bytes::from_static(b"value 0@0x10"),
|
||||
Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30@0x38"),
|
||||
Bytes::from_static(b"value 2@0x10"),
|
||||
Bytes::from_static(b"value 3@0x10"),
|
||||
Bytes::from_static(b"value 4@0x10"),
|
||||
Bytes::from_static(b"value 5@0x10"),
|
||||
Bytes::from_static(b"value 6@0x10"),
|
||||
Bytes::from_static(b"value 7@0x10"),
|
||||
Bytes::from_static(b"value 8@0x10@0x48"),
|
||||
Bytes::from_static(b"value 9@0x10@0x48"),
|
||||
];
|
||||
|
||||
let expected_result_at_gc_horizon = [
|
||||
Bytes::from_static(b"value 0@0x10"),
|
||||
Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30"),
|
||||
Bytes::from_static(b"value 2@0x10"),
|
||||
Bytes::from_static(b"value 3@0x10"),
|
||||
Bytes::from_static(b"value 4@0x10"),
|
||||
Bytes::from_static(b"value 5@0x10"),
|
||||
Bytes::from_static(b"value 6@0x10"),
|
||||
Bytes::from_static(b"value 7@0x10"),
|
||||
Bytes::from_static(b"value 8@0x10"),
|
||||
Bytes::from_static(b"value 9@0x10"),
|
||||
];
|
||||
|
||||
let expected_result_at_lsn_20 = [
|
||||
Bytes::from_static(b"value 0@0x10"),
|
||||
Bytes::from_static(b"value 1@0x10@0x20"),
|
||||
Bytes::from_static(b"value 2@0x10"),
|
||||
Bytes::from_static(b"value 3@0x10"),
|
||||
Bytes::from_static(b"value 4@0x10"),
|
||||
Bytes::from_static(b"value 5@0x10"),
|
||||
Bytes::from_static(b"value 6@0x10"),
|
||||
Bytes::from_static(b"value 7@0x10"),
|
||||
Bytes::from_static(b"value 8@0x10"),
|
||||
Bytes::from_static(b"value 9@0x10"),
|
||||
];
|
||||
|
||||
let expected_result_at_lsn_10 = [
|
||||
Bytes::from_static(b"value 0@0x10"),
|
||||
Bytes::from_static(b"value 1@0x10"),
|
||||
Bytes::from_static(b"value 2@0x10"),
|
||||
Bytes::from_static(b"value 3@0x10"),
|
||||
Bytes::from_static(b"value 4@0x10"),
|
||||
Bytes::from_static(b"value 5@0x10"),
|
||||
Bytes::from_static(b"value 6@0x10"),
|
||||
Bytes::from_static(b"value 7@0x10"),
|
||||
Bytes::from_static(b"value 8@0x10"),
|
||||
Bytes::from_static(b"value 9@0x10"),
|
||||
];
|
||||
|
||||
let verify_result = || async {
|
||||
let gc_horizon = {
|
||||
let gc_info = tline.gc_info.read().unwrap();
|
||||
gc_info.cutoffs.time
|
||||
};
|
||||
for idx in 0..10 {
|
||||
assert_eq!(
|
||||
tline
|
||||
.get(get_key(idx as u32), Lsn(0x50), &ctx)
|
||||
.await
|
||||
.unwrap(),
|
||||
&expected_result[idx]
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get(get_key(idx as u32), gc_horizon, &ctx)
|
||||
.await
|
||||
.unwrap(),
|
||||
&expected_result_at_gc_horizon[idx]
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get(get_key(idx as u32), Lsn(0x20), &ctx)
|
||||
.await
|
||||
.unwrap(),
|
||||
&expected_result_at_lsn_20[idx]
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get(get_key(idx as u32), Lsn(0x10), &ctx)
|
||||
.await
|
||||
.unwrap(),
|
||||
&expected_result_at_lsn_10[idx]
|
||||
);
|
||||
}
|
||||
};
|
||||
|
||||
verify_result().await;
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
let mut dryrun_flags = EnumSet::new();
|
||||
dryrun_flags.insert(CompactFlags::DryRun);
|
||||
|
||||
tline
|
||||
.compact_with_gc(&cancel, dryrun_flags, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
// We expect layer map to be the same b/c the dry run flag, but we don't know whether there will be other background jobs
|
||||
// cleaning things up, and therefore, we don't do sanity checks on the layer map during unit tests.
|
||||
verify_result().await;
|
||||
|
||||
tline
|
||||
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
verify_result().await;
|
||||
|
||||
// compact again
|
||||
tline
|
||||
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
verify_result().await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?;
|
||||
|
||||
@@ -148,7 +148,7 @@ pub(super) const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0;
|
||||
|
||||
/// The maximum size of blobs we support. The highest few bits
|
||||
/// are reserved for compression and other further uses.
|
||||
const MAX_SUPPORTED_LEN: usize = 0x0fff_ffff;
|
||||
pub(crate) const MAX_SUPPORTED_BLOB_LEN: usize = 0x0fff_ffff;
|
||||
|
||||
pub(super) const BYTE_UNCOMPRESSED: u8 = 0x80;
|
||||
pub(super) const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;
|
||||
@@ -326,7 +326,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
(self.write_all(io_buf.slice_len(), ctx).await, srcbuf)
|
||||
} else {
|
||||
// Write a 4-byte length header
|
||||
if len > MAX_SUPPORTED_LEN {
|
||||
if len > MAX_SUPPORTED_BLOB_LEN {
|
||||
return (
|
||||
(
|
||||
io_buf.slice_len(),
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
//! Low-level Block-oriented I/O functions
|
||||
//!
|
||||
|
||||
use super::ephemeral_file::EphemeralFile;
|
||||
use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
|
||||
use crate::context::RequestContext;
|
||||
use crate::page_cache::{self, FileId, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ};
|
||||
@@ -81,9 +80,7 @@ impl<'a> Deref for BlockLease<'a> {
|
||||
/// Unlike traits, we also support the read function to be async though.
|
||||
pub(crate) enum BlockReaderRef<'a> {
|
||||
FileBlockReader(&'a FileBlockReader<'a>),
|
||||
EphemeralFile(&'a EphemeralFile),
|
||||
Adapter(Adapter<&'a DeltaLayerInner>),
|
||||
Slice(&'a [u8]),
|
||||
#[cfg(test)]
|
||||
TestDisk(&'a super::disk_btree::tests::TestDisk),
|
||||
#[cfg(test)]
|
||||
@@ -100,9 +97,7 @@ impl<'a> BlockReaderRef<'a> {
|
||||
use BlockReaderRef::*;
|
||||
match self {
|
||||
FileBlockReader(r) => r.read_blk(blknum, ctx).await,
|
||||
EphemeralFile(r) => r.read_blk(blknum, ctx).await,
|
||||
Adapter(r) => r.read_blk(blknum, ctx).await,
|
||||
Slice(s) => Self::read_blk_slice(s, blknum),
|
||||
#[cfg(test)]
|
||||
TestDisk(r) => r.read_blk(blknum),
|
||||
#[cfg(test)]
|
||||
@@ -111,24 +106,6 @@ impl<'a> BlockReaderRef<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> BlockReaderRef<'a> {
|
||||
fn read_blk_slice(slice: &[u8], blknum: u32) -> std::io::Result<BlockLease> {
|
||||
let start = (blknum as usize).checked_mul(PAGE_SZ).unwrap();
|
||||
let end = start.checked_add(PAGE_SZ).unwrap();
|
||||
if end > slice.len() {
|
||||
return Err(std::io::Error::new(
|
||||
std::io::ErrorKind::UnexpectedEof,
|
||||
format!("slice too short, len={} end={}", slice.len(), end),
|
||||
));
|
||||
}
|
||||
let slice = &slice[start..end];
|
||||
let page_sized: &[u8; PAGE_SZ] = slice
|
||||
.try_into()
|
||||
.expect("we add PAGE_SZ to start, so the slice must have PAGE_SZ");
|
||||
Ok(BlockLease::Slice(page_sized))
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// A "cursor" for efficiently reading multiple pages from a BlockReader
|
||||
///
|
||||
|
||||
@@ -1,13 +1,21 @@
|
||||
//! Implementation of append-only file data structure
|
||||
//! used to keep in-memory layers spilled on disk.
|
||||
|
||||
use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64};
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::page_cache;
|
||||
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File;
|
||||
use crate::virtual_file::owned_buffers_io::slice::SliceMutExt;
|
||||
use crate::virtual_file::owned_buffers_io::util::size_tracking_writer;
|
||||
use crate::virtual_file::owned_buffers_io::write::Buffer;
|
||||
use crate::virtual_file::{self, owned_buffers_io, VirtualFile};
|
||||
use bytes::BytesMut;
|
||||
use camino::Utf8PathBuf;
|
||||
use num_traits::Num;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use tokio_epoll_uring::{BoundedBuf, Slice};
|
||||
use tracing::error;
|
||||
|
||||
use std::io;
|
||||
use std::sync::atomic::AtomicU64;
|
||||
@@ -16,13 +24,17 @@ use utils::id::TimelineId;
|
||||
pub struct EphemeralFile {
|
||||
_tenant_shard_id: TenantShardId,
|
||||
_timeline_id: TimelineId,
|
||||
|
||||
rw: page_caching::RW,
|
||||
page_cache_file_id: page_cache::FileId,
|
||||
bytes_written: u64,
|
||||
buffered_writer: owned_buffers_io::write::BufferedWriter<
|
||||
BytesMut,
|
||||
size_tracking_writer::Writer<VirtualFile>,
|
||||
>,
|
||||
/// Gate guard is held on as long as we need to do operations in the path (delete on drop)
|
||||
_gate_guard: utils::sync::gate::GateGuard,
|
||||
}
|
||||
|
||||
mod page_caching;
|
||||
pub(crate) use page_caching::PrewarmOnWrite as PrewarmPageCacheOnWrite;
|
||||
mod zero_padded_read_write;
|
||||
const TAIL_SZ: usize = 64 * 1024;
|
||||
|
||||
impl EphemeralFile {
|
||||
pub async fn create(
|
||||
@@ -52,62 +64,178 @@ impl EphemeralFile {
|
||||
)
|
||||
.await?;
|
||||
|
||||
let prewarm = conf.l0_flush.prewarm_on_write();
|
||||
let page_cache_file_id = page_cache::next_file_id(); // XXX get rid, we're not page-caching anymore
|
||||
|
||||
Ok(EphemeralFile {
|
||||
_tenant_shard_id: tenant_shard_id,
|
||||
_timeline_id: timeline_id,
|
||||
rw: page_caching::RW::new(file, prewarm, gate_guard),
|
||||
page_cache_file_id,
|
||||
bytes_written: 0,
|
||||
buffered_writer: owned_buffers_io::write::BufferedWriter::new(
|
||||
size_tracking_writer::Writer::new(file),
|
||||
BytesMut::with_capacity(TAIL_SZ),
|
||||
),
|
||||
_gate_guard: gate_guard,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for EphemeralFile {
|
||||
fn drop(&mut self) {
|
||||
// unlink the file
|
||||
// we are clear to do this, because we have entered a gate
|
||||
let path = &self.buffered_writer.as_inner().as_inner().path;
|
||||
let res = std::fs::remove_file(path);
|
||||
if let Err(e) = res {
|
||||
if e.kind() != std::io::ErrorKind::NotFound {
|
||||
// just never log the not found errors, we cannot do anything for them; on detach
|
||||
// the tenant directory is already gone.
|
||||
//
|
||||
// not found files might also be related to https://github.com/neondatabase/neon/issues/2442
|
||||
error!("could not remove ephemeral file '{path}': {e}");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl EphemeralFile {
|
||||
pub(crate) fn len(&self) -> u64 {
|
||||
self.rw.bytes_written()
|
||||
self.bytes_written
|
||||
}
|
||||
|
||||
pub(crate) fn page_cache_file_id(&self) -> page_cache::FileId {
|
||||
self.rw.page_cache_file_id()
|
||||
self.page_cache_file_id
|
||||
}
|
||||
|
||||
/// See [`self::page_caching::RW::load_to_vec`].
|
||||
pub(crate) async fn load_to_vec(&self, ctx: &RequestContext) -> Result<Vec<u8>, io::Error> {
|
||||
self.rw.load_to_vec(ctx).await
|
||||
let size = self.len().into_usize();
|
||||
let vec = Vec::with_capacity(size);
|
||||
let (slice, nread) = self.read_exact_at_eof_ok(0, vec.slice_full(), ctx).await?;
|
||||
assert_eq!(nread, size);
|
||||
let vec = slice.into_inner();
|
||||
assert_eq!(vec.len(), nread);
|
||||
assert_eq!(vec.capacity(), size, "we shouldn't be reallocating");
|
||||
Ok(vec)
|
||||
}
|
||||
|
||||
pub(crate) async fn read_blk(
|
||||
&self,
|
||||
blknum: u32,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BlockLease, io::Error> {
|
||||
self.rw.read_blk(blknum, ctx).await
|
||||
}
|
||||
|
||||
pub(crate) async fn write_blob(
|
||||
/// Returns the offset at which the first byte of the input was written, for use
|
||||
/// in constructing indices over the written value.
|
||||
///
|
||||
/// Panics if the write is short because there's no way we can recover from that.
|
||||
/// TODO: make upstack handle this as an error.
|
||||
pub(crate) async fn write_raw(
|
||||
&mut self,
|
||||
srcbuf: &[u8],
|
||||
ctx: &RequestContext,
|
||||
) -> Result<u64, io::Error> {
|
||||
let pos = self.rw.bytes_written();
|
||||
) -> std::io::Result<u64> {
|
||||
let pos = self.bytes_written;
|
||||
|
||||
// Write the length field
|
||||
if srcbuf.len() < 0x80 {
|
||||
// short one-byte length header
|
||||
let len_buf = [srcbuf.len() as u8];
|
||||
|
||||
self.rw.write_all_borrowed(&len_buf, ctx).await?;
|
||||
} else {
|
||||
let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
|
||||
len_buf[0] |= 0x80;
|
||||
self.rw.write_all_borrowed(&len_buf, ctx).await?;
|
||||
}
|
||||
let new_bytes_written = pos.checked_add(srcbuf.len().into_u64()).ok_or_else(|| {
|
||||
std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
format!(
|
||||
"write would grow EphemeralFile beyond u64::MAX: len={pos} writen={srcbuf_len}",
|
||||
srcbuf_len = srcbuf.len(),
|
||||
),
|
||||
)
|
||||
})?;
|
||||
|
||||
// Write the payload
|
||||
self.rw.write_all_borrowed(srcbuf, ctx).await?;
|
||||
let nwritten = self
|
||||
.buffered_writer
|
||||
.write_buffered_borrowed(srcbuf, ctx)
|
||||
.await?;
|
||||
assert_eq!(
|
||||
nwritten,
|
||||
srcbuf.len(),
|
||||
"buffered writer has no short writes"
|
||||
);
|
||||
|
||||
self.bytes_written = new_bytes_written;
|
||||
|
||||
Ok(pos)
|
||||
}
|
||||
}
|
||||
|
||||
impl super::storage_layer::inmemory_layer::vectored_dio_read::File for EphemeralFile {
|
||||
async fn read_exact_at_eof_ok<'a, 'b, B: tokio_epoll_uring::IoBufMut + Send>(
|
||||
&'b self,
|
||||
start: u64,
|
||||
dst: tokio_epoll_uring::Slice<B>,
|
||||
ctx: &'a RequestContext,
|
||||
) -> std::io::Result<(tokio_epoll_uring::Slice<B>, usize)> {
|
||||
let file_size_tracking_writer = self.buffered_writer.as_inner();
|
||||
let flushed_offset = file_size_tracking_writer.bytes_written();
|
||||
|
||||
let buffer = self.buffered_writer.inspect_buffer();
|
||||
let buffered = &buffer[0..buffer.pending()];
|
||||
|
||||
let dst_cap = dst.bytes_total().into_u64();
|
||||
let end = {
|
||||
// saturating_add is correct here because the max file size is u64::MAX, so,
|
||||
// if start + dst.len() > u64::MAX, then we know it will be a short read
|
||||
let mut end: u64 = start.saturating_add(dst_cap);
|
||||
if end > self.bytes_written {
|
||||
end = self.bytes_written;
|
||||
}
|
||||
end
|
||||
};
|
||||
|
||||
// inclusive, exclusive
|
||||
#[derive(Debug)]
|
||||
struct Range<N>(N, N);
|
||||
impl<N: Num + Clone + Copy + PartialOrd + Ord> Range<N> {
|
||||
fn len(&self) -> N {
|
||||
if self.0 > self.1 {
|
||||
N::zero()
|
||||
} else {
|
||||
self.1 - self.0
|
||||
}
|
||||
}
|
||||
}
|
||||
let written_range = Range(start, std::cmp::min(end, flushed_offset));
|
||||
let buffered_range = Range(std::cmp::max(start, flushed_offset), end);
|
||||
|
||||
let dst = if written_range.len() > 0 {
|
||||
let file: &VirtualFile = file_size_tracking_writer.as_inner();
|
||||
let bounds = dst.bounds();
|
||||
let slice = file
|
||||
.read_exact_at(dst.slice(0..written_range.len().into_usize()), start, ctx)
|
||||
.await?;
|
||||
Slice::from_buf_bounds(Slice::into_inner(slice), bounds)
|
||||
} else {
|
||||
dst
|
||||
};
|
||||
|
||||
let dst = if buffered_range.len() > 0 {
|
||||
let offset_in_buffer = buffered_range
|
||||
.0
|
||||
.checked_sub(flushed_offset)
|
||||
.unwrap()
|
||||
.into_usize();
|
||||
let to_copy =
|
||||
&buffered[offset_in_buffer..(offset_in_buffer + buffered_range.len().into_usize())];
|
||||
let bounds = dst.bounds();
|
||||
let mut view = dst.slice({
|
||||
let start = written_range.len().into_usize();
|
||||
let end = start
|
||||
.checked_add(buffered_range.len().into_usize())
|
||||
.unwrap();
|
||||
start..end
|
||||
});
|
||||
view.as_mut_rust_slice_full_zeroed()
|
||||
.copy_from_slice(to_copy);
|
||||
Slice::from_buf_bounds(Slice::into_inner(view), bounds)
|
||||
} else {
|
||||
dst
|
||||
};
|
||||
|
||||
// TODO: in debug mode, randomize the remaining bytes in `dst` to catch bugs
|
||||
|
||||
Ok((dst, (end - start).into_usize()))
|
||||
}
|
||||
}
|
||||
|
||||
/// Does the given filename look like an ephemeral file?
|
||||
pub fn is_ephemeral_file(filename: &str) -> bool {
|
||||
if let Some(rest) = filename.strip_prefix("ephemeral-") {
|
||||
@@ -117,19 +245,13 @@ pub fn is_ephemeral_file(filename: &str) -> bool {
|
||||
}
|
||||
}
|
||||
|
||||
impl BlockReader for EphemeralFile {
|
||||
fn block_cursor(&self) -> super::block_io::BlockCursor<'_> {
|
||||
BlockCursor::new(super::block_io::BlockReaderRef::EphemeralFile(self))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use rand::Rng;
|
||||
|
||||
use super::*;
|
||||
use crate::context::DownloadBehavior;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::block_io::BlockReaderRef;
|
||||
use rand::{thread_rng, RngCore};
|
||||
use std::fs;
|
||||
use std::str::FromStr;
|
||||
|
||||
@@ -160,69 +282,6 @@ mod tests {
|
||||
Ok((conf, tenant_shard_id, timeline_id, ctx))
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_ephemeral_blobs() -> Result<(), io::Error> {
|
||||
let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?;
|
||||
|
||||
let gate = utils::sync::gate::Gate::default();
|
||||
|
||||
let entered = gate.enter().unwrap();
|
||||
|
||||
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, entered, &ctx).await?;
|
||||
|
||||
let pos_foo = file.write_blob(b"foo", &ctx).await?;
|
||||
assert_eq!(
|
||||
b"foo",
|
||||
file.block_cursor()
|
||||
.read_blob(pos_foo, &ctx)
|
||||
.await?
|
||||
.as_slice()
|
||||
);
|
||||
let pos_bar = file.write_blob(b"bar", &ctx).await?;
|
||||
assert_eq!(
|
||||
b"foo",
|
||||
file.block_cursor()
|
||||
.read_blob(pos_foo, &ctx)
|
||||
.await?
|
||||
.as_slice()
|
||||
);
|
||||
assert_eq!(
|
||||
b"bar",
|
||||
file.block_cursor()
|
||||
.read_blob(pos_bar, &ctx)
|
||||
.await?
|
||||
.as_slice()
|
||||
);
|
||||
|
||||
let mut blobs = Vec::new();
|
||||
for i in 0..10000 {
|
||||
let data = Vec::from(format!("blob{}", i).as_bytes());
|
||||
let pos = file.write_blob(&data, &ctx).await?;
|
||||
blobs.push((pos, data));
|
||||
}
|
||||
// also test with a large blobs
|
||||
for i in 0..100 {
|
||||
let data = format!("blob{}", i).as_bytes().repeat(100);
|
||||
let pos = file.write_blob(&data, &ctx).await?;
|
||||
blobs.push((pos, data));
|
||||
}
|
||||
|
||||
let cursor = BlockCursor::new(BlockReaderRef::EphemeralFile(&file));
|
||||
for (pos, expected) in blobs {
|
||||
let actual = cursor.read_blob(pos, &ctx).await?;
|
||||
assert_eq!(actual, expected);
|
||||
}
|
||||
|
||||
// Test a large blob that spans multiple pages
|
||||
let mut large_data = vec![0; 20000];
|
||||
thread_rng().fill_bytes(&mut large_data);
|
||||
let pos_large = file.write_blob(&large_data, &ctx).await?;
|
||||
let result = file.block_cursor().read_blob(pos_large, &ctx).await?;
|
||||
assert_eq!(result, large_data);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn ephemeral_file_holds_gate_open() {
|
||||
const FOREVER: std::time::Duration = std::time::Duration::from_secs(5);
|
||||
@@ -256,4 +315,151 @@ mod tests {
|
||||
.expect("closing completes right away")
|
||||
.expect("closing does not panic");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_ephemeral_file_basics() {
|
||||
let (conf, tenant_id, timeline_id, ctx) = harness("test_ephemeral_file_basics").unwrap();
|
||||
|
||||
let gate = utils::sync::gate::Gate::default();
|
||||
|
||||
let mut file =
|
||||
EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let cap = file.buffered_writer.inspect_buffer().capacity();
|
||||
|
||||
let write_nbytes = cap + cap / 2;
|
||||
|
||||
let content: Vec<u8> = rand::thread_rng()
|
||||
.sample_iter(rand::distributions::Standard)
|
||||
.take(write_nbytes)
|
||||
.collect();
|
||||
|
||||
let mut value_offsets = Vec::new();
|
||||
for i in 0..write_nbytes {
|
||||
let off = file.write_raw(&content[i..i + 1], &ctx).await.unwrap();
|
||||
value_offsets.push(off);
|
||||
}
|
||||
|
||||
assert!(file.len() as usize == write_nbytes);
|
||||
for i in 0..write_nbytes {
|
||||
assert_eq!(value_offsets[i], i.into_u64());
|
||||
let buf = Vec::with_capacity(1);
|
||||
let (buf_slice, nread) = file
|
||||
.read_exact_at_eof_ok(i.into_u64(), buf.slice_full(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let buf = buf_slice.into_inner();
|
||||
assert_eq!(nread, 1);
|
||||
assert_eq!(&buf, &content[i..i + 1]);
|
||||
}
|
||||
|
||||
let file_contents =
|
||||
std::fs::read(&file.buffered_writer.as_inner().as_inner().path).unwrap();
|
||||
assert_eq!(file_contents, &content[0..cap]);
|
||||
|
||||
let buffer_contents = file.buffered_writer.inspect_buffer();
|
||||
assert_eq!(buffer_contents, &content[cap..write_nbytes]);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_flushes_do_happen() {
|
||||
let (conf, tenant_id, timeline_id, ctx) = harness("test_flushes_do_happen").unwrap();
|
||||
|
||||
let gate = utils::sync::gate::Gate::default();
|
||||
|
||||
let mut file =
|
||||
EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let cap = file.buffered_writer.inspect_buffer().capacity();
|
||||
|
||||
let content: Vec<u8> = rand::thread_rng()
|
||||
.sample_iter(rand::distributions::Standard)
|
||||
.take(cap + cap / 2)
|
||||
.collect();
|
||||
|
||||
file.write_raw(&content, &ctx).await.unwrap();
|
||||
|
||||
// assert the state is as this test expects it to be
|
||||
assert_eq!(
|
||||
&file.load_to_vec(&ctx).await.unwrap(),
|
||||
&content[0..cap + cap / 2]
|
||||
);
|
||||
let md = file
|
||||
.buffered_writer
|
||||
.as_inner()
|
||||
.as_inner()
|
||||
.path
|
||||
.metadata()
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
md.len(),
|
||||
cap.into_u64(),
|
||||
"buffered writer does one write if we write 1.5x buffer capacity"
|
||||
);
|
||||
assert_eq!(
|
||||
&file.buffered_writer.inspect_buffer()[0..cap / 2],
|
||||
&content[cap..cap + cap / 2]
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_read_split_across_file_and_buffer() {
|
||||
// This test exercises the logic on the read path that splits the logical read
|
||||
// into a read from the flushed part (= the file) and a copy from the buffered writer's buffer.
|
||||
//
|
||||
// This test build on the assertions in test_flushes_do_happen
|
||||
|
||||
let (conf, tenant_id, timeline_id, ctx) =
|
||||
harness("test_read_split_across_file_and_buffer").unwrap();
|
||||
|
||||
let gate = utils::sync::gate::Gate::default();
|
||||
|
||||
let mut file =
|
||||
EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let cap = file.buffered_writer.inspect_buffer().capacity();
|
||||
|
||||
let content: Vec<u8> = rand::thread_rng()
|
||||
.sample_iter(rand::distributions::Standard)
|
||||
.take(cap + cap / 2)
|
||||
.collect();
|
||||
|
||||
file.write_raw(&content, &ctx).await.unwrap();
|
||||
|
||||
let test_read = |start: usize, len: usize| {
|
||||
let file = &file;
|
||||
let ctx = &ctx;
|
||||
let content = &content;
|
||||
async move {
|
||||
let (buf, nread) = file
|
||||
.read_exact_at_eof_ok(
|
||||
start.into_u64(),
|
||||
Vec::with_capacity(len).slice_full(),
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(nread, len);
|
||||
assert_eq!(&buf.into_inner(), &content[start..(start + len)]);
|
||||
}
|
||||
};
|
||||
|
||||
// completely within the file range
|
||||
assert!(20 < cap, "test assumption");
|
||||
test_read(10, 10).await;
|
||||
// border onto edge of file
|
||||
test_read(cap - 10, 10).await;
|
||||
// read across file and buffer
|
||||
test_read(cap - 10, 20).await;
|
||||
// stay from start of buffer
|
||||
test_read(cap, 10).await;
|
||||
// completely within buffer
|
||||
test_read(cap + 10, 10).await;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,290 +0,0 @@
|
||||
//! Wrapper around [`super::zero_padded_read_write::RW`] that uses the
|
||||
//! [`crate::page_cache`] to serve reads that need to go to the underlying [`VirtualFile`].
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use crate::page_cache::{self, PAGE_SZ};
|
||||
use crate::tenant::block_io::BlockLease;
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::FullSlice;
|
||||
use crate::virtual_file::VirtualFile;
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
use std::io::{self, ErrorKind};
|
||||
use std::ops::{Deref, Range};
|
||||
use tokio_epoll_uring::BoundedBuf;
|
||||
use tracing::*;
|
||||
|
||||
use super::zero_padded_read_write;
|
||||
|
||||
/// See module-level comment.
|
||||
pub struct RW {
|
||||
page_cache_file_id: page_cache::FileId,
|
||||
rw: super::zero_padded_read_write::RW<PreWarmingWriter>,
|
||||
/// Gate guard is held on as long as we need to do operations in the path (delete on drop).
|
||||
_gate_guard: utils::sync::gate::GateGuard,
|
||||
}
|
||||
|
||||
/// When we flush a block to the underlying [`crate::virtual_file::VirtualFile`],
|
||||
/// should we pre-warm the [`crate::page_cache`] with the contents?
|
||||
#[derive(Clone, Copy)]
|
||||
pub enum PrewarmOnWrite {
|
||||
Yes,
|
||||
No,
|
||||
}
|
||||
|
||||
impl RW {
|
||||
pub fn new(
|
||||
file: VirtualFile,
|
||||
prewarm_on_write: PrewarmOnWrite,
|
||||
_gate_guard: utils::sync::gate::GateGuard,
|
||||
) -> Self {
|
||||
let page_cache_file_id = page_cache::next_file_id();
|
||||
Self {
|
||||
page_cache_file_id,
|
||||
rw: super::zero_padded_read_write::RW::new(PreWarmingWriter::new(
|
||||
page_cache_file_id,
|
||||
file,
|
||||
prewarm_on_write,
|
||||
)),
|
||||
_gate_guard,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn page_cache_file_id(&self) -> page_cache::FileId {
|
||||
self.page_cache_file_id
|
||||
}
|
||||
|
||||
pub(crate) async fn write_all_borrowed(
|
||||
&mut self,
|
||||
srcbuf: &[u8],
|
||||
ctx: &RequestContext,
|
||||
) -> Result<usize, io::Error> {
|
||||
// It doesn't make sense to proactively fill the page cache on the Pageserver write path
|
||||
// because Compute is unlikely to access recently written data.
|
||||
self.rw.write_all_borrowed(srcbuf, ctx).await
|
||||
}
|
||||
|
||||
pub(crate) fn bytes_written(&self) -> u64 {
|
||||
self.rw.bytes_written()
|
||||
}
|
||||
|
||||
/// Load all blocks that can be read via [`Self::read_blk`] into a contiguous memory buffer.
|
||||
///
|
||||
/// This includes the blocks that aren't yet flushed to disk by the internal buffered writer.
|
||||
/// The last block is zero-padded to [`PAGE_SZ`], so, the returned buffer is always a multiple of [`PAGE_SZ`].
|
||||
pub(super) async fn load_to_vec(&self, ctx: &RequestContext) -> Result<Vec<u8>, io::Error> {
|
||||
// round up to the next PAGE_SZ multiple, required by blob_io
|
||||
let size = {
|
||||
let s = usize::try_from(self.bytes_written()).unwrap();
|
||||
if s % PAGE_SZ == 0 {
|
||||
s
|
||||
} else {
|
||||
s.checked_add(PAGE_SZ - (s % PAGE_SZ)).unwrap()
|
||||
}
|
||||
};
|
||||
let vec = Vec::with_capacity(size);
|
||||
|
||||
// read from disk what we've already flushed
|
||||
let writer = self.rw.as_writer();
|
||||
let flushed_range = writer.written_range();
|
||||
let mut vec = writer
|
||||
.file
|
||||
.read_exact_at(
|
||||
vec.slice(0..(flushed_range.end - flushed_range.start)),
|
||||
u64::try_from(flushed_range.start).unwrap(),
|
||||
ctx,
|
||||
)
|
||||
.await?
|
||||
.into_inner();
|
||||
|
||||
// copy from in-memory buffer what we haven't flushed yet but would return when accessed via read_blk
|
||||
let buffered = self.rw.get_tail_zero_padded();
|
||||
vec.extend_from_slice(buffered);
|
||||
assert_eq!(vec.len(), size);
|
||||
assert_eq!(vec.len() % PAGE_SZ, 0);
|
||||
Ok(vec)
|
||||
}
|
||||
|
||||
pub(crate) async fn read_blk(
|
||||
&self,
|
||||
blknum: u32,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BlockLease, io::Error> {
|
||||
match self.rw.read_blk(blknum).await? {
|
||||
zero_padded_read_write::ReadResult::NeedsReadFromWriter { writer } => {
|
||||
let cache = page_cache::get();
|
||||
match cache
|
||||
.read_immutable_buf(self.page_cache_file_id, blknum, ctx)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
// order path before error because error is anyhow::Error => might have many contexts
|
||||
format!(
|
||||
"ephemeral file: read immutable page #{}: {}: {:#}",
|
||||
blknum,
|
||||
self.rw.as_writer().file.path,
|
||||
e,
|
||||
),
|
||||
)
|
||||
})? {
|
||||
page_cache::ReadBufResult::Found(guard) => {
|
||||
return Ok(BlockLease::PageReadGuard(guard))
|
||||
}
|
||||
page_cache::ReadBufResult::NotFound(write_guard) => {
|
||||
let write_guard = writer
|
||||
.file
|
||||
.read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64, ctx)
|
||||
.await?;
|
||||
let read_guard = write_guard.mark_valid();
|
||||
return Ok(BlockLease::PageReadGuard(read_guard));
|
||||
}
|
||||
}
|
||||
}
|
||||
zero_padded_read_write::ReadResult::ServedFromZeroPaddedMutableTail { buffer } => {
|
||||
Ok(BlockLease::EphemeralFileMutableTail(buffer))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for RW {
|
||||
fn drop(&mut self) {
|
||||
// There might still be pages in the [`crate::page_cache`] for this file.
|
||||
// We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed.
|
||||
|
||||
// unlink the file
|
||||
// we are clear to do this, because we have entered a gate
|
||||
let res = std::fs::remove_file(&self.rw.as_writer().file.path);
|
||||
if let Err(e) = res {
|
||||
if e.kind() != std::io::ErrorKind::NotFound {
|
||||
// just never log the not found errors, we cannot do anything for them; on detach
|
||||
// the tenant directory is already gone.
|
||||
//
|
||||
// not found files might also be related to https://github.com/neondatabase/neon/issues/2442
|
||||
error!(
|
||||
"could not remove ephemeral file '{}': {}",
|
||||
self.rw.as_writer().file.path,
|
||||
e
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct PreWarmingWriter {
|
||||
prewarm_on_write: PrewarmOnWrite,
|
||||
nwritten_blocks: u32,
|
||||
page_cache_file_id: page_cache::FileId,
|
||||
file: VirtualFile,
|
||||
}
|
||||
|
||||
impl PreWarmingWriter {
|
||||
fn new(
|
||||
page_cache_file_id: page_cache::FileId,
|
||||
file: VirtualFile,
|
||||
prewarm_on_write: PrewarmOnWrite,
|
||||
) -> Self {
|
||||
Self {
|
||||
prewarm_on_write,
|
||||
nwritten_blocks: 0,
|
||||
page_cache_file_id,
|
||||
file,
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the byte range within `file` that has been written though `write_all`.
|
||||
///
|
||||
/// The returned range would be invalidated by another `write_all`. To prevent that, we capture `&_`.
|
||||
fn written_range(&self) -> (impl Deref<Target = Range<usize>> + '_) {
|
||||
let nwritten_blocks = usize::try_from(self.nwritten_blocks).unwrap();
|
||||
struct Wrapper(Range<usize>);
|
||||
impl Deref for Wrapper {
|
||||
type Target = Range<usize>;
|
||||
fn deref(&self) -> &Range<usize> {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
Wrapper(0..nwritten_blocks * PAGE_SZ)
|
||||
}
|
||||
}
|
||||
|
||||
impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmingWriter {
|
||||
async fn write_all<Buf: tokio_epoll_uring::IoBuf + Send>(
|
||||
&mut self,
|
||||
buf: FullSlice<Buf>,
|
||||
ctx: &RequestContext,
|
||||
) -> std::io::Result<(usize, FullSlice<Buf>)> {
|
||||
let buflen = buf.len();
|
||||
assert_eq!(
|
||||
buflen % PAGE_SZ,
|
||||
0,
|
||||
"{buflen} ; we know TAIL_SZ is a PAGE_SZ multiple, and write_buffered_borrowed is used"
|
||||
);
|
||||
|
||||
// Do the IO.
|
||||
let buf = match self.file.write_all(buf, ctx).await {
|
||||
(buf, Ok(nwritten)) => {
|
||||
assert_eq!(nwritten, buflen);
|
||||
buf
|
||||
}
|
||||
(_, Err(e)) => {
|
||||
return Err(std::io::Error::new(
|
||||
ErrorKind::Other,
|
||||
// order error before path because path is long and error is short
|
||||
format!(
|
||||
"ephemeral_file: write_blob: write-back tail self.nwritten_blocks={}, buflen={}, {:#}: {}",
|
||||
self.nwritten_blocks, buflen, e, self.file.path,
|
||||
),
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
let nblocks = buflen / PAGE_SZ;
|
||||
let nblocks32 = u32::try_from(nblocks).unwrap();
|
||||
|
||||
if matches!(self.prewarm_on_write, PrewarmOnWrite::Yes) {
|
||||
// Pre-warm page cache with the contents.
|
||||
// At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming
|
||||
// benefits the code that writes InMemoryLayer=>L0 layers.
|
||||
|
||||
let cache = page_cache::get();
|
||||
static CTX: Lazy<RequestContext> = Lazy::new(|| {
|
||||
RequestContext::new(
|
||||
crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache,
|
||||
crate::context::DownloadBehavior::Error,
|
||||
)
|
||||
});
|
||||
for blknum_in_buffer in 0..nblocks {
|
||||
let blk_in_buffer =
|
||||
&buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ];
|
||||
let blknum = self
|
||||
.nwritten_blocks
|
||||
.checked_add(blknum_in_buffer as u32)
|
||||
.unwrap();
|
||||
match cache
|
||||
.read_immutable_buf(self.page_cache_file_id, blknum, &CTX)
|
||||
.await
|
||||
{
|
||||
Err(e) => {
|
||||
error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
|
||||
// fail gracefully, it's not the end of the world if we can't pre-warm the cache here
|
||||
}
|
||||
Ok(v) => match v {
|
||||
page_cache::ReadBufResult::Found(_guard) => {
|
||||
// This function takes &mut self, so, it shouldn't be possible to reach this point.
|
||||
unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \
|
||||
and this function takes &mut self, so, no concurrent read_blk is possible");
|
||||
}
|
||||
page_cache::ReadBufResult::NotFound(mut write_guard) => {
|
||||
write_guard.copy_from_slice(blk_in_buffer);
|
||||
let _ = write_guard.mark_valid();
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
self.nwritten_blocks = self.nwritten_blocks.checked_add(nblocks32).unwrap();
|
||||
Ok((buflen, buf))
|
||||
}
|
||||
}
|
||||
@@ -1,145 +0,0 @@
|
||||
//! The heart of how [`super::EphemeralFile`] does its reads and writes.
|
||||
//!
|
||||
//! # Writes
|
||||
//!
|
||||
//! [`super::EphemeralFile`] writes small, borrowed buffers using [`RW::write_all_borrowed`].
|
||||
//! The [`RW`] batches these into [`TAIL_SZ`] bigger writes, using [`owned_buffers_io::write::BufferedWriter`].
|
||||
//!
|
||||
//! # Reads
|
||||
//!
|
||||
//! [`super::EphemeralFile`] always reads full [`PAGE_SZ`]ed blocks using [`RW::read_blk`].
|
||||
//!
|
||||
//! The [`RW`] serves these reads either from the buffered writer's in-memory buffer
|
||||
//! or redirects the caller to read from the underlying [`OwnedAsyncWriter`]
|
||||
//! if the read is for the prefix that has already been flushed.
|
||||
//!
|
||||
//! # Current Usage
|
||||
//!
|
||||
//! The current user of this module is [`super::page_caching::RW`].
|
||||
|
||||
mod zero_padded;
|
||||
|
||||
use crate::{
|
||||
context::RequestContext,
|
||||
page_cache::PAGE_SZ,
|
||||
virtual_file::owned_buffers_io::{
|
||||
self,
|
||||
write::{Buffer, OwnedAsyncWriter},
|
||||
},
|
||||
};
|
||||
|
||||
const TAIL_SZ: usize = 64 * 1024;
|
||||
|
||||
/// See module-level comment.
|
||||
pub struct RW<W: OwnedAsyncWriter> {
|
||||
buffered_writer: owned_buffers_io::write::BufferedWriter<
|
||||
zero_padded::Buffer<TAIL_SZ>,
|
||||
owned_buffers_io::util::size_tracking_writer::Writer<W>,
|
||||
>,
|
||||
}
|
||||
|
||||
pub enum ReadResult<'a, W> {
|
||||
NeedsReadFromWriter { writer: &'a W },
|
||||
ServedFromZeroPaddedMutableTail { buffer: &'a [u8; PAGE_SZ] },
|
||||
}
|
||||
|
||||
impl<W> RW<W>
|
||||
where
|
||||
W: OwnedAsyncWriter,
|
||||
{
|
||||
pub fn new(writer: W) -> Self {
|
||||
let bytes_flushed_tracker =
|
||||
owned_buffers_io::util::size_tracking_writer::Writer::new(writer);
|
||||
let buffered_writer = owned_buffers_io::write::BufferedWriter::new(
|
||||
bytes_flushed_tracker,
|
||||
zero_padded::Buffer::default(),
|
||||
);
|
||||
Self { buffered_writer }
|
||||
}
|
||||
|
||||
pub(crate) fn as_writer(&self) -> &W {
|
||||
self.buffered_writer.as_inner().as_inner()
|
||||
}
|
||||
|
||||
pub async fn write_all_borrowed(
|
||||
&mut self,
|
||||
buf: &[u8],
|
||||
ctx: &RequestContext,
|
||||
) -> std::io::Result<usize> {
|
||||
self.buffered_writer.write_buffered_borrowed(buf, ctx).await
|
||||
}
|
||||
|
||||
pub fn bytes_written(&self) -> u64 {
|
||||
let flushed_offset = self.buffered_writer.as_inner().bytes_written();
|
||||
let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
|
||||
flushed_offset + u64::try_from(buffer.pending()).unwrap()
|
||||
}
|
||||
|
||||
/// Get a slice of all blocks that [`Self::read_blk`] would return as [`ReadResult::ServedFromZeroPaddedMutableTail`].
|
||||
pub fn get_tail_zero_padded(&self) -> &[u8] {
|
||||
let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
|
||||
let buffer_written_up_to = buffer.pending();
|
||||
// pad to next page boundary
|
||||
let read_up_to = if buffer_written_up_to % PAGE_SZ == 0 {
|
||||
buffer_written_up_to
|
||||
} else {
|
||||
buffer_written_up_to
|
||||
.checked_add(PAGE_SZ - (buffer_written_up_to % PAGE_SZ))
|
||||
.unwrap()
|
||||
};
|
||||
&buffer.as_zero_padded_slice()[0..read_up_to]
|
||||
}
|
||||
|
||||
pub(crate) async fn read_blk(&self, blknum: u32) -> Result<ReadResult<'_, W>, std::io::Error> {
|
||||
let flushed_offset = self.buffered_writer.as_inner().bytes_written();
|
||||
let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
|
||||
let buffered_offset = flushed_offset + u64::try_from(buffer.pending()).unwrap();
|
||||
let read_offset = (blknum as u64) * (PAGE_SZ as u64);
|
||||
|
||||
// The trailing page ("block") might only be partially filled,
|
||||
// yet the blob_io code relies on us to return a full PAGE_SZed slice anyway.
|
||||
// Moreover, it has to be zero-padded, because when we still had
|
||||
// a write-back page cache, it provided pre-zeroed pages, and blob_io came to rely on it.
|
||||
// DeltaLayer probably has the same issue, not sure why it needs no special treatment.
|
||||
// => check here that the read doesn't go beyond this potentially trailing
|
||||
// => the zero-padding is done in the `else` branch below
|
||||
let blocks_written = if buffered_offset % (PAGE_SZ as u64) == 0 {
|
||||
buffered_offset / (PAGE_SZ as u64)
|
||||
} else {
|
||||
(buffered_offset / (PAGE_SZ as u64)) + 1
|
||||
};
|
||||
if (blknum as u64) >= blocks_written {
|
||||
return Err(std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!("read past end of ephemeral_file: read=0x{read_offset:x} buffered=0x{buffered_offset:x} flushed=0x{flushed_offset}")));
|
||||
}
|
||||
|
||||
// assertions for the `if-else` below
|
||||
assert_eq!(
|
||||
flushed_offset % (TAIL_SZ as u64), 0,
|
||||
"we only use write_buffered_borrowed to write to the buffered writer, so it's guaranteed that flushes happen buffer.cap()-sized chunks"
|
||||
);
|
||||
assert_eq!(
|
||||
flushed_offset % (PAGE_SZ as u64),
|
||||
0,
|
||||
"the logic below can't handle if the page is spread across the flushed part and the buffer"
|
||||
);
|
||||
|
||||
if read_offset < flushed_offset {
|
||||
assert!(read_offset + (PAGE_SZ as u64) <= flushed_offset);
|
||||
Ok(ReadResult::NeedsReadFromWriter {
|
||||
writer: self.as_writer(),
|
||||
})
|
||||
} else {
|
||||
let read_offset_in_buffer = read_offset
|
||||
.checked_sub(flushed_offset)
|
||||
.expect("would have taken `if` branch instead of this one");
|
||||
let read_offset_in_buffer = usize::try_from(read_offset_in_buffer).unwrap();
|
||||
let zero_padded_slice = buffer.as_zero_padded_slice();
|
||||
let page = &zero_padded_slice[read_offset_in_buffer..(read_offset_in_buffer + PAGE_SZ)];
|
||||
Ok(ReadResult::ServedFromZeroPaddedMutableTail {
|
||||
buffer: page
|
||||
.try_into()
|
||||
.expect("the slice above got it as page-size slice"),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,110 +0,0 @@
|
||||
//! A [`crate::virtual_file::owned_buffers_io::write::Buffer`] whose
|
||||
//! unwritten range is guaranteed to be zero-initialized.
|
||||
//! This is used by [`crate::tenant::ephemeral_file::zero_padded_read_write::RW::read_blk`]
|
||||
//! to serve page-sized reads of the trailing page when the trailing page has only been partially filled.
|
||||
|
||||
use std::mem::MaybeUninit;
|
||||
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::FullSlice;
|
||||
|
||||
/// See module-level comment.
|
||||
pub struct Buffer<const N: usize> {
|
||||
allocation: Box<[u8; N]>,
|
||||
written: usize,
|
||||
}
|
||||
|
||||
impl<const N: usize> Default for Buffer<N> {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
allocation: Box::new(
|
||||
// SAFETY: zeroed memory is a valid [u8; N]
|
||||
unsafe { MaybeUninit::zeroed().assume_init() },
|
||||
),
|
||||
written: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<const N: usize> Buffer<N> {
|
||||
#[inline(always)]
|
||||
fn invariants(&self) {
|
||||
// don't check by default, unoptimized is too expensive even for debug mode
|
||||
if false {
|
||||
debug_assert!(self.written <= N, "{}", self.written);
|
||||
debug_assert!(self.allocation[self.written..N].iter().all(|v| *v == 0));
|
||||
}
|
||||
}
|
||||
|
||||
pub fn as_zero_padded_slice(&self) -> &[u8; N] {
|
||||
&self.allocation
|
||||
}
|
||||
}
|
||||
|
||||
impl<const N: usize> crate::virtual_file::owned_buffers_io::write::Buffer for Buffer<N> {
|
||||
type IoBuf = Self;
|
||||
|
||||
fn cap(&self) -> usize {
|
||||
self.allocation.len()
|
||||
}
|
||||
|
||||
fn extend_from_slice(&mut self, other: &[u8]) {
|
||||
self.invariants();
|
||||
let remaining = self.allocation.len() - self.written;
|
||||
if other.len() > remaining {
|
||||
panic!("calling extend_from_slice() with insufficient remaining capacity");
|
||||
}
|
||||
self.allocation[self.written..(self.written + other.len())].copy_from_slice(other);
|
||||
self.written += other.len();
|
||||
self.invariants();
|
||||
}
|
||||
|
||||
fn pending(&self) -> usize {
|
||||
self.written
|
||||
}
|
||||
|
||||
fn flush(self) -> FullSlice<Self> {
|
||||
self.invariants();
|
||||
let written = self.written;
|
||||
FullSlice::must_new(tokio_epoll_uring::BoundedBuf::slice(self, 0..written))
|
||||
}
|
||||
|
||||
fn reuse_after_flush(iobuf: Self::IoBuf) -> Self {
|
||||
let Self {
|
||||
mut allocation,
|
||||
written,
|
||||
} = iobuf;
|
||||
allocation[0..written].fill(0);
|
||||
let new = Self {
|
||||
allocation,
|
||||
written: 0,
|
||||
};
|
||||
new.invariants();
|
||||
new
|
||||
}
|
||||
}
|
||||
|
||||
/// We have this trait impl so that the `flush` method in the `Buffer` impl above can produce a
|
||||
/// [`tokio_epoll_uring::BoundedBuf::slice`] of the [`Self::written`] range of the data.
|
||||
///
|
||||
/// Remember that bytes_init is generally _not_ a tracker of the amount
|
||||
/// of valid data in the io buffer; we use `Slice` for that.
|
||||
/// The `IoBuf` is _only_ for keeping track of uninitialized memory, a bit like MaybeUninit.
|
||||
///
|
||||
/// SAFETY:
|
||||
///
|
||||
/// The [`Self::allocation`] is stable becauses boxes are stable.
|
||||
/// The memory is zero-initialized, so, bytes_init is always N.
|
||||
unsafe impl<const N: usize> tokio_epoll_uring::IoBuf for Buffer<N> {
|
||||
fn stable_ptr(&self) -> *const u8 {
|
||||
self.allocation.as_ptr()
|
||||
}
|
||||
|
||||
fn bytes_init(&self) -> usize {
|
||||
// Yes, N, not self.written; Read the full comment of this impl block!
|
||||
N
|
||||
}
|
||||
|
||||
fn bytes_total(&self) -> usize {
|
||||
N
|
||||
}
|
||||
}
|
||||
@@ -464,7 +464,7 @@ impl LayerMap {
|
||||
pub(self) fn insert_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
|
||||
// TODO: See #3869, resulting #4088, attempted fix and repro #4094
|
||||
|
||||
if Self::is_l0(&layer_desc.key_range) {
|
||||
if Self::is_l0(&layer_desc.key_range, layer_desc.is_delta) {
|
||||
self.l0_delta_layers.push(layer_desc.clone().into());
|
||||
}
|
||||
|
||||
@@ -483,7 +483,7 @@ impl LayerMap {
|
||||
self.historic
|
||||
.remove(historic_layer_coverage::LayerKey::from(layer_desc));
|
||||
let layer_key = layer_desc.key();
|
||||
if Self::is_l0(&layer_desc.key_range) {
|
||||
if Self::is_l0(&layer_desc.key_range, layer_desc.is_delta) {
|
||||
let len_before = self.l0_delta_layers.len();
|
||||
let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
|
||||
l0_delta_layers.retain(|other| other.key() != layer_key);
|
||||
@@ -600,8 +600,8 @@ impl LayerMap {
|
||||
}
|
||||
|
||||
/// Check if the key range resembles that of an L0 layer.
|
||||
pub fn is_l0(key_range: &Range<Key>) -> bool {
|
||||
key_range == &(Key::MIN..Key::MAX)
|
||||
pub fn is_l0(key_range: &Range<Key>, is_delta_layer: bool) -> bool {
|
||||
is_delta_layer && key_range == &(Key::MIN..Key::MAX)
|
||||
}
|
||||
|
||||
/// This function determines which layers are counted in `count_deltas`:
|
||||
@@ -628,7 +628,7 @@ impl LayerMap {
|
||||
/// than just the current partition_range.
|
||||
pub fn is_reimage_worthy(layer: &PersistentLayerDesc, partition_range: &Range<Key>) -> bool {
|
||||
// Case 1
|
||||
if !Self::is_l0(&layer.key_range) {
|
||||
if !Self::is_l0(&layer.key_range, layer.is_delta) {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@@ -565,7 +565,7 @@ mod tests {
|
||||
);
|
||||
let expected_bytes = vec![
|
||||
/* TimelineMetadataHeader */
|
||||
4, 37, 101, 34, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2)
|
||||
74, 104, 158, 105, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2)
|
||||
/* TimelineMetadataBodyV2 */
|
||||
0, 0, 0, 0, 0, 0, 2, 0, // disk_consistent_lsn (8 bytes)
|
||||
1, 0, 0, 0, 0, 0, 0, 1, 0, // prev_record_lsn (9 bytes)
|
||||
@@ -574,7 +574,7 @@ mod tests {
|
||||
0, 0, 0, 0, 0, 0, 0, 0, // ancestor_lsn (8 bytes)
|
||||
0, 0, 0, 0, 0, 0, 0, 0, // latest_gc_cutoff_lsn (8 bytes)
|
||||
0, 0, 0, 0, 0, 0, 0, 0, // initdb_lsn (8 bytes)
|
||||
0, 0, 0, 15, // pg_version (4 bytes)
|
||||
0, 0, 0, 16, // pg_version (4 bytes)
|
||||
/* padding bytes */
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
||||
@@ -1728,7 +1728,7 @@ impl RemoteTimelineClient {
|
||||
task_mgr::spawn(
|
||||
&self.runtime,
|
||||
TaskKind::RemoteUploadTask,
|
||||
Some(self.tenant_shard_id),
|
||||
self.tenant_shard_id,
|
||||
Some(self.timeline_id),
|
||||
"remote upload",
|
||||
async move {
|
||||
|
||||
@@ -8,6 +8,7 @@ use std::{sync::Arc, time::SystemTime};
|
||||
use crate::{
|
||||
context::RequestContext,
|
||||
disk_usage_eviction_task::DiskUsageEvictionInfo,
|
||||
metrics::SECONDARY_HEATMAP_TOTAL_SIZE,
|
||||
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
||||
};
|
||||
|
||||
@@ -105,6 +106,9 @@ pub(crate) struct SecondaryTenant {
|
||||
|
||||
// Sum of layer sizes on local disk
|
||||
pub(super) resident_size_metric: UIntGauge,
|
||||
|
||||
// Sum of layer sizes in the most recently downloaded heatmap
|
||||
pub(super) heatmap_total_size_metric: UIntGauge,
|
||||
}
|
||||
|
||||
impl Drop for SecondaryTenant {
|
||||
@@ -112,6 +116,7 @@ impl Drop for SecondaryTenant {
|
||||
let tenant_id = self.tenant_shard_id.tenant_id.to_string();
|
||||
let shard_id = format!("{}", self.tenant_shard_id.shard_slug());
|
||||
let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
|
||||
let _ = SECONDARY_HEATMAP_TOTAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -128,6 +133,10 @@ impl SecondaryTenant {
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id])
|
||||
.unwrap();
|
||||
|
||||
let heatmap_total_size_metric = SECONDARY_HEATMAP_TOTAL_SIZE
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id])
|
||||
.unwrap();
|
||||
|
||||
Arc::new(Self {
|
||||
tenant_shard_id,
|
||||
// todo: shall we make this a descendent of the
|
||||
@@ -145,6 +154,7 @@ impl SecondaryTenant {
|
||||
progress: std::sync::Mutex::default(),
|
||||
|
||||
resident_size_metric,
|
||||
heatmap_total_size_metric,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
@@ -829,6 +829,12 @@ impl<'a> TenantDownloader<'a> {
|
||||
layers_downloaded: 0,
|
||||
bytes_downloaded: 0,
|
||||
};
|
||||
|
||||
// Also expose heatmap bytes_total as a metric
|
||||
self.secondary_state
|
||||
.heatmap_total_size_metric
|
||||
.set(heatmap_stats.bytes);
|
||||
|
||||
// Accumulate list of things to delete while holding the detail lock, for execution after dropping the lock
|
||||
let mut delete_layers = Vec::new();
|
||||
let mut delete_timelines = Vec::new();
|
||||
|
||||
@@ -29,16 +29,16 @@ pub(super) struct HeatMapTenant {
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub(crate) struct HeatMapTimeline {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub(super) timeline_id: TimelineId,
|
||||
pub(crate) timeline_id: TimelineId,
|
||||
|
||||
pub(super) layers: Vec<HeatMapLayer>,
|
||||
pub(crate) layers: Vec<HeatMapLayer>,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub(crate) struct HeatMapLayer {
|
||||
pub(super) name: LayerName,
|
||||
pub(super) metadata: LayerFileMetadata,
|
||||
pub(crate) name: LayerName,
|
||||
pub(crate) metadata: LayerFileMetadata,
|
||||
|
||||
#[serde_as(as = "TimestampSeconds<i64>")]
|
||||
pub(super) access_time: SystemTime,
|
||||
|
||||
@@ -2,13 +2,12 @@
|
||||
|
||||
pub mod delta_layer;
|
||||
pub mod image_layer;
|
||||
pub(crate) mod inmemory_layer;
|
||||
pub mod inmemory_layer;
|
||||
pub(crate) mod layer;
|
||||
mod layer_desc;
|
||||
mod layer_name;
|
||||
pub mod merge_iterator;
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod split_writer;
|
||||
|
||||
use crate::context::{AccessStatsBehavior, RequestContext};
|
||||
|
||||
@@ -36,10 +36,11 @@ use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, Fi
|
||||
use crate::tenant::disk_btree::{
|
||||
DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
|
||||
};
|
||||
use crate::tenant::storage_layer::layer::S3_UPLOAD_LIMIT;
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::vectored_blob_io::{
|
||||
BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
|
||||
VectoredReadPlanner,
|
||||
VectoredReadCoalesceMode, VectoredReadPlanner,
|
||||
};
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
|
||||
@@ -64,7 +65,7 @@ use std::os::unix::fs::FileExt;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::OnceCell;
|
||||
use tokio_epoll_uring::IoBufMut;
|
||||
use tokio_epoll_uring::IoBuf;
|
||||
use tracing::*;
|
||||
|
||||
use utils::{
|
||||
@@ -224,14 +225,24 @@ pub struct DeltaLayerInner {
|
||||
file: VirtualFile,
|
||||
file_id: FileId,
|
||||
|
||||
#[allow(dead_code)]
|
||||
layer_key_range: Range<Key>,
|
||||
#[allow(dead_code)]
|
||||
layer_lsn_range: Range<Lsn>,
|
||||
|
||||
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
|
||||
}
|
||||
|
||||
impl DeltaLayerInner {
|
||||
pub(crate) fn layer_dbg_info(&self) -> String {
|
||||
format!(
|
||||
"delta {}..{} {}..{}",
|
||||
self.key_range().start,
|
||||
self.key_range().end,
|
||||
self.lsn_range().start,
|
||||
self.lsn_range().end
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for DeltaLayerInner {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("DeltaLayerInner")
|
||||
@@ -458,7 +469,7 @@ impl DeltaLayerWriterInner {
|
||||
ctx: &RequestContext,
|
||||
) -> (FullSlice<Buf>, anyhow::Result<()>)
|
||||
where
|
||||
Buf: IoBufMut + Send,
|
||||
Buf: IoBuf + Send,
|
||||
{
|
||||
assert!(
|
||||
self.lsn_range.start <= lsn,
|
||||
@@ -556,7 +567,6 @@ impl DeltaLayerWriterInner {
|
||||
// 5GB limit for objects without multipart upload (which we don't want to use)
|
||||
// Make it a little bit below to account for differing GB units
|
||||
// https://docs.aws.amazon.com/AmazonS3/latest/userguide/upload-objects.html
|
||||
const S3_UPLOAD_LIMIT: u64 = 4_500_000_000;
|
||||
ensure!(
|
||||
metadata.len() <= S3_UPLOAD_LIMIT,
|
||||
"Created delta layer file at {} of size {} above limit {S3_UPLOAD_LIMIT}!",
|
||||
@@ -666,7 +676,7 @@ impl DeltaLayerWriter {
|
||||
ctx: &RequestContext,
|
||||
) -> (FullSlice<Buf>, anyhow::Result<()>)
|
||||
where
|
||||
Buf: IoBufMut + Send,
|
||||
Buf: IoBuf + Send,
|
||||
{
|
||||
self.inner
|
||||
.as_mut()
|
||||
@@ -690,12 +700,10 @@ impl DeltaLayerWriter {
|
||||
self.inner.take().unwrap().finish(key_end, ctx).await
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn num_keys(&self) -> usize {
|
||||
self.inner.as_ref().unwrap().num_keys
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn estimated_size(&self) -> u64 {
|
||||
let inner = self.inner.as_ref().unwrap();
|
||||
inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64
|
||||
@@ -872,44 +880,6 @@ impl DeltaLayerInner {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future.
|
||||
pub(super) async fn load_key_values(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<(Key, Lsn, Value)>> {
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
self.index_start_blk,
|
||||
self.index_root_blk,
|
||||
block_reader,
|
||||
);
|
||||
let mut result = Vec::new();
|
||||
let mut stream =
|
||||
Box::pin(self.stream_index_forwards(index_reader, &[0; DELTA_KEY_SIZE], ctx));
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let cursor = block_reader.block_cursor();
|
||||
let mut buf = Vec::new();
|
||||
while let Some(item) = stream.next().await {
|
||||
let (key, lsn, pos) = item?;
|
||||
// TODO: dedup code with get_reconstruct_value
|
||||
// TODO: ctx handling and sharding
|
||||
cursor
|
||||
.read_blob_into_buf(pos.pos(), &mut buf, ctx)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to read blob from virtual file {}", self.file.path)
|
||||
})?;
|
||||
let val = Value::des(&buf).with_context(|| {
|
||||
format!(
|
||||
"Failed to deserialize file blob from virtual file {}",
|
||||
self.file.path
|
||||
)
|
||||
})?;
|
||||
result.push((key, lsn, val));
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
async fn plan_reads<Reader>(
|
||||
keyspace: &KeySpace,
|
||||
lsn_range: Range<Lsn>,
|
||||
@@ -1195,6 +1165,7 @@ impl DeltaLayerInner {
|
||||
let mut prev: Option<(Key, Lsn, BlobRef)> = None;
|
||||
|
||||
let mut read_builder: Option<VectoredReadBuilder> = None;
|
||||
let read_mode = VectoredReadCoalesceMode::get();
|
||||
|
||||
let max_read_size = self
|
||||
.max_vectored_read_bytes
|
||||
@@ -1243,6 +1214,7 @@ impl DeltaLayerInner {
|
||||
offsets.end.pos(),
|
||||
meta,
|
||||
max_read_size,
|
||||
read_mode,
|
||||
))
|
||||
}
|
||||
} else {
|
||||
@@ -1527,6 +1499,10 @@ pub struct DeltaLayerIterator<'a> {
|
||||
}
|
||||
|
||||
impl<'a> DeltaLayerIterator<'a> {
|
||||
pub(crate) fn layer_dbg_info(&self) -> String {
|
||||
self.delta_layer.layer_dbg_info()
|
||||
}
|
||||
|
||||
/// Retrieve a batch of key-value pairs into the iterator buffer.
|
||||
async fn next_batch(&mut self) -> anyhow::Result<()> {
|
||||
assert!(self.key_values_batch.is_empty());
|
||||
@@ -2281,7 +2257,7 @@ pub(crate) mod test {
|
||||
// every key should be a batch b/c the value is larger than max_read_size
|
||||
assert_eq!(iter.key_values_batch.len(), 1);
|
||||
} else {
|
||||
assert_eq!(iter.key_values_batch.len(), batch_size);
|
||||
assert!(iter.key_values_batch.len() <= batch_size);
|
||||
}
|
||||
if num_items >= N {
|
||||
break;
|
||||
|
||||
@@ -28,7 +28,7 @@ use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
|
||||
use crate::page_cache::{self, FileId, PAGE_SZ};
|
||||
use crate::repository::{Key, Value, KEY_SIZE};
|
||||
use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
|
||||
use crate::tenant::block_io::{BlockBuf, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{
|
||||
DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
|
||||
};
|
||||
@@ -167,6 +167,17 @@ pub struct ImageLayerInner {
|
||||
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
|
||||
}
|
||||
|
||||
impl ImageLayerInner {
|
||||
pub(crate) fn layer_dbg_info(&self) -> String {
|
||||
format!(
|
||||
"image {}..{} {}",
|
||||
self.key_range().start,
|
||||
self.key_range().end,
|
||||
self.lsn()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for ImageLayerInner {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("ImageLayerInner")
|
||||
@@ -442,33 +453,6 @@ impl ImageLayerInner {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future.
|
||||
pub(super) async fn load_key_values(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<(Key, Lsn, Value)>> {
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let tree_reader =
|
||||
DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);
|
||||
let mut result = Vec::new();
|
||||
let mut stream = Box::pin(tree_reader.into_stream(&[0; KEY_SIZE], ctx));
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let cursor = block_reader.block_cursor();
|
||||
while let Some(item) = stream.next().await {
|
||||
// TODO: dedup code with get_reconstruct_value
|
||||
let (raw_key, offset) = item?;
|
||||
let key = Key::from_slice(&raw_key[..KEY_SIZE]);
|
||||
// TODO: ctx handling and sharding
|
||||
let blob = cursor
|
||||
.read_blob(offset, ctx)
|
||||
.await
|
||||
.with_context(|| format!("failed to read value from offset {}", offset))?;
|
||||
let value = Bytes::from(blob);
|
||||
result.push((key, self.lsn, Value::Image(value)));
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Traverse the layer's index to build read operations on the overlap of the input keyspace
|
||||
/// and the keys in this layer.
|
||||
///
|
||||
@@ -700,15 +684,11 @@ struct ImageLayerWriterInner {
|
||||
blob_writer: BlobWriter<false>,
|
||||
tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
|
||||
|
||||
#[cfg_attr(not(feature = "testing"), allow(dead_code))]
|
||||
#[cfg(feature = "testing")]
|
||||
last_written_key: Key,
|
||||
}
|
||||
|
||||
impl ImageLayerWriterInner {
|
||||
fn size(&self) -> u64 {
|
||||
self.tree.borrow_writer().size() + self.blob_writer.size()
|
||||
}
|
||||
|
||||
///
|
||||
/// Start building a new image layer.
|
||||
///
|
||||
@@ -763,6 +743,7 @@ impl ImageLayerWriterInner {
|
||||
uncompressed_bytes_eligible: 0,
|
||||
uncompressed_bytes_chosen: 0,
|
||||
num_keys: 0,
|
||||
#[cfg(feature = "testing")]
|
||||
last_written_key: Key::MIN,
|
||||
};
|
||||
|
||||
@@ -815,12 +796,11 @@ impl ImageLayerWriterInner {
|
||||
///
|
||||
/// Finish writing the image layer.
|
||||
///
|
||||
async fn finish(
|
||||
async fn finish_layer(
|
||||
self,
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
end_key: Option<Key>,
|
||||
) -> anyhow::Result<ResidentLayer> {
|
||||
) -> anyhow::Result<PersistentLayerDesc> {
|
||||
let index_start_blk =
|
||||
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
|
||||
|
||||
@@ -843,13 +823,19 @@ impl ImageLayerWriterInner {
|
||||
res?;
|
||||
}
|
||||
|
||||
let final_key_range = if let Some(end_key) = end_key {
|
||||
self.key_range.start..end_key
|
||||
} else {
|
||||
self.key_range.clone()
|
||||
};
|
||||
|
||||
// Fill in the summary on blk 0
|
||||
let summary = Summary {
|
||||
magic: IMAGE_FILE_MAGIC,
|
||||
format_version: STORAGE_FORMAT_VERSION,
|
||||
tenant_id: self.tenant_shard_id.tenant_id,
|
||||
timeline_id: self.timeline_id,
|
||||
key_range: self.key_range.clone(),
|
||||
key_range: final_key_range.clone(),
|
||||
lsn: self.lsn,
|
||||
index_start_blk,
|
||||
index_root_blk,
|
||||
@@ -870,11 +856,7 @@ impl ImageLayerWriterInner {
|
||||
let desc = PersistentLayerDesc::new_img(
|
||||
self.tenant_shard_id,
|
||||
self.timeline_id,
|
||||
if let Some(end_key) = end_key {
|
||||
self.key_range.start..end_key
|
||||
} else {
|
||||
self.key_range.clone()
|
||||
},
|
||||
final_key_range,
|
||||
self.lsn,
|
||||
metadata.len(),
|
||||
);
|
||||
@@ -894,8 +876,22 @@ impl ImageLayerWriterInner {
|
||||
// fsync the file
|
||||
file.sync_all().await?;
|
||||
|
||||
Ok(desc)
|
||||
}
|
||||
|
||||
async fn finish(
|
||||
self,
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
end_key: Option<Key>,
|
||||
) -> anyhow::Result<ResidentLayer> {
|
||||
let path = self.path.clone();
|
||||
let conf = self.conf;
|
||||
|
||||
let desc = self.finish_layer(ctx, end_key).await?;
|
||||
|
||||
// FIXME: why not carry the virtualfile here, it supports renaming?
|
||||
let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
|
||||
let layer = Layer::finish_creating(conf, timeline, desc, &path)?;
|
||||
|
||||
info!("created image layer {}", layer.local_path());
|
||||
|
||||
@@ -963,14 +959,12 @@ impl ImageLayerWriter {
|
||||
self.inner.as_mut().unwrap().put_image(key, img, ctx).await
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
/// Estimated size of the image layer.
|
||||
pub(crate) fn estimated_size(&self) -> u64 {
|
||||
let inner = self.inner.as_ref().unwrap();
|
||||
inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn num_keys(&self) -> usize {
|
||||
self.inner.as_ref().unwrap().num_keys
|
||||
}
|
||||
@@ -986,7 +980,32 @@ impl ImageLayerWriter {
|
||||
self.inner.take().unwrap().finish(timeline, ctx, None).await
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
/// Like finish(), but doesn't create the ResidentLayer struct. This can be used
|
||||
/// by utilities that don't have a full-blown Timeline.
|
||||
pub(crate) async fn finish_raw(
|
||||
mut self,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<super::PersistentLayerDesc> {
|
||||
let inner = self.inner.take().unwrap();
|
||||
|
||||
let name = ImageLayerName {
|
||||
key_range: inner.key_range.clone(),
|
||||
lsn: inner.lsn,
|
||||
};
|
||||
|
||||
let temp_path = inner.path.clone();
|
||||
let final_path = inner.conf.timeline_path(&inner.tenant_shard_id, &inner.timeline_id)
|
||||
.join(name.to_string());
|
||||
|
||||
let desc = inner.finish_layer(ctx, None).await?;
|
||||
|
||||
// Rename the file to final name like Layer::finish_creating() does
|
||||
utils::fs_ext::rename_noreplace(temp_path.as_std_path(), final_path.as_std_path())
|
||||
.with_context(|| format!("rename temporary file as {final_path}"))?;
|
||||
|
||||
Ok(desc)
|
||||
}
|
||||
|
||||
/// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive.
|
||||
pub(super) async fn finish_with_end_key(
|
||||
mut self,
|
||||
@@ -1000,10 +1019,6 @@ impl ImageLayerWriter {
|
||||
.finish(timeline, ctx, Some(end_key))
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) fn size(&self) -> u64 {
|
||||
self.inner.as_ref().unwrap().size()
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for ImageLayerWriter {
|
||||
@@ -1024,6 +1039,10 @@ pub struct ImageLayerIterator<'a> {
|
||||
}
|
||||
|
||||
impl<'a> ImageLayerIterator<'a> {
|
||||
pub(crate) fn layer_dbg_info(&self) -> String {
|
||||
self.image_layer.layer_dbg_info()
|
||||
}
|
||||
|
||||
/// Retrieve a batch of key-value pairs into the iterator buffer.
|
||||
async fn next_batch(&mut self) -> anyhow::Result<()> {
|
||||
assert!(self.key_values_batch.is_empty());
|
||||
@@ -1375,7 +1394,7 @@ mod test {
|
||||
// every key should be a batch b/c the value is larger than max_read_size
|
||||
assert_eq!(iter.key_values_batch.len(), 1);
|
||||
} else {
|
||||
assert_eq!(iter.key_values_batch.len(), batch_size);
|
||||
assert!(iter.key_values_batch.len() <= batch_size);
|
||||
}
|
||||
if num_items >= N {
|
||||
break;
|
||||
|
||||
@@ -4,23 +4,23 @@
|
||||
//! held in an ephemeral file, not in memory. The metadata for each page version, i.e.
|
||||
//! its position in the file, is kept in memory, though.
|
||||
//!
|
||||
use crate::assert_u64_eq_usize::{u64_to_usize, U64IsUsize, UsizeIsU64};
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef};
|
||||
use crate::tenant::ephemeral_file::EphemeralFile;
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
|
||||
use crate::{l0_flush, page_cache, walrecord};
|
||||
use anyhow::{anyhow, Result};
|
||||
use crate::{l0_flush, page_cache};
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::key::CompactKey;
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::InMemoryLayerInfo;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::sync::{Arc, OnceLock};
|
||||
use std::time::Instant;
|
||||
use tracing::*;
|
||||
@@ -33,12 +33,14 @@ use std::fmt::Write;
|
||||
use std::ops::Range;
|
||||
use std::sync::atomic::Ordering as AtomicOrdering;
|
||||
use std::sync::atomic::{AtomicU64, AtomicUsize};
|
||||
use tokio::sync::{RwLock, RwLockWriteGuard};
|
||||
use tokio::sync::RwLock;
|
||||
|
||||
use super::{
|
||||
DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState,
|
||||
};
|
||||
|
||||
pub(crate) mod vectored_dio_read;
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
|
||||
pub(crate) struct InMemoryLayerFileId(page_cache::FileId);
|
||||
|
||||
@@ -78,9 +80,9 @@ impl std::fmt::Debug for InMemoryLayer {
|
||||
|
||||
pub struct InMemoryLayerInner {
|
||||
/// All versions of all pages in the layer are kept here. Indexed
|
||||
/// by block number and LSN. The value is an offset into the
|
||||
/// by block number and LSN. The [`IndexEntry`] is an offset into the
|
||||
/// ephemeral file where the page version is stored.
|
||||
index: BTreeMap<CompactKey, VecMap<Lsn, u64>>,
|
||||
index: BTreeMap<CompactKey, VecMap<Lsn, IndexEntry>>,
|
||||
|
||||
/// The values are stored in a serialized format in this file.
|
||||
/// Each serialized Value is preceded by a 'u32' length field.
|
||||
@@ -90,6 +92,154 @@ pub struct InMemoryLayerInner {
|
||||
resource_units: GlobalResourceUnits,
|
||||
}
|
||||
|
||||
/// Support the same max blob length as blob_io, because ultimately
|
||||
/// all the InMemoryLayer contents end up being written into a delta layer,
|
||||
/// using the [`crate::tenant::blob_io`].
|
||||
const MAX_SUPPORTED_BLOB_LEN: usize = crate::tenant::blob_io::MAX_SUPPORTED_BLOB_LEN;
|
||||
const MAX_SUPPORTED_BLOB_LEN_BITS: usize = {
|
||||
let trailing_ones = MAX_SUPPORTED_BLOB_LEN.trailing_ones() as usize;
|
||||
let leading_zeroes = MAX_SUPPORTED_BLOB_LEN.leading_zeros() as usize;
|
||||
assert!(trailing_ones + leading_zeroes == std::mem::size_of::<usize>() * 8);
|
||||
trailing_ones
|
||||
};
|
||||
|
||||
/// See [`InMemoryLayerInner::index`].
|
||||
///
|
||||
/// For memory efficiency, the data is packed into a u64.
|
||||
///
|
||||
/// Layout:
|
||||
/// - 1 bit: `will_init`
|
||||
/// - [`MAX_SUPPORTED_BLOB_LEN_BITS`]: `len`
|
||||
/// - [`MAX_SUPPORTED_POS_BITS`]: `pos`
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub struct IndexEntry(u64);
|
||||
|
||||
impl IndexEntry {
|
||||
/// See [`Self::MAX_SUPPORTED_POS`].
|
||||
const MAX_SUPPORTED_POS_BITS: usize = {
|
||||
let remainder = 64 - 1 - MAX_SUPPORTED_BLOB_LEN_BITS;
|
||||
if remainder < 32 {
|
||||
panic!("pos can be u32 as per type system, support that");
|
||||
}
|
||||
remainder
|
||||
};
|
||||
/// The maximum supported blob offset that can be represented by [`Self`].
|
||||
/// See also [`Self::validate_checkpoint_distance`].
|
||||
const MAX_SUPPORTED_POS: usize = (1 << Self::MAX_SUPPORTED_POS_BITS) - 1;
|
||||
|
||||
// Layout
|
||||
const WILL_INIT_RANGE: Range<usize> = 0..1;
|
||||
const LEN_RANGE: Range<usize> =
|
||||
Self::WILL_INIT_RANGE.end..Self::WILL_INIT_RANGE.end + MAX_SUPPORTED_BLOB_LEN_BITS;
|
||||
const POS_RANGE: Range<usize> =
|
||||
Self::LEN_RANGE.end..Self::LEN_RANGE.end + Self::MAX_SUPPORTED_POS_BITS;
|
||||
const _ASSERT: () = {
|
||||
if Self::POS_RANGE.end != 64 {
|
||||
panic!("we don't want undefined bits for our own sanity")
|
||||
}
|
||||
};
|
||||
|
||||
/// Fails if and only if the offset or length encoded in `arg` is too large to be represented by [`Self`].
|
||||
///
|
||||
/// The only reason why that can happen in the system is if the [`InMemoryLayer`] grows too long.
|
||||
/// The [`InMemoryLayer`] size is determined by the checkpoint distance, enforced by [`crate::tenant::Timeline::should_roll`].
|
||||
///
|
||||
/// Thus, to avoid failure of this function, whenever we start up and/or change checkpoint distance,
|
||||
/// call [`Self::validate_checkpoint_distance`] with the new checkpoint distance value.
|
||||
///
|
||||
/// TODO: this check should happen ideally at config parsing time (and in the request handler when a change to checkpoint distance is requested)
|
||||
/// When cleaning this up, also look into the s3 max file size check that is performed in delta layer writer.
|
||||
#[inline(always)]
|
||||
fn new(arg: IndexEntryNewArgs) -> anyhow::Result<Self> {
|
||||
let IndexEntryNewArgs {
|
||||
base_offset,
|
||||
batch_offset,
|
||||
len,
|
||||
will_init,
|
||||
} = arg;
|
||||
|
||||
let pos = base_offset
|
||||
.checked_add(batch_offset)
|
||||
.ok_or_else(|| anyhow::anyhow!("base_offset + batch_offset overflows u64: base_offset={base_offset} batch_offset={batch_offset}"))?;
|
||||
|
||||
if pos.into_usize() > Self::MAX_SUPPORTED_POS {
|
||||
anyhow::bail!(
|
||||
"base_offset+batch_offset exceeds the maximum supported value: base_offset={base_offset} batch_offset={batch_offset} (+)={pos} max={max}",
|
||||
max = Self::MAX_SUPPORTED_POS
|
||||
);
|
||||
}
|
||||
|
||||
if len > MAX_SUPPORTED_BLOB_LEN {
|
||||
anyhow::bail!(
|
||||
"len exceeds the maximum supported length: len={len} max={MAX_SUPPORTED_BLOB_LEN}",
|
||||
);
|
||||
}
|
||||
|
||||
let mut data: u64 = 0;
|
||||
use bit_field::BitField;
|
||||
data.set_bits(Self::WILL_INIT_RANGE, if will_init { 1 } else { 0 });
|
||||
data.set_bits(Self::LEN_RANGE, len.into_u64());
|
||||
data.set_bits(Self::POS_RANGE, pos);
|
||||
|
||||
Ok(Self(data))
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn unpack(&self) -> IndexEntryUnpacked {
|
||||
use bit_field::BitField;
|
||||
IndexEntryUnpacked {
|
||||
will_init: self.0.get_bits(Self::WILL_INIT_RANGE) != 0,
|
||||
len: self.0.get_bits(Self::LEN_RANGE),
|
||||
pos: self.0.get_bits(Self::POS_RANGE),
|
||||
}
|
||||
}
|
||||
|
||||
/// See [`Self::new`].
|
||||
pub(crate) const fn validate_checkpoint_distance(
|
||||
checkpoint_distance: u64,
|
||||
) -> Result<(), &'static str> {
|
||||
if checkpoint_distance > Self::MAX_SUPPORTED_POS as u64 {
|
||||
return Err("exceeds the maximum supported value");
|
||||
}
|
||||
let res = u64_to_usize(checkpoint_distance).checked_add(MAX_SUPPORTED_BLOB_LEN);
|
||||
if res.is_none() {
|
||||
return Err(
|
||||
"checkpoint distance + max supported blob len overflows in-memory addition",
|
||||
);
|
||||
}
|
||||
|
||||
// NB: it is ok for the result of the addition to be larger than MAX_SUPPORTED_POS
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
const _ASSERT_DEFAULT_CHECKPOINT_DISTANCE_IS_VALID: () = {
|
||||
let res = Self::validate_checkpoint_distance(
|
||||
crate::tenant::config::defaults::DEFAULT_CHECKPOINT_DISTANCE,
|
||||
);
|
||||
if res.is_err() {
|
||||
panic!("default checkpoint distance is valid")
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/// Args to [`IndexEntry::new`].
|
||||
#[derive(Clone, Copy)]
|
||||
struct IndexEntryNewArgs {
|
||||
base_offset: u64,
|
||||
batch_offset: u64,
|
||||
len: usize,
|
||||
will_init: bool,
|
||||
}
|
||||
|
||||
/// Unpacked representation of the bitfielded [`IndexEntry`].
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
|
||||
struct IndexEntryUnpacked {
|
||||
will_init: bool,
|
||||
len: u64,
|
||||
pos: u64,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for InMemoryLayerInner {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("InMemoryLayerInner").finish()
|
||||
@@ -249,9 +399,7 @@ impl InMemoryLayer {
|
||||
/// debugging function to print out the contents of the layer
|
||||
///
|
||||
/// this is likely completly unused
|
||||
pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
let inner = self.inner.read().await;
|
||||
|
||||
pub async fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
||||
let end_str = self.end_lsn_or_max();
|
||||
|
||||
println!(
|
||||
@@ -259,39 +407,6 @@ impl InMemoryLayer {
|
||||
self.timeline_id, self.start_lsn, end_str,
|
||||
);
|
||||
|
||||
if !verbose {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let cursor = inner.file.block_cursor();
|
||||
let mut buf = Vec::new();
|
||||
for (key, vec_map) in inner.index.iter() {
|
||||
for (lsn, pos) in vec_map.as_slice() {
|
||||
let mut desc = String::new();
|
||||
cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
|
||||
let val = Value::des(&buf);
|
||||
match val {
|
||||
Ok(Value::Image(img)) => {
|
||||
write!(&mut desc, " img {} bytes", img.len())?;
|
||||
}
|
||||
Ok(Value::WalRecord(rec)) => {
|
||||
let wal_desc = walrecord::describe_wal_record(&rec).unwrap();
|
||||
write!(
|
||||
&mut desc,
|
||||
" rec {} bytes will_init: {} {}",
|
||||
buf.len(),
|
||||
rec.will_init(),
|
||||
wal_desc
|
||||
)?;
|
||||
}
|
||||
Err(err) => {
|
||||
write!(&mut desc, " DESERIALIZATION ERROR: {}", err)?;
|
||||
}
|
||||
}
|
||||
println!(" key {} at {}: {}", key, lsn, desc);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -311,7 +426,12 @@ impl InMemoryLayer {
|
||||
.build();
|
||||
|
||||
let inner = self.inner.read().await;
|
||||
let reader = inner.file.block_cursor();
|
||||
|
||||
struct ValueRead {
|
||||
entry_lsn: Lsn,
|
||||
read: vectored_dio_read::LogicalRead<Vec<u8>>,
|
||||
}
|
||||
let mut reads: HashMap<Key, Vec<ValueRead>> = HashMap::new();
|
||||
|
||||
for range in keyspace.ranges.iter() {
|
||||
for (key, vec_map) in inner
|
||||
@@ -326,24 +446,62 @@ impl InMemoryLayer {
|
||||
|
||||
let slice = vec_map.slice_range(lsn_range);
|
||||
|
||||
for (entry_lsn, pos) in slice.iter().rev() {
|
||||
// TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183
|
||||
let buf = reader.read_blob(*pos, &ctx).await;
|
||||
if let Err(e) = buf {
|
||||
reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e)));
|
||||
for (entry_lsn, index_entry) in slice.iter().rev() {
|
||||
let IndexEntryUnpacked {
|
||||
pos,
|
||||
len,
|
||||
will_init,
|
||||
} = index_entry.unpack();
|
||||
reads.entry(key).or_default().push(ValueRead {
|
||||
entry_lsn: *entry_lsn,
|
||||
read: vectored_dio_read::LogicalRead::new(
|
||||
pos,
|
||||
Vec::with_capacity(len as usize),
|
||||
),
|
||||
});
|
||||
if will_init {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let value = Value::des(&buf.unwrap());
|
||||
if let Err(e) = value {
|
||||
// Execute the reads.
|
||||
|
||||
let f = vectored_dio_read::execute(
|
||||
&inner.file,
|
||||
reads
|
||||
.iter()
|
||||
.flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)),
|
||||
&ctx,
|
||||
);
|
||||
send_future::SendFuture::send(f) // https://github.com/rust-lang/rust/issues/96865
|
||||
.await;
|
||||
|
||||
// Process results into the reconstruct state
|
||||
'next_key: for (key, value_reads) in reads {
|
||||
for ValueRead { entry_lsn, read } in value_reads {
|
||||
match read.into_result().expect("we run execute() above") {
|
||||
Err(e) => {
|
||||
reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e)));
|
||||
break;
|
||||
continue 'next_key;
|
||||
}
|
||||
Ok(value_buf) => {
|
||||
let value = Value::des(&value_buf);
|
||||
if let Err(e) = value {
|
||||
reconstruct_state
|
||||
.on_key_error(key, PageReconstructError::from(anyhow!(e)));
|
||||
continue 'next_key;
|
||||
}
|
||||
|
||||
let key_situation =
|
||||
reconstruct_state.update_key(&key, *entry_lsn, value.unwrap());
|
||||
if key_situation == ValueReconstructSituation::Complete {
|
||||
break;
|
||||
let key_situation =
|
||||
reconstruct_state.update_key(&key, entry_lsn, value.unwrap());
|
||||
if key_situation == ValueReconstructSituation::Complete {
|
||||
// TODO: metric to see if we fetched more values than necessary
|
||||
continue 'next_key;
|
||||
}
|
||||
|
||||
// process the next value in the next iteration of the loop
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -355,6 +513,68 @@ impl InMemoryLayer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Offset of a particular Value within a serialized batch.
|
||||
struct SerializedBatchOffset {
|
||||
key: CompactKey,
|
||||
lsn: Lsn,
|
||||
// TODO: separate type when we start serde-serializing this value, to avoid coupling
|
||||
// in-memory representation to serialization format.
|
||||
index_entry: IndexEntry,
|
||||
}
|
||||
|
||||
pub struct SerializedBatch {
|
||||
/// Blobs serialized in EphemeralFile's native format, ready for passing to [`EphemeralFile::write_raw`].
|
||||
pub(crate) raw: Vec<u8>,
|
||||
|
||||
/// Index of values in [`Self::raw`], using offsets relative to the start of the buffer.
|
||||
offsets: Vec<SerializedBatchOffset>,
|
||||
|
||||
/// The highest LSN of any value in the batch
|
||||
pub(crate) max_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl SerializedBatch {
|
||||
pub fn from_values(batch: Vec<(CompactKey, Lsn, usize, Value)>) -> anyhow::Result<Self> {
|
||||
// Pre-allocate a big flat buffer to write into. This should be large but not huge: it is soft-limited in practice by
|
||||
// [`crate::pgdatadir_mapping::DatadirModification::MAX_PENDING_BYTES`]
|
||||
let buffer_size = batch.iter().map(|i| i.2).sum::<usize>();
|
||||
let mut cursor = std::io::Cursor::new(Vec::<u8>::with_capacity(buffer_size));
|
||||
|
||||
let mut offsets: Vec<SerializedBatchOffset> = Vec::with_capacity(batch.len());
|
||||
let mut max_lsn: Lsn = Lsn(0);
|
||||
for (key, lsn, val_ser_size, val) in batch {
|
||||
let relative_off = cursor.position();
|
||||
|
||||
val.ser_into(&mut cursor)
|
||||
.expect("Writing into in-memory buffer is infallible");
|
||||
|
||||
offsets.push(SerializedBatchOffset {
|
||||
key,
|
||||
lsn,
|
||||
index_entry: IndexEntry::new(IndexEntryNewArgs {
|
||||
base_offset: 0,
|
||||
batch_offset: relative_off,
|
||||
len: val_ser_size,
|
||||
will_init: val.will_init(),
|
||||
})
|
||||
.context("higher-level code ensures that values are within supported ranges")?,
|
||||
});
|
||||
max_lsn = std::cmp::max(max_lsn, lsn);
|
||||
}
|
||||
|
||||
let buffer = cursor.into_inner();
|
||||
|
||||
// Assert that we didn't do any extra allocations while building buffer.
|
||||
debug_assert!(buffer.len() <= buffer_size);
|
||||
|
||||
Ok(Self {
|
||||
raw: buffer,
|
||||
offsets,
|
||||
max_lsn,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result {
|
||||
write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0)
|
||||
}
|
||||
@@ -415,53 +635,69 @@ impl InMemoryLayer {
|
||||
})
|
||||
}
|
||||
|
||||
// Write operations
|
||||
|
||||
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
|
||||
/// Adds the page version to the in-memory tree
|
||||
pub async fn put_value(
|
||||
/// Write path.
|
||||
///
|
||||
/// Errors are not retryable, the [`InMemoryLayer`] must be discarded, and not be read from.
|
||||
/// The reason why it's not retryable is that the [`EphemeralFile`] writes are not retryable.
|
||||
/// TODO: it can be made retryable if we aborted the process on EphemeralFile write errors.
|
||||
pub async fn put_batch(
|
||||
&self,
|
||||
key: CompactKey,
|
||||
lsn: Lsn,
|
||||
buf: &[u8],
|
||||
serialized_batch: SerializedBatch,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<()> {
|
||||
) -> anyhow::Result<()> {
|
||||
let mut inner = self.inner.write().await;
|
||||
self.assert_writable();
|
||||
self.put_value_locked(&mut inner, key, lsn, buf, ctx).await
|
||||
}
|
||||
|
||||
async fn put_value_locked(
|
||||
&self,
|
||||
locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>,
|
||||
key: CompactKey,
|
||||
lsn: Lsn,
|
||||
buf: &[u8],
|
||||
ctx: &RequestContext,
|
||||
) -> Result<()> {
|
||||
trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
|
||||
let base_offset = inner.file.len();
|
||||
|
||||
let off = {
|
||||
locked_inner
|
||||
.file
|
||||
.write_blob(
|
||||
buf,
|
||||
&RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::InMemoryLayer)
|
||||
.build(),
|
||||
)
|
||||
.await?
|
||||
};
|
||||
let SerializedBatch {
|
||||
raw,
|
||||
mut offsets,
|
||||
max_lsn: _,
|
||||
} = serialized_batch;
|
||||
|
||||
let vec_map = locked_inner.index.entry(key).or_default();
|
||||
let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
|
||||
if old.is_some() {
|
||||
// We already had an entry for this LSN. That's odd..
|
||||
warn!("Key {} at {} already exists", key, lsn);
|
||||
// Add the base_offset to the batch's index entries which are relative to the batch start.
|
||||
for offset in &mut offsets {
|
||||
let IndexEntryUnpacked {
|
||||
will_init,
|
||||
len,
|
||||
pos,
|
||||
} = offset.index_entry.unpack();
|
||||
offset.index_entry = IndexEntry::new(IndexEntryNewArgs {
|
||||
base_offset,
|
||||
batch_offset: pos,
|
||||
len: len.into_usize(),
|
||||
will_init,
|
||||
})?;
|
||||
}
|
||||
|
||||
let size = locked_inner.file.len();
|
||||
locked_inner.resource_units.maybe_publish_size(size);
|
||||
// Write the batch to the file
|
||||
inner.file.write_raw(&raw, ctx).await?;
|
||||
let new_size = inner.file.len();
|
||||
let expected_new_len = base_offset
|
||||
.checked_add(raw.len().into_u64())
|
||||
// write_raw would error if we were to overflow u64.
|
||||
// also IndexEntry and higher levels in
|
||||
//the code don't allow the file to grow that large
|
||||
.unwrap();
|
||||
assert_eq!(new_size, expected_new_len);
|
||||
|
||||
// Update the index with the new entries
|
||||
for SerializedBatchOffset {
|
||||
key,
|
||||
lsn,
|
||||
index_entry,
|
||||
} in offsets
|
||||
{
|
||||
let vec_map = inner.index.entry(key).or_default();
|
||||
let old = vec_map.append_or_update_last(lsn, index_entry).unwrap().0;
|
||||
if old.is_some() {
|
||||
// We already had an entry for this LSN. That's odd..
|
||||
warn!("Key {} at {} already exists", key, lsn);
|
||||
}
|
||||
}
|
||||
|
||||
inner.resource_units.maybe_publish_size(new_size);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -505,7 +741,7 @@ impl InMemoryLayer {
|
||||
{
|
||||
let inner = self.inner.write().await;
|
||||
for vec_map in inner.index.values() {
|
||||
for (lsn, _pos) in vec_map.as_slice() {
|
||||
for (lsn, _) in vec_map.as_slice() {
|
||||
assert!(*lsn < end_lsn);
|
||||
}
|
||||
}
|
||||
@@ -536,7 +772,6 @@ impl InMemoryLayer {
|
||||
|
||||
use l0_flush::Inner;
|
||||
let _concurrency_permit = match l0_flush_global_state {
|
||||
Inner::PageCached => None,
|
||||
Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await),
|
||||
};
|
||||
|
||||
@@ -568,66 +803,25 @@ impl InMemoryLayer {
|
||||
.await?;
|
||||
|
||||
match l0_flush_global_state {
|
||||
l0_flush::Inner::PageCached => {
|
||||
let ctx = RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::InMemoryLayer)
|
||||
.build();
|
||||
|
||||
let mut buf = Vec::new();
|
||||
|
||||
let cursor = inner.file.block_cursor();
|
||||
|
||||
for (key, vec_map) in inner.index.iter() {
|
||||
// Write all page versions
|
||||
for (lsn, pos) in vec_map.as_slice() {
|
||||
cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
|
||||
let will_init = Value::des(&buf)?.will_init();
|
||||
let (tmp, res) = delta_layer_writer
|
||||
.put_value_bytes(
|
||||
Key::from_compact(*key),
|
||||
*lsn,
|
||||
buf.slice_len(),
|
||||
will_init,
|
||||
&ctx,
|
||||
)
|
||||
.await;
|
||||
res?;
|
||||
buf = tmp.into_raw_slice().into_inner();
|
||||
}
|
||||
}
|
||||
}
|
||||
l0_flush::Inner::Direct { .. } => {
|
||||
let file_contents: Vec<u8> = inner.file.load_to_vec(ctx).await?;
|
||||
assert_eq!(
|
||||
file_contents.len() % PAGE_SZ,
|
||||
0,
|
||||
"needed by BlockReaderRef::Slice"
|
||||
);
|
||||
assert_eq!(file_contents.len(), {
|
||||
let written = usize::try_from(inner.file.len()).unwrap();
|
||||
if written % PAGE_SZ == 0 {
|
||||
written
|
||||
} else {
|
||||
written.checked_add(PAGE_SZ - (written % PAGE_SZ)).unwrap()
|
||||
}
|
||||
});
|
||||
|
||||
let cursor = BlockCursor::new(BlockReaderRef::Slice(&file_contents));
|
||||
|
||||
let mut buf = Vec::new();
|
||||
let file_contents = Bytes::from(file_contents);
|
||||
|
||||
for (key, vec_map) in inner.index.iter() {
|
||||
// Write all page versions
|
||||
for (lsn, pos) in vec_map.as_slice() {
|
||||
// TODO: once we have blob lengths in the in-memory index, we can
|
||||
// 1. get rid of the blob_io / BlockReaderRef::Slice business and
|
||||
// 2. load the file contents into a Bytes and
|
||||
// 3. the use `Bytes::slice` to get the `buf` that is our blob
|
||||
// 4. pass that `buf` into `put_value_bytes`
|
||||
// => https://github.com/neondatabase/neon/issues/8183
|
||||
cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
|
||||
let will_init = Value::des(&buf)?.will_init();
|
||||
let (tmp, res) = delta_layer_writer
|
||||
for (lsn, entry) in vec_map
|
||||
.as_slice()
|
||||
.iter()
|
||||
.map(|(lsn, entry)| (lsn, entry.unpack()))
|
||||
{
|
||||
let IndexEntryUnpacked {
|
||||
pos,
|
||||
len,
|
||||
will_init,
|
||||
} = entry;
|
||||
let buf = Bytes::slice(&file_contents, pos as usize..(pos + len) as usize);
|
||||
let (_buf, res) = delta_layer_writer
|
||||
.put_value_bytes(
|
||||
Key::from_compact(*key),
|
||||
*lsn,
|
||||
@@ -637,7 +831,6 @@ impl InMemoryLayer {
|
||||
)
|
||||
.await;
|
||||
res?;
|
||||
buf = tmp.into_raw_slice().into_inner();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -659,3 +852,134 @@ impl InMemoryLayer {
|
||||
Ok(Some((desc, path)))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_index_entry() {
|
||||
const MAX_SUPPORTED_POS: usize = IndexEntry::MAX_SUPPORTED_POS;
|
||||
use IndexEntryNewArgs as Args;
|
||||
use IndexEntryUnpacked as Unpacked;
|
||||
|
||||
let roundtrip = |args, expect: Unpacked| {
|
||||
let res = IndexEntry::new(args).expect("this tests expects no errors");
|
||||
let IndexEntryUnpacked {
|
||||
will_init,
|
||||
len,
|
||||
pos,
|
||||
} = res.unpack();
|
||||
assert_eq!(will_init, expect.will_init);
|
||||
assert_eq!(len, expect.len);
|
||||
assert_eq!(pos, expect.pos);
|
||||
};
|
||||
|
||||
// basic roundtrip
|
||||
for pos in [0, MAX_SUPPORTED_POS] {
|
||||
for len in [0, MAX_SUPPORTED_BLOB_LEN] {
|
||||
for will_init in [true, false] {
|
||||
let expect = Unpacked {
|
||||
will_init,
|
||||
len: len.into_u64(),
|
||||
pos: pos.into_u64(),
|
||||
};
|
||||
roundtrip(
|
||||
Args {
|
||||
will_init,
|
||||
base_offset: pos.into_u64(),
|
||||
batch_offset: 0,
|
||||
len,
|
||||
},
|
||||
expect,
|
||||
);
|
||||
roundtrip(
|
||||
Args {
|
||||
will_init,
|
||||
base_offset: 0,
|
||||
batch_offset: pos.into_u64(),
|
||||
len,
|
||||
},
|
||||
expect,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// too-large len
|
||||
let too_large = Args {
|
||||
will_init: false,
|
||||
len: MAX_SUPPORTED_BLOB_LEN + 1,
|
||||
base_offset: 0,
|
||||
batch_offset: 0,
|
||||
};
|
||||
assert!(IndexEntry::new(too_large).is_err());
|
||||
|
||||
// too-large pos
|
||||
{
|
||||
let too_large = Args {
|
||||
will_init: false,
|
||||
len: 0,
|
||||
base_offset: MAX_SUPPORTED_POS.into_u64() + 1,
|
||||
batch_offset: 0,
|
||||
};
|
||||
assert!(IndexEntry::new(too_large).is_err());
|
||||
let too_large = Args {
|
||||
will_init: false,
|
||||
len: 0,
|
||||
base_offset: 0,
|
||||
batch_offset: MAX_SUPPORTED_POS.into_u64() + 1,
|
||||
};
|
||||
assert!(IndexEntry::new(too_large).is_err());
|
||||
}
|
||||
|
||||
// too large (base_offset + batch_offset)
|
||||
{
|
||||
let too_large = Args {
|
||||
will_init: false,
|
||||
len: 0,
|
||||
base_offset: MAX_SUPPORTED_POS.into_u64(),
|
||||
batch_offset: 1,
|
||||
};
|
||||
assert!(IndexEntry::new(too_large).is_err());
|
||||
let too_large = Args {
|
||||
will_init: false,
|
||||
len: 0,
|
||||
base_offset: MAX_SUPPORTED_POS.into_u64() - 1,
|
||||
batch_offset: MAX_SUPPORTED_POS.into_u64() - 1,
|
||||
};
|
||||
assert!(IndexEntry::new(too_large).is_err());
|
||||
}
|
||||
|
||||
// valid special cases
|
||||
// - area past the max supported pos that is accessible by len
|
||||
for len in [1, MAX_SUPPORTED_BLOB_LEN] {
|
||||
roundtrip(
|
||||
Args {
|
||||
will_init: false,
|
||||
len,
|
||||
base_offset: MAX_SUPPORTED_POS.into_u64(),
|
||||
batch_offset: 0,
|
||||
},
|
||||
Unpacked {
|
||||
will_init: false,
|
||||
len: len as u64,
|
||||
pos: MAX_SUPPORTED_POS.into_u64(),
|
||||
},
|
||||
);
|
||||
roundtrip(
|
||||
Args {
|
||||
will_init: false,
|
||||
len,
|
||||
base_offset: 0,
|
||||
batch_offset: MAX_SUPPORTED_POS.into_u64(),
|
||||
},
|
||||
Unpacked {
|
||||
will_init: false,
|
||||
len: len as u64,
|
||||
pos: MAX_SUPPORTED_POS.into_u64(),
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user