mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-15 20:20:38 +00:00
Compare commits
9 Commits
alek/ololo
...
alek_targz
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4af6a4d5e8 | ||
|
|
b27fa34c00 | ||
|
|
ca22453627 | ||
|
|
0a00869615 | ||
|
|
87eead5220 | ||
|
|
3cf83014d4 | ||
|
|
353a735acb | ||
|
|
107ebd3d21 | ||
|
|
89c93457f3 |
@@ -12,11 +12,6 @@ opt-level = 3
|
||||
# Turn on a small amount of optimization in Development mode.
|
||||
opt-level = 1
|
||||
|
||||
[build]
|
||||
# This is only present for local builds, as it will be overridden
|
||||
# by the RUSTDOCFLAGS env var in CI.
|
||||
rustdocflags = ["-Arustdoc::private_intra_doc_links"]
|
||||
|
||||
[alias]
|
||||
build_testing = ["build", "--features", "testing"]
|
||||
neon = ["run", "--bin", "neon_local"]
|
||||
|
||||
@@ -21,5 +21,4 @@
|
||||
!workspace_hack/
|
||||
!neon_local/
|
||||
!scripts/ninstall.sh
|
||||
!scripts/combine_control_files.py
|
||||
!vm-cgconfig.conf
|
||||
|
||||
@@ -2,12 +2,6 @@ name: 'Create Allure report'
|
||||
description: 'Generate Allure report from uploaded by actions/allure-report-store tests results'
|
||||
|
||||
outputs:
|
||||
base-url:
|
||||
description: 'Base URL for Allure report'
|
||||
value: ${{ steps.generate-report.outputs.base-url }}
|
||||
base-s3-url:
|
||||
description: 'Base S3 URL for Allure report'
|
||||
value: ${{ steps.generate-report.outputs.base-s3-url }}
|
||||
report-url:
|
||||
description: 'Allure report URL'
|
||||
value: ${{ steps.generate-report.outputs.report-url }}
|
||||
@@ -69,8 +63,8 @@ runs:
|
||||
rm -f ${ALLURE_ZIP}
|
||||
fi
|
||||
env:
|
||||
ALLURE_VERSION: 2.23.1
|
||||
ALLURE_ZIP_SHA256: 11141bfe727504b3fd80c0f9801eb317407fd0ac983ebb57e671f14bac4bcd86
|
||||
ALLURE_VERSION: 2.22.1
|
||||
ALLURE_ZIP_SHA256: fdc7a62d94b14c5e0bf25198ae1feded6b005fdbed864b4d3cb4e5e901720b0b
|
||||
|
||||
# Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this
|
||||
- name: Acquire lock
|
||||
@@ -108,11 +102,6 @@ runs:
|
||||
REPORT_PREFIX=reports/${BRANCH_OR_PR}
|
||||
RAW_PREFIX=reports-raw/${BRANCH_OR_PR}/${GITHUB_RUN_ID}
|
||||
|
||||
BASE_URL=https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}
|
||||
BASE_S3_URL=s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}
|
||||
REPORT_URL=${BASE_URL}/index.html
|
||||
REPORT_JSON_URL=${BASE_URL}/data/suites.json
|
||||
|
||||
# Get previously uploaded data for this run
|
||||
ZSTD_NBTHREADS=0
|
||||
|
||||
@@ -121,9 +110,10 @@ runs:
|
||||
# There's no previously uploaded data for this $GITHUB_RUN_ID
|
||||
exit 0
|
||||
fi
|
||||
for S3_FILEPATH in ${S3_FILEPATHS}; do
|
||||
time aws s3 cp --only-show-errors "s3://${BUCKET}/${S3_FILEPATH}" "${WORKDIR}"
|
||||
|
||||
time aws s3 cp --recursive --only-show-errors "s3://${BUCKET}/${RAW_PREFIX}/" "${WORKDIR}/"
|
||||
for archive in $(find ${WORKDIR} -name "*.tar.zst"); do
|
||||
archive=${WORKDIR}/$(basename $S3_FILEPATH)
|
||||
mkdir -p ${archive%.tar.zst}
|
||||
time tar -xf ${archive} -C ${archive%.tar.zst}
|
||||
rm -f ${archive}
|
||||
@@ -139,9 +129,10 @@ runs:
|
||||
sed -i 's|<a href="." class=|<a href="https://'${BUCKET}'.s3.amazonaws.com/'${REPORT_PREFIX}'/latest/index.html?nocache='"'+Date.now()+'"'" class=|g' ${WORKDIR}/report/app.js
|
||||
|
||||
# Upload a history and the final report (in this particular order to not to have duplicated history in 2 places)
|
||||
# Use sync for the final report to delete files from previous runs
|
||||
time aws s3 mv --recursive --only-show-errors "${WORKDIR}/report/history" "s3://${BUCKET}/${REPORT_PREFIX}/latest/history"
|
||||
time aws s3 sync --delete --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"
|
||||
time aws s3 mv --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"
|
||||
|
||||
REPORT_URL=https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html
|
||||
|
||||
# Generate redirect
|
||||
cat <<EOF > ${WORKDIR}/index.html
|
||||
@@ -153,10 +144,8 @@ runs:
|
||||
EOF
|
||||
time aws s3 cp --only-show-errors ${WORKDIR}/index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html"
|
||||
|
||||
echo "base-url=${BASE_URL}" >> $GITHUB_OUTPUT
|
||||
echo "base-s3-url=${BASE_S3_URL}" >> $GITHUB_OUTPUT
|
||||
echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
|
||||
echo "report-json-url=${REPORT_JSON_URL}" >> $GITHUB_OUTPUT
|
||||
echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
|
||||
echo "report-json-url=${REPORT_URL%/index.html}/data/suites.json" >> $GITHUB_OUTPUT
|
||||
|
||||
echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY}
|
||||
|
||||
|
||||
2
.github/actions/download/action.yml
vendored
2
.github/actions/download/action.yml
vendored
@@ -31,7 +31,7 @@ runs:
|
||||
BUCKET=neon-github-public-dev
|
||||
FILENAME=$(basename $ARCHIVE)
|
||||
|
||||
S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX%$GITHUB_RUN_ATTEMPT} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
|
||||
S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX%$GITHUB_RUN_ATTEMPT} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
|
||||
if [ -z "${S3_KEY}" ]; then
|
||||
if [ "${SKIP_IF_DOES_NOT_EXIST}" = "true" ]; then
|
||||
echo 'SKIPPED=true' >> $GITHUB_OUTPUT
|
||||
|
||||
10
.github/actions/run-python-test-set/action.yml
vendored
10
.github/actions/run-python-test-set/action.yml
vendored
@@ -150,14 +150,6 @@ runs:
|
||||
EXTRA_PARAMS="--flaky-tests-json $TEST_OUTPUT/flaky.json $EXTRA_PARAMS"
|
||||
fi
|
||||
|
||||
# We use pytest-split plugin to run benchmarks in parallel on different CI runners
|
||||
if [ "${TEST_SELECTION}" = "test_runner/performance" ] && [ "${{ inputs.build_type }}" != "remote" ]; then
|
||||
mkdir -p $TEST_OUTPUT
|
||||
poetry run ./scripts/benchmark_durations.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/benchmark_durations.json"
|
||||
|
||||
EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS"
|
||||
fi
|
||||
|
||||
if [[ "${{ inputs.build_type }}" == "debug" ]]; then
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
|
||||
elif [[ "${{ inputs.build_type }}" == "release" ]]; then
|
||||
@@ -209,4 +201,4 @@ runs:
|
||||
uses: ./.github/actions/allure-report-store
|
||||
with:
|
||||
report-dir: /tmp/test_output/allure/results
|
||||
unique-key: ${{ inputs.build_type }}-${{ inputs.pg_version }}
|
||||
unique-key: ${{ inputs.build_type }}
|
||||
|
||||
55
.github/workflows/approved-for-ci-run.yml
vendored
55
.github/workflows/approved-for-ci-run.yml
vendored
@@ -1,55 +0,0 @@
|
||||
name: Handle `approved-for-ci-run` label
|
||||
# This workflow helps to run CI pipeline for PRs made by external contributors (from forks).
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
types:
|
||||
# Default types that triggers a workflow ([1]):
|
||||
# - [1] https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request
|
||||
- opened
|
||||
- synchronize
|
||||
- reopened
|
||||
# Types that we wand to handle in addition to keep labels tidy:
|
||||
- closed
|
||||
# Actual magic happens here:
|
||||
- labeled
|
||||
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
|
||||
jobs:
|
||||
remove-label:
|
||||
# Remove `approved-for-ci-run` label if the workflow is triggered by changes in a PR.
|
||||
# The PR should be reviewed and labelled manually again.
|
||||
|
||||
runs-on: [ ubuntu-latest ]
|
||||
|
||||
if: |
|
||||
contains(fromJSON('["opened", "synchronize", "reopened", "closed"]'), github.event.action) &&
|
||||
contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
|
||||
|
||||
steps:
|
||||
- run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
|
||||
|
||||
create-branch:
|
||||
# Create a local branch for an `approved-for-ci-run` labelled PR to run CI pipeline in it.
|
||||
|
||||
runs-on: [ ubuntu-latest ]
|
||||
|
||||
if: |
|
||||
github.event.action == 'labeled' &&
|
||||
contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
|
||||
|
||||
steps:
|
||||
- run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
|
||||
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
ref: main
|
||||
|
||||
- run: gh pr checkout "${PR_NUMBER}"
|
||||
|
||||
- run: git checkout -b "ci-run/pr-${PR_NUMBER}"
|
||||
|
||||
- run: git push --force origin "ci-run/pr-${PR_NUMBER}"
|
||||
80
.github/workflows/build_and_test.yml
vendored
80
.github/workflows/build_and_test.yml
vendored
@@ -5,7 +5,6 @@ on:
|
||||
branches:
|
||||
- main
|
||||
- release
|
||||
- ci-run/pr-*
|
||||
pull_request:
|
||||
|
||||
defaults:
|
||||
@@ -128,11 +127,6 @@ jobs:
|
||||
- name: Run cargo clippy (release)
|
||||
run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
|
||||
|
||||
- name: Check documentation generation
|
||||
run: cargo doc --workspace --no-deps --document-private-items
|
||||
env:
|
||||
RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
|
||||
|
||||
# Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
|
||||
- name: Check formatting
|
||||
if: ${{ !cancelled() }}
|
||||
@@ -396,11 +390,13 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
pytest_split_group: [ 1, 2, 3, 4 ]
|
||||
build_type: [ release ]
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Pytest benchmarks
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
@@ -409,11 +405,9 @@ jobs:
|
||||
test_selection: performance
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ github.ref_name == 'main' }}
|
||||
extra_params: --splits ${{ strategy.job-total }} --group ${{ matrix.pytest_split_group }}
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}"
|
||||
# XXX: no coverage data handling here, since benchmarks are run on release builds,
|
||||
# while coverage is currently collected for the debug ones
|
||||
|
||||
@@ -471,26 +465,6 @@ jobs:
|
||||
--build-type ${BUILD_TYPE} \
|
||||
--ingest suites.json
|
||||
|
||||
- name: Store Allure test stat in the DB (new)
|
||||
if: ${{ !cancelled() && steps.create-allure-report.outputs.report-json-url }}
|
||||
env:
|
||||
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
REPORT_JSON_URL: ${{ steps.create-allure-report.outputs.report-json-url }}
|
||||
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
||||
BASE_S3_URL: ${{ steps.create-allure-report.outputs.base-s3-url }}
|
||||
run: |
|
||||
aws s3 cp --only-show-errors --recursive ${BASE_S3_URL}/data/test-cases ./test-cases
|
||||
|
||||
./scripts/pysync
|
||||
|
||||
export DATABASE_URL="$TEST_RESULT_CONNSTR"
|
||||
poetry run python3 scripts/ingest_regress_test_result-new-format.py \
|
||||
--reference ${GITHUB_REF} \
|
||||
--revision ${COMMIT_SHA} \
|
||||
--run-id ${GITHUB_RUN_ID} \
|
||||
--run-attempt ${GITHUB_RUN_ATTEMPT} \
|
||||
--test-cases-dir ./test-cases
|
||||
|
||||
coverage-report:
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
@@ -814,7 +788,7 @@ jobs:
|
||||
run:
|
||||
shell: sh -eu {0}
|
||||
env:
|
||||
VM_BUILDER_VERSION: v0.15.0-alpha1
|
||||
VM_BUILDER_VERSION: v0.12.1
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -975,15 +949,22 @@ jobs:
|
||||
version: [ v14, v15 ]
|
||||
|
||||
env:
|
||||
EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
|
||||
# While on transition period we extract public extensions from compute-node image and custom extensions from extensions image.
|
||||
# Later all the extensions will be moved to extensions image.
|
||||
EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:latest
|
||||
COMPUTE_NODE_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:latest
|
||||
AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }}
|
||||
S3_BUCKETS: ${{ github.ref_name == 'release' && vars.S3_EXTENSIONS_BUCKETS_PROD || vars.S3_EXTENSIONS_BUCKETS_DEV }}
|
||||
S3_BUCKETS: |
|
||||
${{ github.ref_name == 'release' &&
|
||||
'neon-prod-extensions-ap-southeast-1 neon-prod-extensions-eu-central-1 neon-prod-extensions-us-east-1 neon-prod-extensions-us-east-2 neon-prod-extensions-us-west-2' ||
|
||||
'neon-dev-extensions-eu-central-1 neon-dev-extensions-eu-west-1 neon-dev-extensions-us-east-2' }}
|
||||
|
||||
steps:
|
||||
- name: Pull postgres-extensions image
|
||||
run: |
|
||||
docker pull ${EXTENSIONS_IMAGE}
|
||||
docker pull ${COMPUTE_NODE_IMAGE}
|
||||
|
||||
- name: Create postgres-extensions container
|
||||
id: create-container
|
||||
@@ -991,23 +972,44 @@ jobs:
|
||||
EID=$(docker create ${EXTENSIONS_IMAGE} true)
|
||||
echo "EID=${EID}" >> $GITHUB_OUTPUT
|
||||
|
||||
CID=$(docker create ${COMPUTE_NODE_IMAGE} true)
|
||||
echo "CID=${CID}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Extract postgres-extensions from container
|
||||
run: |
|
||||
rm -rf ./extensions-to-upload # Just in case
|
||||
mkdir -p extensions-to-upload
|
||||
rm -rf ./extensions-to-upload ./custom-extensions # Just in case
|
||||
|
||||
docker cp ${{ steps.create-container.outputs.EID }}:/extensions/ ./extensions-to-upload/
|
||||
docker cp ${{ steps.create-container.outputs.EID }}:/ext_index.json ./extensions-to-upload/
|
||||
# In compute image we have a bit different directory layout
|
||||
mkdir -p extensions-to-upload/share
|
||||
docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/share/extension ./extensions-to-upload/share/extension
|
||||
docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/lib ./extensions-to-upload/lib
|
||||
|
||||
# Delete Neon extensitons (they always present on compute-node image)
|
||||
rm -rf ./extensions-to-upload/share/extension/neon*
|
||||
rm -rf ./extensions-to-upload/lib/neon*
|
||||
|
||||
# Delete leftovers from the extension build step
|
||||
rm -rf ./extensions-to-upload/lib/pgxs
|
||||
rm -rf ./extensions-to-upload/lib/pkgconfig
|
||||
|
||||
docker cp ${{ steps.create-container.outputs.EID }}:/extensions ./custom-extensions
|
||||
for EXT_NAME in $(ls ./custom-extensions); do
|
||||
mkdir -p ./extensions-to-upload/${EXT_NAME}/share
|
||||
|
||||
mv ./custom-extensions/${EXT_NAME}/share/extension ./extensions-to-upload/${EXT_NAME}/share/extension
|
||||
mv ./custom-extensions/${EXT_NAME}/lib ./extensions-to-upload/${EXT_NAME}/lib
|
||||
done
|
||||
|
||||
- name: Upload postgres-extensions to S3
|
||||
run: |
|
||||
for BUCKET in $(echo ${S3_BUCKETS:-[]} | jq --raw-output '.[]'); do
|
||||
for BUCKET in $(echo ${S3_BUCKETS}); do
|
||||
aws s3 cp --recursive --only-show-errors ./extensions-to-upload s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }}
|
||||
done
|
||||
|
||||
- name: Cleanup
|
||||
if: ${{ always() && steps.create-container.outputs.EID }}
|
||||
if: ${{ always() && (steps.create-container.outputs.CID || steps.create-container.outputs.EID) }}
|
||||
run: |
|
||||
docker rm ${{ steps.create-container.outputs.CID }} || true
|
||||
docker rm ${{ steps.create-container.outputs.EID }} || true
|
||||
|
||||
deploy:
|
||||
@@ -1087,7 +1089,7 @@ jobs:
|
||||
OLD_PREFIX=artifacts/${GITHUB_RUN_ID}
|
||||
FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst
|
||||
|
||||
S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
|
||||
S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
|
||||
if [ -z "${S3_KEY}" ]; then
|
||||
echo >&2 "Neither s3://${BUCKET}/${OLD_PREFIX}/${FILENAME} nor its version from previous attempts exist"
|
||||
exit 1
|
||||
|
||||
3
.github/workflows/neon_extra_builds.yml
vendored
3
.github/workflows/neon_extra_builds.yml
vendored
@@ -3,8 +3,7 @@ name: Check neon with extra platform builds
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
- ci-run/pr-*
|
||||
- main
|
||||
pull_request:
|
||||
|
||||
defaults:
|
||||
|
||||
139
Cargo.lock
generated
139
Cargo.lock
generated
@@ -740,9 +740,6 @@ name = "cc"
|
||||
version = "1.0.79"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
|
||||
dependencies = [
|
||||
"jobserver",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cexpr"
|
||||
@@ -925,7 +922,6 @@ dependencies = [
|
||||
"url",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
"zstd",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1979,15 +1975,6 @@ version = "1.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6"
|
||||
|
||||
[[package]]
|
||||
name = "jobserver"
|
||||
version = "0.1.26"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "js-sys"
|
||||
version = "0.3.63"
|
||||
@@ -2395,9 +2382,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry"
|
||||
version = "0.19.0"
|
||||
version = "0.18.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5f4b8347cc26099d3aeee044065ecc3ae11469796b4d65d065a23a584ed92a6f"
|
||||
checksum = "69d6c3d7288a106c0a363e4b0e8d308058d56902adefb16f4936f417ffef086e"
|
||||
dependencies = [
|
||||
"opentelemetry_api",
|
||||
"opentelemetry_sdk",
|
||||
@@ -2405,9 +2392,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry-http"
|
||||
version = "0.8.0"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a819b71d6530c4297b49b3cae2939ab3a8cc1b9f382826a1bc29dd0ca3864906"
|
||||
checksum = "1edc79add46364183ece1a4542592ca593e6421c60807232f5b8f7a31703825d"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"bytes",
|
||||
@@ -2418,9 +2405,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry-otlp"
|
||||
version = "0.12.0"
|
||||
version = "0.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8af72d59a4484654ea8eb183fea5ae4eb6a41d7ac3e3bae5f4d2a282a3a7d3ca"
|
||||
checksum = "d1c928609d087790fc936a1067bdc310ae702bdf3b090c3f281b713622c8bbde"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"futures",
|
||||
@@ -2436,47 +2423,48 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry-proto"
|
||||
version = "0.2.0"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "045f8eea8c0fa19f7d48e7bc3128a39c2e5c533d5c61298c548dfefc1064474c"
|
||||
checksum = "d61a2f56df5574508dd86aaca016c917489e589ece4141df1b5e349af8d66c28"
|
||||
dependencies = [
|
||||
"futures",
|
||||
"futures-util",
|
||||
"opentelemetry",
|
||||
"prost",
|
||||
"tonic 0.8.3",
|
||||
"tonic-build 0.8.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry-semantic-conventions"
|
||||
version = "0.11.0"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "24e33428e6bf08c6f7fcea4ddb8e358fab0fe48ab877a87c70c6ebe20f673ce5"
|
||||
checksum = "9b02e0230abb0ab6636d18e2ba8fa02903ea63772281340ccac18e0af3ec9eeb"
|
||||
dependencies = [
|
||||
"opentelemetry",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry_api"
|
||||
version = "0.19.0"
|
||||
version = "0.18.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ed41783a5bf567688eb38372f2b7a8530f5a607a4b49d38dd7573236c23ca7e2"
|
||||
checksum = "c24f96e21e7acc813c7a8394ee94978929db2bcc46cf6b5014fc612bf7760c22"
|
||||
dependencies = [
|
||||
"fnv",
|
||||
"futures-channel",
|
||||
"futures-util",
|
||||
"indexmap",
|
||||
"js-sys",
|
||||
"once_cell",
|
||||
"pin-project-lite",
|
||||
"thiserror",
|
||||
"urlencoding",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry_sdk"
|
||||
version = "0.19.0"
|
||||
version = "0.18.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8b3a2a91fdbfdd4d212c0dcc2ab540de2c2bcbbd90be17de7a7daf8822d010c1"
|
||||
checksum = "1ca41c4933371b61c2a2f214bf16931499af4ec90543604ec828f7a625c09113"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"crossbeam-channel",
|
||||
@@ -2522,7 +2510,6 @@ dependencies = [
|
||||
"pageserver",
|
||||
"postgres_ffi",
|
||||
"svg_fmt",
|
||||
"tokio",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
@@ -2561,7 +2548,6 @@ dependencies = [
|
||||
"metrics",
|
||||
"nix",
|
||||
"num-traits",
|
||||
"num_cpus",
|
||||
"once_cell",
|
||||
"pageserver_api",
|
||||
"pin-project-lite",
|
||||
@@ -2798,7 +2784,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres"
|
||||
version = "0.19.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -2811,7 +2797,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-native-tls"
|
||||
version = "0.5.0"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
|
||||
dependencies = [
|
||||
"native-tls",
|
||||
"tokio",
|
||||
@@ -2822,7 +2808,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-protocol"
|
||||
version = "0.6.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
|
||||
dependencies = [
|
||||
"base64 0.20.0",
|
||||
"byteorder",
|
||||
@@ -2840,7 +2826,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-types"
|
||||
version = "0.2.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -2954,9 +2940,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.64"
|
||||
version = "1.0.58"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "78803b62cbf1f46fde80d7c0e803111524b9877184cfe7c3033659490ac7a7da"
|
||||
checksum = "fa1fb82fc0c281dd9671101b66b771ebbe1eaf967b96ac8740dcba4b70005ca8"
|
||||
dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
@@ -3253,7 +3239,6 @@ dependencies = [
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"pin-project-lite",
|
||||
"scopeguard",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tempfile",
|
||||
@@ -3346,9 +3331,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "reqwest-tracing"
|
||||
version = "0.4.5"
|
||||
version = "0.4.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1b97ad83c2fc18113346b7158d79732242002427c30f620fa817c1f32901e0a8"
|
||||
checksum = "783e8130d2427ddd7897dd3f814d4a3aea31b05deb42a4fdf8c18258fe5aefd1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
@@ -3873,8 +3858,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "sharded-slab"
|
||||
version = "0.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "900fba806f70c630b0a382d0d825e17a0f19fcd059a2ade1ff237bcddf446b31"
|
||||
source = "git+https://github.com/neondatabase/sharded-slab.git?rev=98d16753ab01c61f0a028de44167307a00efea00#98d16753ab01c61f0a028de44167307a00efea00"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
]
|
||||
@@ -4017,7 +4001,7 @@ dependencies = [
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tonic 0.9.2",
|
||||
"tonic-build",
|
||||
"tonic-build 0.9.2",
|
||||
"tracing",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
@@ -4118,7 +4102,7 @@ checksum = "4b55807c0344e1e6c04d7c965f5289c39a8d94ae23ed5c0b57aabac549f871c6"
|
||||
dependencies = [
|
||||
"filetime",
|
||||
"libc",
|
||||
"xattr 0.2.3",
|
||||
"xattr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4331,7 +4315,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "tokio-postgres"
|
||||
version = "0.7.7"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
@@ -4399,17 +4383,16 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tokio-tar"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9d5714c010ca3e5c27114c1cdeb9d14641ace49874aa5626d7149e47aedace75"
|
||||
version = "0.3.0"
|
||||
source = "git+https://github.com/neondatabase/tokio-tar.git?rev=404df61437de0feef49ba2ccdbdd94eb8ad6e142#404df61437de0feef49ba2ccdbdd94eb8ad6e142"
|
||||
dependencies = [
|
||||
"filetime",
|
||||
"futures-core",
|
||||
"libc",
|
||||
"redox_syscall 0.3.5",
|
||||
"redox_syscall 0.2.16",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"xattr 1.0.0",
|
||||
"xattr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4536,6 +4519,19 @@ dependencies = [
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tonic-build"
|
||||
version = "0.8.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5bf5e9b9c0f7e0a7c027dcfaba7b2c60816c7049171f679d99ee2ff65d0de8c4"
|
||||
dependencies = [
|
||||
"prettyplease 0.1.25",
|
||||
"proc-macro2",
|
||||
"prost-build",
|
||||
"quote",
|
||||
"syn 1.0.109",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tonic-build"
|
||||
version = "0.9.2"
|
||||
@@ -4659,9 +4655,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tracing-opentelemetry"
|
||||
version = "0.19.0"
|
||||
version = "0.18.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "00a39dcf9bfc1742fa4d6215253b33a6e474be78275884c216fc2a06267b3600"
|
||||
checksum = "21ebb87a95ea13271332df069020513ab70bdb5637ca42d6e492dc3bbbad48de"
|
||||
dependencies = [
|
||||
"once_cell",
|
||||
"opentelemetry",
|
||||
@@ -4886,7 +4882,6 @@ dependencies = [
|
||||
"tempfile",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tracing",
|
||||
"tracing-error",
|
||||
"tracing-subscriber",
|
||||
@@ -5313,7 +5308,6 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bytes",
|
||||
"cc",
|
||||
"chrono",
|
||||
"clap",
|
||||
"clap_builder",
|
||||
@@ -5385,15 +5379,6 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "xattr"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ea263437ca03c1522846a4ddafbca2542d0ad5ed9b784909d4b27b76f62bc34a"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "xmlparser"
|
||||
version = "0.13.5"
|
||||
@@ -5414,33 +5399,3 @@ name = "zeroize"
|
||||
version = "1.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9"
|
||||
|
||||
[[package]]
|
||||
name = "zstd"
|
||||
version = "0.12.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c"
|
||||
dependencies = [
|
||||
"zstd-safe",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zstd-safe"
|
||||
version = "6.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"zstd-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zstd-sys"
|
||||
version = "2.0.8+zstd.1.5.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
"pkg-config",
|
||||
]
|
||||
|
||||
29
Cargo.toml
29
Cargo.toml
@@ -84,9 +84,9 @@ notify = "5.0.0"
|
||||
num_cpus = "1.15"
|
||||
num-traits = "0.2.15"
|
||||
once_cell = "1.13"
|
||||
opentelemetry = "0.19.0"
|
||||
opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
||||
opentelemetry-semantic-conventions = "0.11.0"
|
||||
opentelemetry = "0.18.0"
|
||||
opentelemetry-otlp = { version = "0.11.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
||||
opentelemetry-semantic-conventions = "0.10.0"
|
||||
parking_lot = "0.12"
|
||||
pbkdf2 = "0.12.1"
|
||||
pin-project-lite = "0.2"
|
||||
@@ -95,7 +95,7 @@ prost = "0.11"
|
||||
rand = "0.8"
|
||||
regex = "1.4"
|
||||
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
|
||||
reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] }
|
||||
reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_18"] }
|
||||
reqwest-middleware = "0.2.0"
|
||||
reqwest-retry = "0.2.2"
|
||||
routerify = "3"
|
||||
@@ -124,14 +124,13 @@ tokio-io-timeout = "1.2.0"
|
||||
tokio-postgres-rustls = "0.9.0"
|
||||
tokio-rustls = "0.23"
|
||||
tokio-stream = "0.1"
|
||||
tokio-tar = "0.3"
|
||||
tokio-util = { version = "0.7", features = ["io"] }
|
||||
toml = "0.7"
|
||||
toml_edit = "0.19"
|
||||
tonic = {version = "0.9", features = ["tls", "tls-roots"]}
|
||||
tracing = "0.1"
|
||||
tracing-error = "0.2.0"
|
||||
tracing-opentelemetry = "0.19.0"
|
||||
tracing-opentelemetry = "0.18.0"
|
||||
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter"] }
|
||||
url = "2.2"
|
||||
uuid = { version = "1.2", features = ["v4", "serde"] }
|
||||
@@ -144,11 +143,12 @@ env_logger = "0.10"
|
||||
log = "0.4"
|
||||
|
||||
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
|
||||
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
|
||||
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
|
||||
tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" }
|
||||
|
||||
## Other git libraries
|
||||
heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
|
||||
@@ -183,7 +183,12 @@ tonic-build = "0.9"
|
||||
|
||||
# This is only needed for proxy's tests.
|
||||
# TODO: we should probably fork `tokio-postgres-rustls` instead.
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
|
||||
|
||||
# Changes the MAX_THREADS limit from 4096 to 32768.
|
||||
# This is a temporary workaround for using tracing from many threads in safekeepers code,
|
||||
# until async safekeepers patch is merged to the main.
|
||||
sharded-slab = { git = "https://github.com/neondatabase/sharded-slab.git", rev="98d16753ab01c61f0a028de44167307a00efea00" }
|
||||
|
||||
################# Binary contents sections
|
||||
|
||||
|
||||
@@ -13,7 +13,7 @@ FROM debian:bullseye-slim AS build-deps
|
||||
RUN apt update && \
|
||||
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
|
||||
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \
|
||||
libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd
|
||||
libicu-dev libxslt1-dev liblz4-dev libzstd-dev
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -77,7 +77,6 @@ ENV PATH "/usr/local/pgsql/bin:$PATH"
|
||||
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postgis.tar.gz && \
|
||||
echo "9a2a219da005a1730a39d1959a1c7cec619b1efb009b65be80ffc25bad299068 postgis.tar.gz" | sha256sum --check && \
|
||||
mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
|
||||
./autogen.sh && \
|
||||
./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
@@ -90,28 +89,17 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postg
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control && \
|
||||
mkdir -p /extensions/postgis && \
|
||||
cp /usr/local/pgsql/share/extension/postgis.control /extensions/postgis && \
|
||||
cp /usr/local/pgsql/share/extension/postgis_raster.control /extensions/postgis && \
|
||||
cp /usr/local/pgsql/share/extension/postgis_sfcgal.control /extensions/postgis && \
|
||||
cp /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control /extensions/postgis && \
|
||||
cp /usr/local/pgsql/share/extension/postgis_topology.control /extensions/postgis && \
|
||||
cp /usr/local/pgsql/share/extension/address_standardizer.control /extensions/postgis && \
|
||||
cp /usr/local/pgsql/share/extension/address_standardizer_data_us.control /extensions/postgis
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control
|
||||
|
||||
RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \
|
||||
echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \
|
||||
mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \
|
||||
mkdir build && cd build && \
|
||||
mkdir build && \
|
||||
cd build && \
|
||||
cmake -DCMAKE_BUILD_TYPE=Release .. && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
|
||||
cp /usr/local/pgsql/share/extension/pgrouting.control /extensions/postgis && \
|
||||
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
|
||||
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/postgis.tar.zst -T -
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -431,16 +419,12 @@ RUN apt-get update && \
|
||||
wget https://github.com/ketteq-neon/postgres-exts/archive/e0bd1a9d9313d7120c1b9c7bb15c48c0dede4c4e.tar.gz -O kq_imcx.tar.gz && \
|
||||
echo "dc93a97ff32d152d32737ba7e196d9687041cda15e58ab31344c2f2de8855336 kq_imcx.tar.gz" | sha256sum --check && \
|
||||
mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
|
||||
mkdir build && cd build && \
|
||||
mkdir build && \
|
||||
cd build && \
|
||||
cmake -DCMAKE_BUILD_TYPE=Release .. && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
|
||||
mkdir -p /extensions/kq_imcx && cp /usr/local/pgsql/share/extension/kq_imcx.control /extensions/kq_imcx && \
|
||||
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
|
||||
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/kq_imcx.tar.zst -T -
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -551,8 +535,10 @@ FROM build-deps AS pg-embedding-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.5.tar.gz -O pg_embedding.tar.gz && \
|
||||
echo "0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 pg_embedding.tar.gz" | sha256sum --check && \
|
||||
# 2465f831ea1f8d49c1d74f8959adb7fc277d70cd made on 05/07/2023
|
||||
# There is no release tag yet
|
||||
RUN wget https://github.com/neondatabase/pg_embedding/archive/2465f831ea1f8d49c1d74f8959adb7fc277d70cd.tar.gz -O pg_embedding.tar.gz && \
|
||||
echo "047af2b1f664a1e6e37867bd4eeaf5934fa27d6ba3d6c4461efa388ddf7cd1d5 pg_embedding.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
@@ -567,17 +553,16 @@ RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.5.ta
|
||||
FROM build-deps AS pg-anon-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# Kaniko doesn't allow to do `${from#/usr/local/pgsql/}`, so we use `${from:17}` instead
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgresql_anonymizer-1.1.0.tar.gz -O pg_anon.tar.gz && \
|
||||
echo "08b09d2ff9b962f96c60db7e6f8e79cf7253eb8772516998fc35ece08633d3ad pg_anon.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
|
||||
find /usr/local/pgsql -type f | sort > /before.txt && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
|
||||
mkdir -p /extensions/anon && cp /usr/local/pgsql/share/extension/anon.control /extensions/anon && \
|
||||
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
|
||||
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/anon.tar.zst -T -
|
||||
find /usr/local/pgsql -type f | sort > /after.txt && \
|
||||
/bin/bash -c 'for from in $(comm -13 /before.txt /after.txt); do to=/extensions/anon/${from:17} && mkdir -p $(dirname ${to}) && cp -a ${from} ${to}; done'
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -769,23 +754,16 @@ RUN rm /usr/local/pgsql/lib/lib*.a
|
||||
# Extenstion only
|
||||
#
|
||||
#########################################################################################
|
||||
FROM python:3.9-slim-bullseye AS generate-ext-index
|
||||
ARG PG_VERSION
|
||||
ARG BUILD_TAG
|
||||
RUN apt update && apt install -y zstd
|
||||
|
||||
# copy the control files here
|
||||
COPY --from=kq-imcx-pg-build /extensions/ /extensions/
|
||||
COPY --from=pg-anon-pg-build /extensions/ /extensions/
|
||||
COPY --from=postgis-build /extensions/ /extensions/
|
||||
COPY scripts/combine_control_files.py ./combine_control_files.py
|
||||
RUN python3 ./combine_control_files.py ${PG_VERSION} ${BUILD_TAG} --public_extensions="anon,postgis"
|
||||
|
||||
FROM scratch AS postgres-extensions
|
||||
# After the transition this layer will include all extensitons.
|
||||
# As for now, it's only a couple for testing purposses
|
||||
COPY --from=generate-ext-index /extensions/*.tar.zst /extensions/
|
||||
COPY --from=generate-ext-index /ext_index.json /ext_index.json
|
||||
# As for now, it's only for new custom ones
|
||||
#
|
||||
# # Default extensions
|
||||
# COPY --from=postgres-cleanup-layer /usr/local/pgsql/share/extension /usr/local/pgsql/share/extension
|
||||
# COPY --from=postgres-cleanup-layer /usr/local/pgsql/lib /usr/local/pgsql/lib
|
||||
# Custom extensions
|
||||
COPY --from=pg-anon-pg-build /extensions/anon/lib/ /extensions/anon/lib
|
||||
COPY --from=pg-anon-pg-build /extensions/anon/share/extension /extensions/anon/share/extension
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -816,7 +794,6 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
|
||||
# libxml2, libxslt1.1 for xml2
|
||||
# libzstd1 for zstd
|
||||
# libboost*, libfreetype6, and zlib1g for rdkit
|
||||
# ca-certificates for communicating with s3 by compute_ctl
|
||||
RUN apt update && \
|
||||
apt install --no-install-recommends -y \
|
||||
gdb \
|
||||
@@ -840,8 +817,7 @@ RUN apt update && \
|
||||
libcurl4-openssl-dev \
|
||||
locales \
|
||||
procps \
|
||||
zlib1g \
|
||||
ca-certificates && \
|
||||
zlib1g && \
|
||||
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
|
||||
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
|
||||
|
||||
|
||||
2
Makefile
2
Makefile
@@ -108,8 +108,6 @@ postgres-%: postgres-configure-% \
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache install
|
||||
+@echo "Compiling pageinspect $*"
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install
|
||||
+@echo "Compiling amcheck $*"
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install
|
||||
|
||||
.PHONY: postgres-clean-%
|
||||
postgres-clean-%:
|
||||
|
||||
11
README.md
11
README.md
@@ -29,13 +29,13 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati
|
||||
```bash
|
||||
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
|
||||
libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \
|
||||
libcurl4-openssl-dev openssl python-poetry
|
||||
libcurl4-openssl-dev
|
||||
```
|
||||
* On Fedora, these packages are needed:
|
||||
```bash
|
||||
dnf install flex bison readline-devel zlib-devel openssl-devel \
|
||||
libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
|
||||
protobuf-devel libcurl-devel openssl poetry
|
||||
protobuf-devel libcurl-devel
|
||||
```
|
||||
* On Arch based systems, these packages are needed:
|
||||
```bash
|
||||
@@ -235,13 +235,6 @@ CARGO_BUILD_FLAGS="--features=testing" make
|
||||
./scripts/pytest
|
||||
```
|
||||
|
||||
By default, this runs both debug and release modes, and all supported postgres versions. When
|
||||
testing locally, it is convenient to run just run one set of permutations, like this:
|
||||
|
||||
```sh
|
||||
DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest
|
||||
```
|
||||
|
||||
## Documentation
|
||||
|
||||
[docs](/docs) Contains a top-level overview of all available markdown documentation.
|
||||
|
||||
@@ -34,4 +34,3 @@ utils.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
toml_edit.workspace = true
|
||||
remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
|
||||
zstd = "0.12.4"
|
||||
|
||||
@@ -30,7 +30,7 @@
|
||||
//! -C 'postgresql://cloud_admin@localhost/postgres' \
|
||||
//! -S /var/db/postgres/specs/current.json \
|
||||
//! -b /usr/local/bin/postgres \
|
||||
//! -r {"bucket": "neon-dev-extensions-eu-central-1", "region": "eu-central-1"}
|
||||
//! -r {"bucket": "my-bucket", "region": "eu-central-1", "endpoint": "http:://localhost:9000"} \
|
||||
//! ```
|
||||
//!
|
||||
use std::collections::HashMap;
|
||||
@@ -38,7 +38,7 @@ use std::fs::File;
|
||||
use std::panic;
|
||||
use std::path::Path;
|
||||
use std::process::exit;
|
||||
use std::sync::{mpsc, Arc, Condvar, Mutex, OnceLock, RwLock};
|
||||
use std::sync::{mpsc, Arc, Condvar, Mutex, OnceLock};
|
||||
use std::{thread, time::Duration};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
@@ -51,6 +51,7 @@ use compute_api::responses::ComputeStatus;
|
||||
|
||||
use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
|
||||
use compute_tools::configurator::launch_configurator;
|
||||
use compute_tools::extension_server::launch_download_extensions;
|
||||
use compute_tools::extension_server::{get_pg_version, init_remote_storage};
|
||||
use compute_tools::http::api::launch_http_server;
|
||||
use compute_tools::logger::*;
|
||||
@@ -58,26 +59,43 @@ use compute_tools::monitor::launch_monitor;
|
||||
use compute_tools::params::*;
|
||||
use compute_tools::spec::*;
|
||||
|
||||
// this is an arbitrary build tag. Fine as a default / for testing purposes
|
||||
// in-case of not-set environment var
|
||||
const BUILD_TAG_DEFAULT: &str = "5670669815";
|
||||
const BUILD_TAG_DEFAULT: &str = "local";
|
||||
const DEFAULT_REMOTE_EXT_CONFIG: &str = r#"{"bucket": "neon-dev-extensions", "region": "eu-central-1", "endpoint": null, "prefix": "5555"}"#;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
|
||||
|
||||
let build_tag = option_env!("BUILD_TAG")
|
||||
.unwrap_or(BUILD_TAG_DEFAULT)
|
||||
.to_string();
|
||||
let build_tag = option_env!("BUILD_TAG").unwrap_or(BUILD_TAG_DEFAULT);
|
||||
info!("build_tag: {build_tag}");
|
||||
|
||||
let matches = cli().get_matches();
|
||||
let pgbin_default = String::from("postgres");
|
||||
let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);
|
||||
|
||||
let remote_ext_config = matches.get_one::<String>("remote-ext-config");
|
||||
let remote_ext_config = matches
|
||||
.get_one::<String>("remote-ext-config")
|
||||
.map(|x| x.to_string());
|
||||
// let remote_ext_config =
|
||||
// Some(remote_ext_config.unwrap_or(DEFAULT_REMOTE_EXT_CONFIG.to_string()));
|
||||
|
||||
let ext_remote_storage = remote_ext_config.map(|x| {
|
||||
init_remote_storage(x).expect("cannot initialize remote extension storage from config")
|
||||
init_remote_storage(&x, build_tag)
|
||||
.expect("cannot initialize remote extension storage from config")
|
||||
});
|
||||
// creds used to connect to remote extensions bucket
|
||||
// let aws_creds = matches.get_one::<String>("awscreds");
|
||||
// if let Some(aws_creds) = aws_creds {
|
||||
// // not sure if this is a bad idea?
|
||||
// let aws_creds_dict: serde_json::Value = serde_json::from_str(aws_creds)?;
|
||||
// std::env::set_var(
|
||||
// "AWS_ACCESS_KEY_ID",
|
||||
// aws_creds_dict["ID"].as_str().expect("config parse error"),
|
||||
// );
|
||||
// std::env::set_var(
|
||||
// "AWS_SECRET_ACCESS_KEY",
|
||||
// aws_creds_dict["key"].as_str().expect("config parse error"),
|
||||
// );
|
||||
// }
|
||||
|
||||
let http_port = *matches
|
||||
.get_one::<u16>("http-port")
|
||||
@@ -196,21 +214,10 @@ fn main() -> Result<()> {
|
||||
state: Mutex::new(new_state),
|
||||
state_changed: Condvar::new(),
|
||||
ext_remote_storage,
|
||||
ext_remote_paths: OnceLock::new(),
|
||||
ext_download_progress: RwLock::new(HashMap::new()),
|
||||
library_index: OnceLock::new(),
|
||||
build_tag,
|
||||
available_extensions: OnceLock::new(),
|
||||
};
|
||||
let compute = Arc::new(compute_node);
|
||||
|
||||
// If this is a pooled VM, prewarm before starting HTTP server and becoming
|
||||
// available for binding. Prewarming helps postgres start quicker later,
|
||||
// because QEMU will already have it's memory allocated from the host, and
|
||||
// the necessary binaries will alreaady be cached.
|
||||
if !spec_set {
|
||||
compute.prewarm_postgres()?;
|
||||
}
|
||||
|
||||
// Launch http service first, so we were able to serve control-plane
|
||||
// requests, while configuration is still in progress.
|
||||
let _http_handle =
|
||||
@@ -221,7 +228,6 @@ fn main() -> Result<()> {
|
||||
if !spec_set {
|
||||
// No spec provided, hang waiting for it.
|
||||
info!("no compute spec provided, waiting");
|
||||
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
while state.status != ComputeStatus::ConfigurationPending {
|
||||
state = compute.state_changed.wait(state).unwrap();
|
||||
@@ -252,8 +258,12 @@ fn main() -> Result<()> {
|
||||
drop(state);
|
||||
|
||||
// Launch remaining service threads
|
||||
let _monitor_handle = launch_monitor(&compute);
|
||||
let _configurator_handle = launch_configurator(&compute);
|
||||
let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread");
|
||||
let _configurator_handle =
|
||||
launch_configurator(&compute).expect("cannot launch configurator thread");
|
||||
|
||||
let _download_extensions_handle =
|
||||
launch_download_extensions(&compute).expect("cannot launch download extensions thread");
|
||||
|
||||
// Start Postgres
|
||||
let mut delay_exit = false;
|
||||
@@ -393,6 +403,12 @@ fn cli() -> clap::Command {
|
||||
.long("remote-ext-config")
|
||||
.value_name("REMOTE_EXT_CONFIG"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("awscreds")
|
||||
.short('k')
|
||||
.long("awscreds")
|
||||
.value_name("AWS_CREDENTIALS"),
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -1,23 +1,19 @@
|
||||
use std::collections::HashMap;
|
||||
use std::collections::HashSet;
|
||||
use std::fs;
|
||||
use std::io::BufRead;
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
use std::path::Path;
|
||||
use std::process::{Command, Stdio};
|
||||
use std::str::FromStr;
|
||||
use std::sync::{Condvar, Mutex, OnceLock, RwLock};
|
||||
use std::time::Instant;
|
||||
use std::sync::{Condvar, Mutex, OnceLock};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::{DateTime, Utc};
|
||||
use futures::future::join_all;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use futures::StreamExt;
|
||||
use postgres::{Client, NoTls};
|
||||
use regex::Regex;
|
||||
use tokio;
|
||||
use tokio_postgres;
|
||||
use tracing::{error, info, instrument, warn};
|
||||
use tracing::{info, instrument, warn};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
@@ -25,11 +21,10 @@ use compute_api::responses::{ComputeMetrics, ComputeStatus};
|
||||
use compute_api::spec::{ComputeMode, ComputeSpec};
|
||||
use utils::measured_stream::MeasuredReader;
|
||||
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
|
||||
use crate::pg_helpers::*;
|
||||
use crate::spec::*;
|
||||
use crate::sync_sk::{check_if_synced, ping_safekeeper};
|
||||
use crate::{config, extension_server};
|
||||
|
||||
/// Compute node info shared across several `compute_ctl` threads.
|
||||
@@ -60,22 +55,8 @@ pub struct ComputeNode {
|
||||
pub state_changed: Condvar,
|
||||
/// the S3 bucket that we search for extensions in
|
||||
pub ext_remote_storage: Option<GenericRemoteStorage>,
|
||||
// (key: extension name, value: path to extension archive in remote storage)
|
||||
pub ext_remote_paths: OnceLock<HashMap<String, RemotePath>>,
|
||||
// (key: library name, value: name of extension containing this library)
|
||||
pub library_index: OnceLock<HashMap<String, String>>,
|
||||
// key: ext_archive_name, value: started download time, download_completed?
|
||||
pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
|
||||
pub build_tag: String,
|
||||
}
|
||||
|
||||
// store some metrics about download size that might impact startup time
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct RemoteExtensionMetrics {
|
||||
num_ext_downloaded: u64,
|
||||
largest_ext_size: u64,
|
||||
total_ext_download_size: u64,
|
||||
prep_extensions_ms: u64,
|
||||
// cached lists of available extensions and libraries
|
||||
pub available_extensions: OnceLock<HashSet<String>>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
@@ -115,7 +96,6 @@ pub struct ParsedSpec {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub pageserver_connstr: String,
|
||||
pub safekeeper_connstrings: Vec<String>,
|
||||
pub storage_auth_token: Option<String>,
|
||||
}
|
||||
|
||||
@@ -133,21 +113,6 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
|
||||
.clone()
|
||||
.or_else(|| spec.cluster.settings.find("neon.pageserver_connstring"))
|
||||
.ok_or("pageserver connstr should be provided")?;
|
||||
let safekeeper_connstrings = if spec.safekeeper_connstrings.is_empty() {
|
||||
if matches!(spec.mode, ComputeMode::Primary) {
|
||||
spec.cluster
|
||||
.settings
|
||||
.find("neon.safekeepers")
|
||||
.ok_or("safekeeper connstrings should be provided")?
|
||||
.split(',')
|
||||
.map(|str| str.to_string())
|
||||
.collect()
|
||||
} else {
|
||||
vec![]
|
||||
}
|
||||
} else {
|
||||
spec.safekeeper_connstrings.clone()
|
||||
};
|
||||
let storage_auth_token = spec.storage_auth_token.clone();
|
||||
let tenant_id: TenantId = if let Some(tenant_id) = spec.tenant_id {
|
||||
tenant_id
|
||||
@@ -173,7 +138,6 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
|
||||
Ok(ParsedSpec {
|
||||
spec,
|
||||
pageserver_connstr,
|
||||
safekeeper_connstrings,
|
||||
storage_auth_token,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
@@ -286,7 +250,7 @@ impl ComputeNode {
|
||||
#[instrument(skip_all, fields(%lsn))]
|
||||
fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
|
||||
let spec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
let start_time = Instant::now();
|
||||
let start_time = Utc::now();
|
||||
|
||||
let mut config = postgres::Config::from_str(&spec.pageserver_connstr)?;
|
||||
|
||||
@@ -299,10 +263,7 @@ impl ComputeNode {
|
||||
info!("Storage auth token not set");
|
||||
}
|
||||
|
||||
// Connect to pageserver
|
||||
let mut client = config.connect(NoTls)?;
|
||||
let pageserver_connect_micros = start_time.elapsed().as_micros() as u64;
|
||||
|
||||
let basebackup_cmd = match lsn {
|
||||
// HACK We don't use compression on first start (Lsn(0)) because there's no API for it
|
||||
Lsn(0) => format!("basebackup {} {}", spec.tenant_id, spec.timeline_id),
|
||||
@@ -348,107 +309,14 @@ impl ComputeNode {
|
||||
};
|
||||
|
||||
// Report metrics
|
||||
let mut state = self.state.lock().unwrap();
|
||||
state.metrics.pageserver_connect_micros = pageserver_connect_micros;
|
||||
state.metrics.basebackup_bytes = measured_reader.get_byte_count() as u64;
|
||||
state.metrics.basebackup_ms = start_time.elapsed().as_millis() as u64;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn check_safekeepers_synced_async(
|
||||
&self,
|
||||
compute_state: &ComputeState,
|
||||
) -> Result<Option<Lsn>> {
|
||||
// Construct a connection config for each safekeeper
|
||||
let pspec: ParsedSpec = compute_state
|
||||
.pspec
|
||||
.as_ref()
|
||||
.expect("spec must be set")
|
||||
.clone();
|
||||
let sk_connstrs: Vec<String> = pspec.safekeeper_connstrings.clone();
|
||||
let sk_configs = sk_connstrs.into_iter().map(|connstr| {
|
||||
// Format connstr
|
||||
let id = connstr.clone();
|
||||
let connstr = format!("postgresql://no_user@{}", connstr);
|
||||
let options = format!(
|
||||
"-c timeline_id={} tenant_id={}",
|
||||
pspec.timeline_id, pspec.tenant_id
|
||||
);
|
||||
|
||||
// Construct client
|
||||
let mut config = tokio_postgres::Config::from_str(&connstr).unwrap();
|
||||
config.options(&options);
|
||||
if let Some(storage_auth_token) = pspec.storage_auth_token.clone() {
|
||||
config.password(storage_auth_token);
|
||||
}
|
||||
|
||||
(id, config)
|
||||
});
|
||||
|
||||
// Create task set to query all safekeepers
|
||||
let mut tasks = FuturesUnordered::new();
|
||||
let quorum = sk_configs.len() / 2 + 1;
|
||||
for (id, config) in sk_configs {
|
||||
let timeout = tokio::time::Duration::from_millis(100);
|
||||
let task = tokio::time::timeout(timeout, ping_safekeeper(id, config));
|
||||
tasks.push(tokio::spawn(task));
|
||||
}
|
||||
|
||||
// Get a quorum of responses or errors
|
||||
let mut responses = Vec::new();
|
||||
let mut join_errors = Vec::new();
|
||||
let mut task_errors = Vec::new();
|
||||
let mut timeout_errors = Vec::new();
|
||||
while let Some(response) = tasks.next().await {
|
||||
match response {
|
||||
Ok(Ok(Ok(r))) => responses.push(r),
|
||||
Ok(Ok(Err(e))) => task_errors.push(e),
|
||||
Ok(Err(e)) => timeout_errors.push(e),
|
||||
Err(e) => join_errors.push(e),
|
||||
};
|
||||
if responses.len() >= quorum {
|
||||
break;
|
||||
}
|
||||
if join_errors.len() + task_errors.len() + timeout_errors.len() >= quorum {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// In case of error, log and fail the check, but don't crash.
|
||||
// We're playing it safe because these errors could be transient
|
||||
// and we don't yet retry. Also being careful here allows us to
|
||||
// be backwards compatible with safekeepers that don't have the
|
||||
// TIMELINE_STATUS API yet.
|
||||
if responses.len() < quorum {
|
||||
error!(
|
||||
"failed sync safekeepers check {:?} {:?} {:?}",
|
||||
join_errors, task_errors, timeout_errors
|
||||
);
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
Ok(check_if_synced(responses))
|
||||
}
|
||||
|
||||
// Fast path for sync_safekeepers. If they're already synced we get the lsn
|
||||
// in one roundtrip. If not, we should do a full sync_safekeepers.
|
||||
pub fn check_safekeepers_synced(&self, compute_state: &ComputeState) -> Result<Option<Lsn>> {
|
||||
let start_time = Utc::now();
|
||||
|
||||
// Run actual work with new tokio runtime
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("failed to create rt");
|
||||
let result = rt.block_on(self.check_safekeepers_synced_async(compute_state));
|
||||
|
||||
// Record runtime
|
||||
self.state.lock().unwrap().metrics.sync_sk_check_ms = Utc::now()
|
||||
self.state.lock().unwrap().metrics.basebackup_bytes =
|
||||
measured_reader.get_byte_count() as u64;
|
||||
self.state.lock().unwrap().metrics.basebackup_ms = Utc::now()
|
||||
.signed_duration_since(start_time)
|
||||
.to_std()
|
||||
.unwrap()
|
||||
.as_millis() as u64;
|
||||
result
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Run `postgres` in a special mode with `--sync-safekeepers` argument
|
||||
@@ -521,14 +389,10 @@ impl ComputeNode {
|
||||
// cannot sync safekeepers.
|
||||
let lsn = match spec.mode {
|
||||
ComputeMode::Primary => {
|
||||
info!("checking if safekeepers are synced");
|
||||
let lsn = if let Ok(Some(lsn)) = self.check_safekeepers_synced(compute_state) {
|
||||
lsn
|
||||
} else {
|
||||
info!("starting safekeepers syncing");
|
||||
self.sync_safekeepers(pspec.storage_auth_token.clone())
|
||||
.with_context(|| "failed to sync safekeepers")?
|
||||
};
|
||||
info!("starting safekeepers syncing");
|
||||
let lsn = self
|
||||
.sync_safekeepers(pspec.storage_auth_token.clone())
|
||||
.with_context(|| "failed to sync safekeepers")?;
|
||||
info!("safekeepers synced at LSN {}", lsn);
|
||||
lsn
|
||||
}
|
||||
@@ -566,50 +430,6 @@ impl ComputeNode {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Start and stop a postgres process to warm up the VM for startup.
|
||||
pub fn prewarm_postgres(&self) -> Result<()> {
|
||||
info!("prewarming");
|
||||
|
||||
// Create pgdata
|
||||
let pgdata = &format!("{}.warmup", self.pgdata);
|
||||
create_pgdata(pgdata)?;
|
||||
|
||||
// Run initdb to completion
|
||||
info!("running initdb");
|
||||
let initdb_bin = Path::new(&self.pgbin).parent().unwrap().join("initdb");
|
||||
Command::new(initdb_bin)
|
||||
.args(["-D", pgdata])
|
||||
.output()
|
||||
.expect("cannot start initdb process");
|
||||
|
||||
// Write conf
|
||||
use std::io::Write;
|
||||
let conf_path = Path::new(pgdata).join("postgresql.conf");
|
||||
let mut file = std::fs::File::create(conf_path)?;
|
||||
writeln!(file, "shared_buffers=65536")?;
|
||||
writeln!(file, "port=51055")?; // Nobody should be connecting
|
||||
writeln!(file, "shared_preload_libraries = 'neon'")?;
|
||||
|
||||
// Start postgres
|
||||
info!("starting postgres");
|
||||
let mut pg = Command::new(&self.pgbin)
|
||||
.args(["-D", pgdata])
|
||||
.spawn()
|
||||
.expect("cannot start postgres process");
|
||||
|
||||
// Stop it when it's ready
|
||||
info!("waiting for postgres");
|
||||
wait_for_postgres(&mut pg, Path::new(pgdata))?;
|
||||
pg.kill()?;
|
||||
info!("sent kill signal");
|
||||
pg.wait()?;
|
||||
info!("done prewarming");
|
||||
|
||||
// clean up
|
||||
let _ok = fs::remove_dir_all(pgdata);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Start Postgres as a child process and manage DBs/roles.
|
||||
/// After that this will hang waiting on the postmaster process to exit.
|
||||
#[instrument(skip_all)]
|
||||
@@ -749,7 +569,7 @@ impl ComputeNode {
|
||||
// remote shared_preload_libraries before postgres start (if any)
|
||||
{
|
||||
let library_load_start_time = Utc::now();
|
||||
let remote_ext_metrics = self.prepare_preload_libraries(&compute_state)?;
|
||||
self.prepare_preload_libraries(&compute_state)?;
|
||||
|
||||
let library_load_time = Utc::now()
|
||||
.signed_duration_since(library_load_start_time)
|
||||
@@ -757,16 +577,11 @@ impl ComputeNode {
|
||||
.unwrap()
|
||||
.as_millis() as u64;
|
||||
let mut state = self.state.lock().unwrap();
|
||||
state.metrics.load_ext_ms = library_load_time;
|
||||
state.metrics.num_ext_downloaded = remote_ext_metrics.num_ext_downloaded;
|
||||
state.metrics.largest_ext_size = remote_ext_metrics.largest_ext_size;
|
||||
state.metrics.total_ext_download_size = remote_ext_metrics.total_ext_download_size;
|
||||
state.metrics.prep_extensions_ms = remote_ext_metrics.prep_extensions_ms;
|
||||
state.metrics.load_libraries_ms = library_load_time;
|
||||
info!(
|
||||
"Loading shared_preload_libraries took {:?}ms",
|
||||
library_load_time
|
||||
);
|
||||
info!("{:?}", remote_ext_metrics);
|
||||
}
|
||||
|
||||
self.prepare_pgdata(&compute_state, extension_server_port)?;
|
||||
@@ -920,153 +735,46 @@ LIMIT 100",
|
||||
|
||||
// If remote extension storage is configured,
|
||||
// download extension control files
|
||||
#[tokio::main]
|
||||
pub async fn prepare_external_extensions(&self, compute_state: &ComputeState) -> Result<()> {
|
||||
if let Some(ref ext_remote_storage) = self.ext_remote_storage {
|
||||
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
let spec = &pspec.spec;
|
||||
let custom_ext = spec.custom_extensions.clone().unwrap_or(Vec::new());
|
||||
info!("custom extensions: {:?}", &custom_ext);
|
||||
|
||||
let (ext_remote_paths, library_index) = extension_server::get_available_extensions(
|
||||
let custom_ext_prefixes = spec.custom_extensions.clone().unwrap_or(Vec::new());
|
||||
info!("custom_ext_prefixes: {:?}", &custom_ext_prefixes);
|
||||
let available_extensions = extension_server::get_available_extensions(
|
||||
ext_remote_storage,
|
||||
&self.pgbin,
|
||||
&self.pgversion,
|
||||
&custom_ext,
|
||||
&self.build_tag,
|
||||
&custom_ext_prefixes,
|
||||
)
|
||||
.await?;
|
||||
self.ext_remote_paths
|
||||
.set(ext_remote_paths)
|
||||
.expect("this is the only time we set ext_remote_paths");
|
||||
self.library_index
|
||||
.set(library_index)
|
||||
.expect("this is the only time we set library_index");
|
||||
self.available_extensions
|
||||
.set(available_extensions)
|
||||
.expect("available_extensions.set error");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// download an archive, unzip and place files in correct locations
|
||||
pub async fn download_extension(
|
||||
&self,
|
||||
ext_name: &str,
|
||||
is_library: bool,
|
||||
) -> Result<u64, DownloadError> {
|
||||
let remote_storage = self
|
||||
.ext_remote_storage
|
||||
.as_ref()
|
||||
.ok_or(DownloadError::BadInput(anyhow::anyhow!(
|
||||
"Remote extensions storage is not configured",
|
||||
)))?;
|
||||
|
||||
let mut real_ext_name = ext_name;
|
||||
if is_library {
|
||||
// sometimes library names might have a suffix like
|
||||
// library.so or library.so.3. We strip this off
|
||||
// because library_index is based on the name without the file extension
|
||||
let strip_lib_suffix = Regex::new(r"\.so.*").unwrap();
|
||||
let lib_raw_name = strip_lib_suffix.replace(real_ext_name, "").to_string();
|
||||
|
||||
real_ext_name = self
|
||||
.library_index
|
||||
.get()
|
||||
.expect("must have already downloaded the library_index")
|
||||
.get(&lib_raw_name)
|
||||
.ok_or(DownloadError::BadInput(anyhow::anyhow!(
|
||||
"library {} is not found",
|
||||
lib_raw_name
|
||||
)))?;
|
||||
}
|
||||
|
||||
let ext_path = &self
|
||||
.ext_remote_paths
|
||||
.get()
|
||||
.expect("error accessing ext_remote_paths")
|
||||
.get(real_ext_name)
|
||||
.ok_or(DownloadError::BadInput(anyhow::anyhow!(
|
||||
"real_ext_name {} is not found",
|
||||
real_ext_name
|
||||
)))?;
|
||||
|
||||
let ext_archive_name = ext_path.object_name().expect("bad path");
|
||||
|
||||
let mut first_try = false;
|
||||
if !self
|
||||
.ext_download_progress
|
||||
.read()
|
||||
.expect("lock err")
|
||||
.contains_key(ext_archive_name)
|
||||
{
|
||||
self.ext_download_progress
|
||||
.write()
|
||||
.expect("lock err")
|
||||
.insert(ext_archive_name.to_string(), (Utc::now(), false));
|
||||
first_try = true;
|
||||
}
|
||||
let (download_start, download_completed) =
|
||||
self.ext_download_progress.read().expect("lock err")[ext_archive_name];
|
||||
let start_time_delta = Utc::now()
|
||||
.signed_duration_since(download_start)
|
||||
.to_std()
|
||||
.unwrap()
|
||||
.as_millis() as u64;
|
||||
|
||||
// how long to wait for extension download if it was started by another process
|
||||
const HANG_TIMEOUT: u64 = 3000; // milliseconds
|
||||
|
||||
if download_completed {
|
||||
info!("extension already downloaded, skipping re-download");
|
||||
return Ok(0);
|
||||
} else if start_time_delta < HANG_TIMEOUT && !first_try {
|
||||
info!("download {ext_archive_name} already started by another process, hanging untill completion or timeout");
|
||||
let mut interval = tokio::time::interval(tokio::time::Duration::from_millis(500));
|
||||
loop {
|
||||
info!("waiting for download");
|
||||
interval.tick().await;
|
||||
let (_, download_completed_now) =
|
||||
self.ext_download_progress.read().expect("lock")[ext_archive_name];
|
||||
if download_completed_now {
|
||||
info!("download finished by whoever else downloaded it");
|
||||
return Ok(0);
|
||||
}
|
||||
pub async fn download_extension(&self, ext_name: &str) -> Result<()> {
|
||||
match &self.ext_remote_storage {
|
||||
None => anyhow::bail!("No remote extension storage"),
|
||||
Some(remote_storage) => {
|
||||
extension_server::download_extension(
|
||||
ext_name,
|
||||
remote_storage,
|
||||
&self.pgbin,
|
||||
&self.pgversion,
|
||||
)
|
||||
.await
|
||||
}
|
||||
// NOTE: the above loop will get terminated
|
||||
// based on the timeout of the download function
|
||||
}
|
||||
|
||||
// if extension hasn't been downloaded before or the previous
|
||||
// attempt to download was at least HANG_TIMEOUT ms ago
|
||||
// then we try to download it here
|
||||
info!("downloading new extension {ext_archive_name}");
|
||||
|
||||
let download_size = extension_server::download_extension(
|
||||
real_ext_name,
|
||||
ext_path,
|
||||
remote_storage,
|
||||
&self.pgbin,
|
||||
)
|
||||
.await
|
||||
.map_err(DownloadError::Other);
|
||||
|
||||
self.ext_download_progress
|
||||
.write()
|
||||
.expect("bad lock")
|
||||
.insert(ext_archive_name.to_string(), (download_start, true));
|
||||
|
||||
download_size
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
pub async fn prepare_preload_libraries(
|
||||
&self,
|
||||
compute_state: &ComputeState,
|
||||
) -> Result<RemoteExtensionMetrics> {
|
||||
pub async fn prepare_preload_libraries(&self, compute_state: &ComputeState) -> Result<()> {
|
||||
if self.ext_remote_storage.is_none() {
|
||||
return Ok(RemoteExtensionMetrics {
|
||||
num_ext_downloaded: 0,
|
||||
largest_ext_size: 0,
|
||||
total_ext_download_size: 0,
|
||||
prep_extensions_ms: 0,
|
||||
});
|
||||
return Ok(());
|
||||
}
|
||||
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
let spec = &pspec.spec;
|
||||
@@ -1101,57 +809,15 @@ LIMIT 100",
|
||||
libs_vec.extend(preload_libs_vec);
|
||||
}
|
||||
|
||||
info!("Download ext_index.json, find the extension paths");
|
||||
let prep_ext_start_time = Utc::now();
|
||||
self.prepare_external_extensions(compute_state).await?;
|
||||
let prep_ext_time_delta = Utc::now()
|
||||
.signed_duration_since(prep_ext_start_time)
|
||||
.to_std()
|
||||
.unwrap()
|
||||
.as_millis() as u64;
|
||||
info!("Prepare extensions took {prep_ext_time_delta}ms");
|
||||
|
||||
// // Don't try to download libraries that are not in the index.
|
||||
// // Assume that they are already present locally.
|
||||
// libs_vec.retain(|lib| {
|
||||
// self.library_index
|
||||
// .get()
|
||||
// .expect("error accessing ext_remote_paths")
|
||||
// .contains_key(lib)
|
||||
// });
|
||||
|
||||
info!("Downloading to shared preload libraries: {:?}", &libs_vec);
|
||||
|
||||
let mut download_tasks = Vec::new();
|
||||
for library in &libs_vec {
|
||||
download_tasks.push(self.download_extension(library, true));
|
||||
download_tasks.push(self.download_extension(library));
|
||||
}
|
||||
let results = join_all(download_tasks).await;
|
||||
|
||||
let mut remote_ext_metrics = RemoteExtensionMetrics {
|
||||
num_ext_downloaded: 0,
|
||||
largest_ext_size: 0,
|
||||
total_ext_download_size: 0,
|
||||
prep_extensions_ms: prep_ext_time_delta,
|
||||
};
|
||||
for result in results {
|
||||
let download_size = match result {
|
||||
Ok(res) => {
|
||||
remote_ext_metrics.num_ext_downloaded += 1;
|
||||
res
|
||||
}
|
||||
Err(err) => {
|
||||
// if we failed to download an extension, we don't want to fail the whole
|
||||
// process, but we do want to log the error
|
||||
error!("Failed to download extension: {}", err);
|
||||
0
|
||||
}
|
||||
};
|
||||
|
||||
remote_ext_metrics.largest_ext_size =
|
||||
std::cmp::max(remote_ext_metrics.largest_ext_size, download_size);
|
||||
remote_ext_metrics.total_ext_download_size += download_size;
|
||||
result?; // propogate any errors
|
||||
}
|
||||
Ok(remote_ext_metrics)
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
|
||||
use anyhow::Result;
|
||||
use tracing::{error, info, instrument};
|
||||
|
||||
use compute_api::responses::ComputeStatus;
|
||||
@@ -41,7 +42,9 @@ fn configurator_main_loop(compute: &Arc<ComputeNode>) {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn launch_configurator(compute: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
|
||||
pub fn launch_configurator(
|
||||
compute: &Arc<ComputeNode>,
|
||||
) -> Result<thread::JoinHandle<()>, std::io::Error> {
|
||||
let compute = Arc::clone(compute);
|
||||
|
||||
thread::Builder::new()
|
||||
@@ -50,5 +53,4 @@ pub fn launch_configurator(compute: &Arc<ComputeNode>) -> thread::JoinHandle<()>
|
||||
configurator_main_loop(&compute);
|
||||
info!("configurator thread is exited");
|
||||
})
|
||||
.expect("cannot launch configurator thread")
|
||||
}
|
||||
|
||||
@@ -1,91 +1,35 @@
|
||||
// Download extension files from the extension store
|
||||
// and put them in the right place in the postgres directory (share / lib)
|
||||
// and put them in the right place in the postgres directory
|
||||
/*
|
||||
The layout of the S3 bucket is as follows:
|
||||
5615610098 // this is an extension build number
|
||||
├── v14
|
||||
│ ├── extensions
|
||||
│ │ ├── anon.tar.zst
|
||||
│ │ └── embedding.tar.zst
|
||||
│ └── ext_index.json
|
||||
└── v15
|
||||
├── extensions
|
||||
│ ├── anon.tar.zst
|
||||
│ └── embedding.tar.zst
|
||||
└── ext_index.json
|
||||
5615261079
|
||||
├── v14
|
||||
│ ├── extensions
|
||||
│ │ └── anon.tar.zst
|
||||
│ └── ext_index.json
|
||||
└── v15
|
||||
├── extensions
|
||||
│ └── anon.tar.zst
|
||||
└── ext_index.json
|
||||
5623261088
|
||||
├── v14
|
||||
│ ├── extensions
|
||||
│ │ └── embedding.tar.zst
|
||||
│ └── ext_index.json
|
||||
└── v15
|
||||
├── extensions
|
||||
│ └── embedding.tar.zst
|
||||
└── ext_index.json
|
||||
|
||||
Note that build number cannot be part of prefix because we might need extensions
|
||||
from other build numbers.
|
||||
v14/ext_index.json
|
||||
-- this contains information necessary to create control files
|
||||
v14/extensions/test_ext1.tar.gz
|
||||
-- this contains the library files and sql files necessary to create this extension
|
||||
v14/extensions/custom_ext1.tar.gz
|
||||
|
||||
ext_index.json stores the control files and location of extension archives
|
||||
It also stores a list of public extensions and a library_index
|
||||
The difference between a private and public extensions is determined by who can
|
||||
load the extension this is specified in ext_index.json
|
||||
|
||||
We don't need to duplicate extension.tar.zst files.
|
||||
We only need to upload a new one if it is updated.
|
||||
(Although currently we just upload every time anyways, hopefully will change
|
||||
this sometime)
|
||||
|
||||
*access* is controlled by spec
|
||||
|
||||
More specifically, here is an example ext_index.json
|
||||
{
|
||||
"public_extensions": [
|
||||
"anon",
|
||||
"pg_buffercache"
|
||||
],
|
||||
"library_index": {
|
||||
"anon": "anon",
|
||||
"pg_buffercache": "pg_buffercache"
|
||||
},
|
||||
"extension_data": {
|
||||
"pg_buffercache": {
|
||||
"control_data": {
|
||||
"pg_buffercache.control": "# pg_buffercache extension \ncomment = 'examine the shared buffer cache' \ndefault_version = '1.3' \nmodule_pathname = '$libdir/pg_buffercache' \nrelocatable = true \ntrusted=true"
|
||||
},
|
||||
"archive_path": "5670669815/v14/extensions/pg_buffercache.tar.zst"
|
||||
},
|
||||
"anon": {
|
||||
"control_data": {
|
||||
"anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
|
||||
},
|
||||
"archive_path": "5670669815/v14/extensions/anon.tar.zst"
|
||||
}
|
||||
}
|
||||
}
|
||||
Speicially, ext_index.json has a list of public extensions, and a list of
|
||||
extensions enabled for specific tenant-ids.
|
||||
*/
|
||||
use crate::compute::ComputeNode;
|
||||
use anyhow::Context;
|
||||
use anyhow::{self, Result};
|
||||
use futures::future::join_all;
|
||||
use flate2::read::GzDecoder;
|
||||
use remote_storage::*;
|
||||
use serde_json;
|
||||
use std::collections::HashMap;
|
||||
use std::io::Read;
|
||||
use serde_json::{self, Value};
|
||||
use std::collections::HashSet;
|
||||
use std::num::{NonZeroU32, NonZeroUsize};
|
||||
use std::path::Path;
|
||||
use std::str;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
use tar::Archive;
|
||||
use tokio::io::AsyncReadExt;
|
||||
use tracing::info;
|
||||
use tracing::log::warn;
|
||||
use zstd::stream::read::Decoder;
|
||||
|
||||
fn get_pg_config(argument: &str, pgbin: &str) -> String {
|
||||
// gives the result of `pg_config [argument]`
|
||||
@@ -117,156 +61,154 @@ pub fn get_pg_version(pgbin: &str) -> String {
|
||||
panic!("Unsuported postgres version {human_version}");
|
||||
}
|
||||
|
||||
// download control files for enabled_extensions
|
||||
// return Hashmaps converting library names to extension names (library_index)
|
||||
// and specifying the remote path to the archive for each extension name
|
||||
// download extension control files
|
||||
// if custom_ext_prefixes is provided - search also in custom extension paths
|
||||
pub async fn get_available_extensions(
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
pgbin: &str,
|
||||
pg_version: &str,
|
||||
custom_extensions: &[String],
|
||||
build_tag: &str,
|
||||
) -> Result<(HashMap<String, RemotePath>, HashMap<String, String>)> {
|
||||
custom_ext_prefixes: &[String],
|
||||
) -> Result<HashSet<String>> {
|
||||
let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
|
||||
let index_path = format!("{build_tag}/{pg_version}/ext_index.json");
|
||||
let index_path = pg_version.to_owned() + "/ext_index.json";
|
||||
let index_path = RemotePath::new(Path::new(&index_path)).context("error forming path")?;
|
||||
info!("download ext_index.json from: {:?}", &index_path);
|
||||
info!("download ext_index.json: {:?}", &index_path);
|
||||
|
||||
// TODO: potential optimization: cache ext_index.json
|
||||
let mut download = remote_storage.download(&index_path).await?;
|
||||
let mut ext_idx_buffer = Vec::new();
|
||||
let mut write_data_buffer = Vec::new();
|
||||
download
|
||||
.download_stream
|
||||
.read_to_end(&mut ext_idx_buffer)
|
||||
.read_to_end(&mut write_data_buffer)
|
||||
.await?;
|
||||
info!("ext_index downloaded");
|
||||
let ext_index_str = match str::from_utf8(&write_data_buffer) {
|
||||
Ok(v) => v,
|
||||
Err(e) => panic!("Invalid UTF-8 sequence: {}", e),
|
||||
};
|
||||
|
||||
#[derive(Debug, serde::Deserialize)]
|
||||
struct Index {
|
||||
public_extensions: Vec<String>,
|
||||
library_index: HashMap<String, String>,
|
||||
extension_data: HashMap<String, ExtensionData>,
|
||||
}
|
||||
let ext_index_full: Value = serde_json::from_str(ext_index_str)?;
|
||||
let ext_index_full = ext_index_full.as_object().context("error parsing json")?;
|
||||
let control_data = ext_index_full["control_data"]
|
||||
.as_object()
|
||||
.context("json parse error")?;
|
||||
let enabled_extensions = ext_index_full["enabled_extensions"]
|
||||
.as_object()
|
||||
.context("json parse error")?;
|
||||
info!("{:?}", control_data.clone());
|
||||
info!("{:?}", enabled_extensions.clone());
|
||||
|
||||
#[derive(Debug, serde::Deserialize)]
|
||||
struct ExtensionData {
|
||||
control_data: HashMap<String, String>,
|
||||
archive_path: String,
|
||||
}
|
||||
|
||||
let ext_index_full = serde_json::from_slice::<Index>(&ext_idx_buffer)?;
|
||||
let mut enabled_extensions = ext_index_full.public_extensions;
|
||||
enabled_extensions.extend_from_slice(custom_extensions);
|
||||
let library_index = ext_index_full.library_index;
|
||||
let all_extension_data = ext_index_full.extension_data;
|
||||
info!("library_index: {:?}", library_index);
|
||||
|
||||
info!("enabled_extensions: {:?}", enabled_extensions);
|
||||
let mut ext_remote_paths = HashMap::new();
|
||||
let mut file_create_tasks = Vec::new();
|
||||
for extension in enabled_extensions {
|
||||
let ext_data = &all_extension_data[&extension];
|
||||
for (control_file, control_contents) in &ext_data.control_data {
|
||||
let extension_name = control_file
|
||||
.strip_suffix(".control")
|
||||
.expect("control files must end in .control");
|
||||
let control_path = local_sharedir.join(control_file);
|
||||
if !control_path.exists() {
|
||||
ext_remote_paths.insert(
|
||||
extension_name.to_string(),
|
||||
RemotePath::from_string(&ext_data.archive_path)?,
|
||||
);
|
||||
info!("writing file {:?}{:?}", control_path, control_contents);
|
||||
file_create_tasks.push(tokio::fs::write(control_path, control_contents));
|
||||
} else {
|
||||
warn!("control file {:?} exists both locally and remotely. ignoring the remote version.", control_file);
|
||||
let mut prefixes = vec!["public".to_string()];
|
||||
prefixes.extend(custom_ext_prefixes.to_owned());
|
||||
info!("{:?}", &prefixes);
|
||||
let mut all_extensions = HashSet::new();
|
||||
for prefix in prefixes {
|
||||
let prefix_extensions = match enabled_extensions.get(&prefix) {
|
||||
Some(Value::Array(ext_name)) => ext_name,
|
||||
_ => {
|
||||
info!("prefix {} has no extensions", prefix);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
info!("{:?}", prefix_extensions);
|
||||
for ext_name in prefix_extensions {
|
||||
all_extensions.insert(ext_name.as_str().context("json parse error")?.to_string());
|
||||
}
|
||||
}
|
||||
let results = join_all(file_create_tasks).await;
|
||||
for result in results {
|
||||
result?;
|
||||
|
||||
for prefix in &all_extensions {
|
||||
let control_contents = control_data[prefix].as_str().context("json parse error")?;
|
||||
let control_path = local_sharedir.join(prefix.to_owned() + ".control");
|
||||
|
||||
info!("WRITING FILE {:?}{:?}", control_path, control_contents);
|
||||
std::fs::write(control_path, control_contents)?;
|
||||
}
|
||||
info!("ext_remote_paths {:?}", ext_remote_paths);
|
||||
Ok((ext_remote_paths, library_index))
|
||||
|
||||
Ok(all_extensions.into_iter().collect())
|
||||
}
|
||||
|
||||
// download the archive for a given extension,
|
||||
// unzip it, and place files in the appropriate locations (share/lib)
|
||||
// download all sqlfiles (and possibly data files) for a given extension name
|
||||
pub async fn download_extension(
|
||||
ext_name: &str,
|
||||
ext_path: &RemotePath,
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
pgbin: &str,
|
||||
) -> Result<u64> {
|
||||
info!("Download extension {:?} from {:?}", ext_name, ext_path);
|
||||
let mut download = remote_storage.download(ext_path).await?;
|
||||
let mut download_buffer = Vec::new();
|
||||
pg_version: &str,
|
||||
) -> Result<()> {
|
||||
// TODO: potential optimization: only download the extension if it doesn't exist
|
||||
// problem: how would we tell if it exists?
|
||||
let ext_name = ext_name.replace(".so", "");
|
||||
let ext_name_targz = ext_name.to_owned() + ".tar.gz";
|
||||
if Path::new(&ext_name_targz).exists() {
|
||||
info!("extension {:?} already exists", ext_name_targz);
|
||||
return Ok(());
|
||||
}
|
||||
let ext_path = RemotePath::new(
|
||||
&Path::new(pg_version)
|
||||
.join("extensions")
|
||||
.join(ext_name_targz.clone()),
|
||||
)?;
|
||||
info!(
|
||||
"Start downloading extension {:?} from {:?}",
|
||||
ext_name, ext_path
|
||||
);
|
||||
let mut download = remote_storage.download(&ext_path).await?;
|
||||
let mut write_data_buffer = Vec::new();
|
||||
download
|
||||
.download_stream
|
||||
.read_to_end(&mut download_buffer)
|
||||
.read_to_end(&mut write_data_buffer)
|
||||
.await?;
|
||||
let download_size = download_buffer.len() as u64;
|
||||
// it's unclear whether it is more performant to decompress into memory or not
|
||||
// TODO: decompressing into memory can be avoided
|
||||
let mut decoder = Decoder::new(download_buffer.as_slice())?;
|
||||
let mut decompress_buffer = Vec::new();
|
||||
decoder.read_to_end(&mut decompress_buffer)?;
|
||||
let mut archive = Archive::new(decompress_buffer.as_slice());
|
||||
let unzip_dest = pgbin
|
||||
.strip_suffix("/bin/postgres")
|
||||
.expect("bad pgbin")
|
||||
.to_string()
|
||||
+ "/download_extensions";
|
||||
archive.unpack(&unzip_dest)?;
|
||||
let unzip_dest = pgbin.strip_suffix("/bin/postgres").expect("bad pgbin");
|
||||
let tar = GzDecoder::new(write_data_buffer.as_slice());
|
||||
let mut archive = Archive::new(tar);
|
||||
archive.unpack(unzip_dest)?;
|
||||
info!("Download + unzip {:?} completed successfully", &ext_path);
|
||||
|
||||
let sharedir_paths = (
|
||||
unzip_dest.to_string() + "/share/extension",
|
||||
Path::new(&get_pg_config("--sharedir", pgbin)).join("extension"),
|
||||
);
|
||||
let libdir_paths = (
|
||||
unzip_dest.to_string() + "/lib",
|
||||
Path::new(&get_pg_config("--pkglibdir", pgbin)).to_path_buf(),
|
||||
);
|
||||
// move contents of the libdir / sharedir in unzipped archive to the correct local paths
|
||||
for paths in [sharedir_paths, libdir_paths] {
|
||||
let (zip_dir, real_dir) = paths;
|
||||
info!("mv {zip_dir:?}/* {real_dir:?}");
|
||||
for file in std::fs::read_dir(zip_dir)? {
|
||||
let old_file = file?.path();
|
||||
let new_file =
|
||||
Path::new(&real_dir).join(old_file.file_name().context("error parsing file")?);
|
||||
info!("moving {old_file:?} to {new_file:?}");
|
||||
|
||||
// extension download failed: Directory not empty (os error 39)
|
||||
match std::fs::rename(old_file, new_file) {
|
||||
Ok(()) => info!("move succeeded"),
|
||||
Err(e) => {
|
||||
warn!("move failed, probably because the extension already exists: {e}")
|
||||
}
|
||||
}
|
||||
}
|
||||
let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
|
||||
let zip_sharedir = format!("{unzip_dest}/extensions/{ext_name}/share/extension");
|
||||
info!("mv {zip_sharedir:?}/* {local_sharedir:?}");
|
||||
for file in std::fs::read_dir(zip_sharedir)? {
|
||||
let old_file = file?.path();
|
||||
let new_file =
|
||||
Path::new(&local_sharedir).join(old_file.file_name().context("error parsing file")?);
|
||||
std::fs::rename(old_file, new_file)?;
|
||||
}
|
||||
info!("done moving extension {ext_name}");
|
||||
Ok(download_size)
|
||||
let local_libdir = Path::new(&get_pg_config("--libdir", pgbin)).join("postgresql");
|
||||
let zip_libdir = format!("{unzip_dest}/extensions/{ext_name}/lib");
|
||||
info!("mv {zip_libdir:?}/* {local_libdir:?}");
|
||||
for file in std::fs::read_dir(zip_libdir)? {
|
||||
let old_file = file?.path();
|
||||
let new_file =
|
||||
Path::new(&local_libdir).join(old_file.file_name().context("error parsing file")?);
|
||||
std::fs::rename(old_file, new_file)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// This function initializes the necessary structs to use remote storage
|
||||
pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRemoteStorage> {
|
||||
#[derive(Debug, serde::Deserialize)]
|
||||
struct RemoteExtJson {
|
||||
bucket: String,
|
||||
region: String,
|
||||
endpoint: Option<String>,
|
||||
prefix: Option<String>,
|
||||
}
|
||||
let remote_ext_json = serde_json::from_str::<RemoteExtJson>(remote_ext_config)?;
|
||||
// This function initializes the necessary structs to use remmote storage (should be fairly cheap)
|
||||
pub fn init_remote_storage(
|
||||
remote_ext_config: &str,
|
||||
default_prefix: &str,
|
||||
) -> anyhow::Result<GenericRemoteStorage> {
|
||||
let remote_ext_config: serde_json::Value = serde_json::from_str(remote_ext_config)?;
|
||||
|
||||
let remote_ext_bucket = remote_ext_config["bucket"]
|
||||
.as_str()
|
||||
.context("config parse error")?;
|
||||
let remote_ext_region = remote_ext_config["region"]
|
||||
.as_str()
|
||||
.context("config parse error")?;
|
||||
let remote_ext_endpoint = remote_ext_config["endpoint"].as_str();
|
||||
let remote_ext_prefix = remote_ext_config["prefix"]
|
||||
.as_str()
|
||||
.unwrap_or(default_prefix)
|
||||
.to_string();
|
||||
|
||||
// TODO: potentially allow modification of other parameters
|
||||
// however, default values should be fine for now
|
||||
let config = S3Config {
|
||||
bucket_name: remote_ext_json.bucket,
|
||||
bucket_region: remote_ext_json.region,
|
||||
prefix_in_bucket: remote_ext_json.prefix,
|
||||
endpoint: remote_ext_json.endpoint,
|
||||
bucket_name: remote_ext_bucket.to_string(),
|
||||
bucket_region: remote_ext_region.to_string(),
|
||||
prefix_in_bucket: Some(remote_ext_prefix),
|
||||
endpoint: remote_ext_endpoint.map(|x| x.to_string()),
|
||||
concurrency_limit: NonZeroUsize::new(100).expect("100 != 0"),
|
||||
max_keys_per_list_response: None,
|
||||
};
|
||||
@@ -277,3 +219,19 @@ pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRem
|
||||
};
|
||||
GenericRemoteStorage::from_config(&config)
|
||||
}
|
||||
|
||||
pub fn launch_download_extensions(
|
||||
compute: &Arc<ComputeNode>,
|
||||
) -> Result<thread::JoinHandle<()>, std::io::Error> {
|
||||
let compute = Arc::clone(compute);
|
||||
thread::Builder::new()
|
||||
.name("download-extensions".into())
|
||||
.spawn(move || {
|
||||
info!("start download_extension_files");
|
||||
let compute_state = compute.state.lock().expect("error unlocking compute.state");
|
||||
compute
|
||||
.prepare_external_extensions(&compute_state)
|
||||
.expect("error preparing extensions");
|
||||
info!("download_extension_files done, exiting thread");
|
||||
})
|
||||
}
|
||||
|
||||
@@ -125,32 +125,13 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
|
||||
(&Method::POST, route) if route.starts_with("/extension_server/") => {
|
||||
info!("serving {:?} POST request", route);
|
||||
info!("req.uri {:?}", req.uri());
|
||||
|
||||
let mut is_library = false;
|
||||
if let Some(params) = req.uri().query() {
|
||||
info!("serving {:?} POST request with params: {}", route, params);
|
||||
if params == "is_library=true" {
|
||||
is_library = true;
|
||||
} else {
|
||||
let mut resp = Response::new(Body::from("Wrong request parameters"));
|
||||
*resp.status_mut() = StatusCode::BAD_REQUEST;
|
||||
return resp;
|
||||
}
|
||||
}
|
||||
|
||||
let filename = route.split('/').last().unwrap().to_string();
|
||||
info!("serving /extension_server POST request, filename: {filename:?} is_library: {is_library}");
|
||||
info!(
|
||||
"serving /extension_server POST request, filename: {:?}",
|
||||
&filename
|
||||
);
|
||||
|
||||
// don't even try to download extensions
|
||||
// if no remote storage is configured
|
||||
if compute.ext_remote_storage.is_none() {
|
||||
info!("no extensions remote storage configured");
|
||||
let mut resp = Response::new(Body::from("no remote storage configured"));
|
||||
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
|
||||
return resp;
|
||||
}
|
||||
|
||||
match compute.download_extension(&filename, is_library).await {
|
||||
match compute.download_extension(&filename).await {
|
||||
Ok(_) => Response::new(Body::from("OK")),
|
||||
Err(e) => {
|
||||
error!("extension download failed: {}", e);
|
||||
|
||||
@@ -14,4 +14,3 @@ pub mod monitor;
|
||||
pub mod params;
|
||||
pub mod pg_helpers;
|
||||
pub mod spec;
|
||||
pub mod sync_sk;
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use std::sync::Arc;
|
||||
use std::{thread, time};
|
||||
|
||||
use anyhow::Result;
|
||||
use chrono::{DateTime, Utc};
|
||||
use postgres::{Client, NoTls};
|
||||
use tracing::{debug, info};
|
||||
@@ -104,11 +105,10 @@ fn watch_compute_activity(compute: &ComputeNode) {
|
||||
}
|
||||
|
||||
/// Launch a separate compute monitor thread and return its `JoinHandle`.
|
||||
pub fn launch_monitor(state: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
|
||||
pub fn launch_monitor(state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>, std::io::Error> {
|
||||
let state = Arc::clone(state);
|
||||
|
||||
thread::Builder::new()
|
||||
.name("compute-monitor".into())
|
||||
.spawn(move || watch_compute_activity(&state))
|
||||
.expect("cannot launch compute monitor thread")
|
||||
}
|
||||
|
||||
@@ -19,7 +19,7 @@ const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // mil
|
||||
/// Escape a string for including it in a SQL literal. Wrapping the result
|
||||
/// with `E'{}'` or `'{}'` is not required, as it returns a ready-to-use
|
||||
/// SQL string literal, e.g. `'db'''` or `E'db\\'`.
|
||||
/// See <https://github.com/postgres/postgres/blob/da98d005cdbcd45af563d0c4ac86d0e9772cd15f/src/backend/utils/adt/quote.c#L47>
|
||||
/// See https://github.com/postgres/postgres/blob/da98d005cdbcd45af563d0c4ac86d0e9772cd15f/src/backend/utils/adt/quote.c#L47
|
||||
/// for the original implementation.
|
||||
pub fn escape_literal(s: &str) -> String {
|
||||
let res = s.replace('\'', "''").replace('\\', "\\\\");
|
||||
|
||||
@@ -1,98 +0,0 @@
|
||||
// Utils for running sync_safekeepers
|
||||
use anyhow::Result;
|
||||
use tracing::info;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
pub enum TimelineStatusResponse {
|
||||
NotFound,
|
||||
Ok(TimelineStatusOkResponse),
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
pub struct TimelineStatusOkResponse {
|
||||
flush_lsn: Lsn,
|
||||
commit_lsn: Lsn,
|
||||
}
|
||||
|
||||
/// Get a safekeeper's metadata for our timeline. The id is only used for logging
|
||||
pub async fn ping_safekeeper(
|
||||
id: String,
|
||||
config: tokio_postgres::Config,
|
||||
) -> Result<TimelineStatusResponse> {
|
||||
// TODO add retries
|
||||
|
||||
// Connect
|
||||
info!("connecting to {}", id);
|
||||
let (client, conn) = config.connect(tokio_postgres::NoTls).await?;
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = conn.await {
|
||||
eprintln!("connection error: {}", e);
|
||||
}
|
||||
});
|
||||
|
||||
// Query
|
||||
info!("querying {}", id);
|
||||
let result = client.simple_query("TIMELINE_STATUS").await?;
|
||||
|
||||
// Parse result
|
||||
info!("done with {}", id);
|
||||
if let postgres::SimpleQueryMessage::Row(row) = &result[0] {
|
||||
use std::str::FromStr;
|
||||
let response = TimelineStatusResponse::Ok(TimelineStatusOkResponse {
|
||||
flush_lsn: Lsn::from_str(row.get("flush_lsn").unwrap())?,
|
||||
commit_lsn: Lsn::from_str(row.get("commit_lsn").unwrap())?,
|
||||
});
|
||||
Ok(response)
|
||||
} else {
|
||||
// Timeline doesn't exist
|
||||
Ok(TimelineStatusResponse::NotFound)
|
||||
}
|
||||
}
|
||||
|
||||
/// Given a quorum of responses, check if safekeepers are synced at some Lsn
|
||||
pub fn check_if_synced(responses: Vec<TimelineStatusResponse>) -> Option<Lsn> {
|
||||
// Check if all responses are ok
|
||||
let ok_responses: Vec<TimelineStatusOkResponse> = responses
|
||||
.iter()
|
||||
.filter_map(|r| match r {
|
||||
TimelineStatusResponse::Ok(ok_response) => Some(ok_response),
|
||||
_ => None,
|
||||
})
|
||||
.cloned()
|
||||
.collect();
|
||||
if ok_responses.len() < responses.len() {
|
||||
info!(
|
||||
"not synced. Only {} out of {} know about this timeline",
|
||||
ok_responses.len(),
|
||||
responses.len()
|
||||
);
|
||||
return None;
|
||||
}
|
||||
|
||||
// Get the min and the max of everything
|
||||
let commit: Vec<Lsn> = ok_responses.iter().map(|r| r.commit_lsn).collect();
|
||||
let flush: Vec<Lsn> = ok_responses.iter().map(|r| r.flush_lsn).collect();
|
||||
let commit_max = commit.iter().max().unwrap();
|
||||
let commit_min = commit.iter().min().unwrap();
|
||||
let flush_max = flush.iter().max().unwrap();
|
||||
let flush_min = flush.iter().min().unwrap();
|
||||
|
||||
// Check that all values are equal
|
||||
if commit_min != commit_max {
|
||||
info!("not synced. {:?} {:?}", commit_min, commit_max);
|
||||
return None;
|
||||
}
|
||||
if flush_min != flush_max {
|
||||
info!("not synced. {:?} {:?}", flush_min, flush_max);
|
||||
return None;
|
||||
}
|
||||
|
||||
// Check that commit == flush
|
||||
if commit_max != flush_max {
|
||||
info!("not synced. {:?} {:?}", commit_max, flush_max);
|
||||
return None;
|
||||
}
|
||||
|
||||
Some(*commit_max)
|
||||
}
|
||||
@@ -10,7 +10,7 @@
|
||||
//! (non-Neon binaries don't necessarily follow our pidfile conventions).
|
||||
//! The pid stored in the file is later used to stop the service.
|
||||
//!
|
||||
//! See the [`lock_file`](utils::lock_file) module for more info.
|
||||
//! See [`lock_file`] module for more info.
|
||||
|
||||
use std::ffi::OsStr;
|
||||
use std::io::Write;
|
||||
|
||||
@@ -2,9 +2,8 @@
|
||||
//!
|
||||
//! In the local test environment, the data for each safekeeper is stored in
|
||||
//!
|
||||
//! ```text
|
||||
//! .neon/safekeepers/<safekeeper id>
|
||||
//! ```
|
||||
//!
|
||||
use anyhow::Context;
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
@@ -2,9 +2,7 @@
|
||||
//!
|
||||
//! In the local test environment, the data for each endpoint is stored in
|
||||
//!
|
||||
//! ```text
|
||||
//! .neon/endpoints/<endpoint id>
|
||||
//! ```
|
||||
//!
|
||||
//! Some basic information about the endpoint, like the tenant and timeline IDs,
|
||||
//! are stored in the `endpoint.json` file. The `endpoint.json` file is created
|
||||
@@ -24,7 +22,7 @@
|
||||
//!
|
||||
//! Directory contents:
|
||||
//!
|
||||
//! ```text
|
||||
//! ```ignore
|
||||
//! .neon/endpoints/main/
|
||||
//! compute.log - log output of `compute_ctl` and `postgres`
|
||||
//! endpoint.json - serialized `EndpointConf` struct
|
||||
@@ -289,7 +287,7 @@ impl Endpoint {
|
||||
.env
|
||||
.safekeepers
|
||||
.iter()
|
||||
.map(|sk| format!("localhost:{}", sk.get_compute_port()))
|
||||
.map(|sk| format!("localhost:{}", sk.pg_port))
|
||||
.collect::<Vec<String>>()
|
||||
.join(",");
|
||||
conf.append("neon.safekeepers", &safekeepers);
|
||||
@@ -318,7 +316,7 @@ impl Endpoint {
|
||||
.env
|
||||
.safekeepers
|
||||
.iter()
|
||||
.map(|x| x.get_compute_port().to_string())
|
||||
.map(|x| x.pg_port.to_string())
|
||||
.collect::<Vec<_>>()
|
||||
.join(",");
|
||||
let sk_hosts = vec!["localhost"; self.env.safekeepers.len()].join(",");
|
||||
@@ -468,7 +466,7 @@ impl Endpoint {
|
||||
.iter()
|
||||
.find(|node| node.id == sk_id)
|
||||
.ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?;
|
||||
safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.get_compute_port()));
|
||||
safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.pg_port));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -493,7 +491,15 @@ impl Endpoint {
|
||||
pageserver_connstring: Some(pageserver_connstring),
|
||||
safekeeper_connstrings,
|
||||
storage_auth_token: auth_token.clone(),
|
||||
custom_extensions: Some(vec![]),
|
||||
// TODO FIXME: This is a hack to test custom extensions locally.
|
||||
// In test_download_extensions, we assume that the custom extension
|
||||
// prefix is the tenant ID. So we set it here.
|
||||
//
|
||||
// The proper way to implement this is to pass the custom extension
|
||||
// in spec, but we don't have a way to do that yet in the python tests.
|
||||
// NEW HACK: we enable the anon custom extension for everyone! this is of course just for testing
|
||||
// how will we do it for real?
|
||||
custom_extensions: Some(vec!["123454321".to_string(), self.tenant_id.to_string()]),
|
||||
};
|
||||
let spec_path = self.endpoint_path().join("spec.json");
|
||||
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
|
||||
@@ -575,7 +581,9 @@ impl Endpoint {
|
||||
}
|
||||
Err(e) => {
|
||||
if attempt == MAX_ATTEMPTS {
|
||||
return Err(e).context("timed out waiting to connect to compute_ctl HTTP");
|
||||
return Err(e).context(
|
||||
"timed out waiting to connect to compute_ctl HTTP; last error: {e}",
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -137,7 +137,6 @@ impl Default for PageServerConf {
|
||||
pub struct SafekeeperConf {
|
||||
pub id: NodeId,
|
||||
pub pg_port: u16,
|
||||
pub pg_tenant_only_port: Option<u16>,
|
||||
pub http_port: u16,
|
||||
pub sync: bool,
|
||||
pub remote_storage: Option<String>,
|
||||
@@ -150,7 +149,6 @@ impl Default for SafekeeperConf {
|
||||
Self {
|
||||
id: NodeId(0),
|
||||
pg_port: 0,
|
||||
pg_tenant_only_port: None,
|
||||
http_port: 0,
|
||||
sync: true,
|
||||
remote_storage: None,
|
||||
@@ -160,14 +158,6 @@ impl Default for SafekeeperConf {
|
||||
}
|
||||
}
|
||||
|
||||
impl SafekeeperConf {
|
||||
/// Compute is served by port on which only tenant scoped tokens allowed, if
|
||||
/// it is configured.
|
||||
pub fn get_compute_port(&self) -> u16 {
|
||||
self.pg_tenant_only_port.unwrap_or(self.pg_port)
|
||||
}
|
||||
}
|
||||
|
||||
impl LocalEnv {
|
||||
pub fn pg_distrib_dir_raw(&self) -> PathBuf {
|
||||
self.pg_distrib_dir.clone()
|
||||
|
||||
@@ -2,9 +2,8 @@
|
||||
//!
|
||||
//! In the local test environment, the data for each safekeeper is stored in
|
||||
//!
|
||||
//! ```text
|
||||
//! .neon/safekeepers/<safekeeper id>
|
||||
//! ```
|
||||
//!
|
||||
use std::io::Write;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Child;
|
||||
@@ -120,55 +119,45 @@ impl SafekeeperNode {
|
||||
let availability_zone = format!("sk-{}", id_string);
|
||||
|
||||
let mut args = vec![
|
||||
"-D".to_owned(),
|
||||
datadir
|
||||
.to_str()
|
||||
.with_context(|| {
|
||||
format!("Datadir path {datadir:?} cannot be represented as a unicode string")
|
||||
})?
|
||||
.to_owned(),
|
||||
"--id".to_owned(),
|
||||
id_string,
|
||||
"--listen-pg".to_owned(),
|
||||
listen_pg,
|
||||
"--listen-http".to_owned(),
|
||||
listen_http,
|
||||
"--availability-zone".to_owned(),
|
||||
availability_zone,
|
||||
"-D",
|
||||
datadir.to_str().with_context(|| {
|
||||
format!("Datadir path {datadir:?} cannot be represented as a unicode string")
|
||||
})?,
|
||||
"--id",
|
||||
&id_string,
|
||||
"--listen-pg",
|
||||
&listen_pg,
|
||||
"--listen-http",
|
||||
&listen_http,
|
||||
"--availability-zone",
|
||||
&availability_zone,
|
||||
];
|
||||
if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port {
|
||||
let listen_pg_tenant_only = format!("127.0.0.1:{}", pg_tenant_only_port);
|
||||
args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]);
|
||||
}
|
||||
if !self.conf.sync {
|
||||
args.push("--no-sync".to_owned());
|
||||
args.push("--no-sync");
|
||||
}
|
||||
|
||||
let broker_endpoint = format!("{}", self.env.broker.client_url());
|
||||
args.extend(["--broker-endpoint".to_owned(), broker_endpoint]);
|
||||
args.extend(["--broker-endpoint", &broker_endpoint]);
|
||||
|
||||
let mut backup_threads = String::new();
|
||||
if let Some(threads) = self.conf.backup_threads {
|
||||
backup_threads = threads.to_string();
|
||||
args.extend(["--backup-threads".to_owned(), backup_threads]);
|
||||
args.extend(["--backup-threads", &backup_threads]);
|
||||
} else {
|
||||
drop(backup_threads);
|
||||
}
|
||||
|
||||
if let Some(ref remote_storage) = self.conf.remote_storage {
|
||||
args.extend(["--remote-storage".to_owned(), remote_storage.clone()]);
|
||||
args.extend(["--remote-storage", remote_storage]);
|
||||
}
|
||||
|
||||
let key_path = self.env.base_data_dir.join("auth_public_key.pem");
|
||||
if self.conf.auth_enabled {
|
||||
args.extend([
|
||||
"--auth-validation-public-key-path".to_owned(),
|
||||
key_path
|
||||
.to_str()
|
||||
.with_context(|| {
|
||||
format!("Key path {key_path:?} cannot be represented as a unicode string")
|
||||
})?
|
||||
.to_owned(),
|
||||
"--auth-validation-public-key-path",
|
||||
key_path.to_str().with_context(|| {
|
||||
format!("Key path {key_path:?} cannot be represented as a unicode string")
|
||||
})?,
|
||||
]);
|
||||
}
|
||||
|
||||
|
||||
@@ -30,8 +30,8 @@ or similar, to wake up on shutdown.
|
||||
|
||||
In async Rust, futures can be "cancelled" at any await point, by
|
||||
dropping the Future. For example, `tokio::select!` returns as soon as
|
||||
one of the Futures returns, and drops the others. `tokio::time::timeout`
|
||||
is another example. In the Rust ecosystem, some functions are
|
||||
one of the Futures returns, and drops the others. `tokio::timeout!` is
|
||||
another example. In the Rust ecosystem, some functions are
|
||||
cancellation-safe, meaning they can be safely dropped without
|
||||
side-effects, while others are not. See documentation of
|
||||
`tokio::select!` for examples.
|
||||
@@ -42,9 +42,9 @@ function that you call cannot be assumed to be async
|
||||
cancellation-safe, and must be polled to completion.
|
||||
|
||||
The downside of non-cancellation safe code is that you have to be very
|
||||
careful when using `tokio::select!`, `tokio::time::timeout`, and other
|
||||
such functions that can cause a Future to be dropped. They can only be
|
||||
used with functions that are explicitly documented to be cancellation-safe,
|
||||
careful when using `tokio::select!`, `tokio::timeout!`, and other such
|
||||
functions that can cause a Future to be dropped. They can only be used
|
||||
with functions that are explicitly documented to be cancellation-safe,
|
||||
or you need to spawn a separate task to shield from the cancellation.
|
||||
|
||||
At the entry points to the code, we also take care to poll futures to
|
||||
|
||||
@@ -141,86 +141,37 @@ popular extensions.
|
||||
|
||||
The layout of the S3 bucket is as follows:
|
||||
```
|
||||
5615610098 // this is an extension build number
|
||||
├── v14
|
||||
│ ├── extensions
|
||||
│ │ ├── anon.tar.zst
|
||||
│ │ └── embedding.tar.zst
|
||||
│ └── ext_index.json
|
||||
└── v15
|
||||
├── extensions
|
||||
│ ├── anon.tar.zst
|
||||
│ └── embedding.tar.zst
|
||||
└── ext_index.json
|
||||
5615261079
|
||||
├── v14
|
||||
│ ├── extensions
|
||||
│ │ └── anon.tar.zst
|
||||
│ └── ext_index.json
|
||||
└── v15
|
||||
├── extensions
|
||||
│ └── anon.tar.zst
|
||||
└── ext_index.json
|
||||
5623261088
|
||||
├── v14
|
||||
│ ├── extensions
|
||||
│ │ └── embedding.tar.zst
|
||||
│ └── ext_index.json
|
||||
└── v15
|
||||
├── extensions
|
||||
│ └── embedding.tar.zst
|
||||
└── ext_index.json
|
||||
v14/ext_index.json
|
||||
-- this contains information necessary to create control files
|
||||
v14/extensions/test_ext1.tar.gz
|
||||
-- this contains the library files and sql files necessary to create this extension
|
||||
v14/extensions/custom_ext1.tar.gz
|
||||
```
|
||||
|
||||
Note that build number cannot be part of prefix because we might need extensions
|
||||
from other build numbers.
|
||||
|
||||
`ext_index.json` stores the control files and location of extension archives.
|
||||
It also stores a list of public extensions and a library_index
|
||||
|
||||
We don't need to duplicate `extension.tar.zst`` files.
|
||||
We only need to upload a new one if it is updated.
|
||||
(Although currently we just upload every time anyways, hopefully will change
|
||||
this sometime)
|
||||
|
||||
*access* is controlled by spec
|
||||
|
||||
More specifically, here is an example ext_index.json
|
||||
The difference between private and public extensions is determined by who can
|
||||
load the extension. This is specified in `ext_index.json`.
|
||||
Speicially, `ext_index.json` has a list of public extensions, and a list of
|
||||
extensions enabled for specific tenant-ids. Here is an example `ext_index.json`:
|
||||
```
|
||||
{
|
||||
"public_extensions": [
|
||||
"anon",
|
||||
"pg_buffercache"
|
||||
"enabled_extensions": {
|
||||
"123454321": [
|
||||
"anon"
|
||||
],
|
||||
"library_index": {
|
||||
"anon": "anon",
|
||||
"pg_buffercache": "pg_buffercache"
|
||||
// for more complex extensions like postgis
|
||||
// we might have something like:
|
||||
// address_standardizer: postgis
|
||||
// postgis_tiger: postgis
|
||||
},
|
||||
"extension_data": {
|
||||
"pg_buffercache": {
|
||||
"control_data": {
|
||||
"pg_buffercache.control": "# pg_buffercache extension \ncomment = 'examine the shared buffer cache' \ndefault_version = '1.3' \nmodule_pathname = '$libdir/pg_buffercache' \nrelocatable = true \ntrusted=true"
|
||||
},
|
||||
"archive_path": "5670669815/v14/extensions/pg_buffercache.tar.zst"
|
||||
},
|
||||
"anon": {
|
||||
"control_data": {
|
||||
"anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
|
||||
},
|
||||
"archive_path": "5670669815/v14/extensions/anon.tar.zst"
|
||||
}
|
||||
}
|
||||
"public": [
|
||||
"embedding"
|
||||
]
|
||||
},
|
||||
"control_data": {
|
||||
"embedding": "comment = 'hnsw index' \ndefault_version = '0.1.0' \nmodule_pathname = '$libdir/embedding' \nrelocatable = true \ntrusted = true",
|
||||
"anon": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### How to add new extension to the Extension Storage?
|
||||
|
||||
Simply upload build artifacts to the S3 bucket.
|
||||
Implement a CI step for that. Splitting it from compute-node-image build.
|
||||
Implement a CI step for that. Splitting it from ompute-node-image build.
|
||||
|
||||
### How do we deal with extension versions and updates?
|
||||
|
||||
|
||||
@@ -1,22 +0,0 @@
|
||||
# Useful development tools
|
||||
|
||||
This readme contains some hints on how to set up some optional development tools.
|
||||
|
||||
## ccls
|
||||
|
||||
[ccls](https://github.com/MaskRay/ccls) is a c/c++ language server. It requires some setup
|
||||
to work well. There are different ways to do it but here's what works for me:
|
||||
1. Make a common parent directory for all your common neon projects. (for example, `~/src/neondatabase/`)
|
||||
2. Go to `vendor/postgres-v15`
|
||||
3. Run `make clean && ./configure`
|
||||
4. Install [bear](https://github.com/rizsotto/Bear), and run `bear -- make -j4`
|
||||
5. Copy the generated `compile_commands.json` to `~/src/neondatabase` (or equivalent)
|
||||
6. Run `touch ~/src/neondatabase/.ccls-root` this will make the `compile_commands.json` file discoverable in all subdirectories
|
||||
|
||||
With this setup you will get decent lsp mileage inside the postgres repo, and also any postgres extensions that you put in `~/src/neondatabase/`, like `pg_embedding`, or inside `~/src/neondatabase/neon/pgxn` as well.
|
||||
|
||||
Some additional tips for various IDEs:
|
||||
|
||||
### Emacs
|
||||
|
||||
To improve performance: `(setq lsp-lens-enable nil)`
|
||||
@@ -68,46 +68,14 @@ where
|
||||
/// Response of the /metrics.json API
|
||||
#[derive(Clone, Debug, Default, Serialize)]
|
||||
pub struct ComputeMetrics {
|
||||
/// Time spent waiting in pool
|
||||
pub wait_for_spec_ms: u64,
|
||||
|
||||
/// Time spent checking if safekeepers are synced
|
||||
pub sync_sk_check_ms: u64,
|
||||
|
||||
/// Time spent syncing safekeepers (walproposer.c).
|
||||
/// In most cases this should be zero.
|
||||
pub sync_safekeepers_ms: u64,
|
||||
|
||||
/// Time it took to establish a pg connection to the pageserver.
|
||||
/// This is two roundtrips, so it's a good proxy for compute-pageserver
|
||||
/// latency. The latency is usually 0.2ms, but it's not safe to assume
|
||||
/// that.
|
||||
pub pageserver_connect_micros: u64,
|
||||
|
||||
/// Time to get basebackup from pageserver and write it to disk.
|
||||
pub basebackup_ms: u64,
|
||||
|
||||
/// Compressed size of basebackup received.
|
||||
pub basebackup_bytes: u64,
|
||||
|
||||
/// Time spent starting potgres. This includes initialization of shared
|
||||
/// buffers, preloading extensions, and other pg operations.
|
||||
pub start_postgres_ms: u64,
|
||||
|
||||
/// Time spent applying pg catalog updates that were made in the console
|
||||
/// UI. This should be 0 when startup time matters, since cplane tries
|
||||
/// to do these updates eagerly, and passes the skip_pg_catalog_updates
|
||||
/// when it's safe to skip this step.
|
||||
pub config_ms: u64,
|
||||
|
||||
/// Total time, from when we receive the spec to when we're ready to take
|
||||
/// pg connections.
|
||||
pub total_startup_ms: u64,
|
||||
pub load_ext_ms: u64,
|
||||
pub num_ext_downloaded: u64,
|
||||
pub largest_ext_size: u64, // these are measured in bytes
|
||||
pub total_ext_download_size: u64,
|
||||
pub prep_extensions_ms: u64,
|
||||
pub load_libraries_ms: u64,
|
||||
}
|
||||
|
||||
/// Response of the `/computes/{compute_id}/spec` control-plane API.
|
||||
|
||||
@@ -5,7 +5,7 @@ use chrono::{DateTime, Utc};
|
||||
use rand::Rng;
|
||||
use serde::Serialize;
|
||||
|
||||
#[derive(Serialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
|
||||
#[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
|
||||
#[serde(tag = "type")]
|
||||
pub enum EventType {
|
||||
#[serde(rename = "absolute")]
|
||||
@@ -17,32 +17,6 @@ pub enum EventType {
|
||||
},
|
||||
}
|
||||
|
||||
impl EventType {
|
||||
pub fn absolute_time(&self) -> Option<&DateTime<Utc>> {
|
||||
use EventType::*;
|
||||
match self {
|
||||
Absolute { time } => Some(time),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn incremental_timerange(&self) -> Option<std::ops::Range<&DateTime<Utc>>> {
|
||||
// these can most likely be thought of as Range or RangeFull
|
||||
use EventType::*;
|
||||
match self {
|
||||
Incremental {
|
||||
start_time,
|
||||
stop_time,
|
||||
} => Some(start_time..stop_time),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_incremental(&self) -> bool {
|
||||
matches!(self, EventType::Incremental { .. })
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
|
||||
pub struct Event<Extra> {
|
||||
#[serde(flatten)]
|
||||
@@ -57,7 +31,7 @@ pub struct Event<Extra> {
|
||||
pub extra: Extra,
|
||||
}
|
||||
|
||||
pub fn idempotency_key(node_id: &str) -> String {
|
||||
pub fn idempotency_key(node_id: String) -> String {
|
||||
format!(
|
||||
"{}-{}-{:04}",
|
||||
Utc::now(),
|
||||
@@ -71,6 +45,6 @@ pub const CHUNK_SIZE: usize = 1000;
|
||||
// Just a wrapper around a slice of events
|
||||
// to serialize it as `{"events" : [ ] }
|
||||
#[derive(serde::Serialize)]
|
||||
pub struct EventChunk<'a, T: Clone> {
|
||||
pub events: std::borrow::Cow<'a, [T]>,
|
||||
pub struct EventChunk<'a, T> {
|
||||
pub events: &'a [T],
|
||||
}
|
||||
|
||||
@@ -6,7 +6,6 @@ use once_cell::sync::Lazy;
|
||||
use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec};
|
||||
pub use prometheus::opts;
|
||||
pub use prometheus::register;
|
||||
pub use prometheus::Error;
|
||||
pub use prometheus::{core, default_registry, proto};
|
||||
pub use prometheus::{exponential_buckets, linear_buckets};
|
||||
pub use prometheus::{register_counter_vec, Counter, CounterVec};
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
//! Helpers for observing duration on `HistogramVec` / `CounterVec` / `GaugeVec` / `MetricVec<T>`.
|
||||
//! Helpers for observing duration on HistogramVec / CounterVec / GaugeVec / MetricVec<T>.
|
||||
|
||||
use std::{future::Future, time::Instant};
|
||||
|
||||
|
||||
@@ -9,7 +9,6 @@ use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use strum_macros;
|
||||
use utils::{
|
||||
completion,
|
||||
history_buffer::HistoryBufferWithDropCounter,
|
||||
id::{NodeId, TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
@@ -77,12 +76,7 @@ pub enum TenantState {
|
||||
/// system is being shut down.
|
||||
///
|
||||
/// Transitions out of this state are possible through `set_broken()`.
|
||||
Stopping {
|
||||
// Because of https://github.com/serde-rs/serde/issues/2105 this has to be a named field,
|
||||
// otherwise it will not be skipped during deserialization
|
||||
#[serde(skip)]
|
||||
progress: completion::Barrier,
|
||||
},
|
||||
Stopping,
|
||||
/// The tenant is recognized by the pageserver, but can no longer be used for
|
||||
/// any operations.
|
||||
///
|
||||
@@ -124,7 +118,7 @@ impl TenantState {
|
||||
// Why is Stopping a Maybe case? Because, during pageserver shutdown,
|
||||
// we set the Stopping state irrespective of whether the tenant
|
||||
// has finished attaching or not.
|
||||
Self::Stopping { .. } => Maybe,
|
||||
Self::Stopping => Maybe,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -417,16 +411,12 @@ pub struct LayerResidenceEvent {
|
||||
pub reason: LayerResidenceEventReason,
|
||||
}
|
||||
|
||||
/// The reason for recording a given [`LayerResidenceEvent`].
|
||||
/// The reason for recording a given [`ResidenceEvent`].
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
|
||||
pub enum LayerResidenceEventReason {
|
||||
/// The layer map is being populated, e.g. during timeline load or attach.
|
||||
/// This includes [`RemoteLayer`] objects created in [`reconcile_with_remote`].
|
||||
/// We need to record such events because there is no persistent storage for the events.
|
||||
///
|
||||
// https://github.com/rust-lang/rust/issues/74481
|
||||
/// [`RemoteLayer`]: ../../tenant/storage_layer/struct.RemoteLayer.html
|
||||
/// [`reconcile_with_remote`]: ../../tenant/struct.Timeline.html#method.reconcile_with_remote
|
||||
LayerLoad,
|
||||
/// We just created the layer (e.g., freeze_and_flush or compaction).
|
||||
/// Such layers are always [`LayerResidenceStatus::Resident`].
|
||||
@@ -934,13 +924,7 @@ mod tests {
|
||||
"Activating",
|
||||
),
|
||||
(line!(), TenantState::Active, "Active"),
|
||||
(
|
||||
line!(),
|
||||
TenantState::Stopping {
|
||||
progress: utils::completion::Barrier::default(),
|
||||
},
|
||||
"Stopping",
|
||||
),
|
||||
(line!(), TenantState::Stopping, "Stopping"),
|
||||
(
|
||||
line!(),
|
||||
TenantState::Broken {
|
||||
|
||||
@@ -60,9 +60,8 @@ impl Ord for RelTag {
|
||||
|
||||
/// Display RelTag in the same format that's used in most PostgreSQL debug messages:
|
||||
///
|
||||
/// ```text
|
||||
/// <spcnode>/<dbnode>/<relnode>[_fsm|_vm|_init]
|
||||
/// ```
|
||||
///
|
||||
impl fmt::Display for RelTag {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
if let Some(forkname) = forknumber_to_name(self.forknum) {
|
||||
|
||||
@@ -57,9 +57,9 @@ pub fn slru_may_delete_clogsegment(segpage: u32, cutoff_page: u32) -> bool {
|
||||
// Multixact utils
|
||||
|
||||
pub fn mx_offset_to_flags_offset(xid: MultiXactId) -> usize {
|
||||
((xid / pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP as u32)
|
||||
% pg_constants::MULTIXACT_MEMBERGROUPS_PER_PAGE as u32
|
||||
* pg_constants::MULTIXACT_MEMBERGROUP_SIZE as u32) as usize
|
||||
((xid / pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP as u32) as u16
|
||||
% pg_constants::MULTIXACT_MEMBERGROUPS_PER_PAGE
|
||||
* pg_constants::MULTIXACT_MEMBERGROUP_SIZE) as usize
|
||||
}
|
||||
|
||||
pub fn mx_offset_to_flags_bitshift(xid: MultiXactId) -> u16 {
|
||||
@@ -81,41 +81,3 @@ fn mx_offset_to_member_page(xid: u32) -> u32 {
|
||||
pub fn mx_offset_to_member_segment(xid: u32) -> i32 {
|
||||
(mx_offset_to_member_page(xid) / pg_constants::SLRU_PAGES_PER_SEGMENT) as i32
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_multixid_calc() {
|
||||
// Check that the mx_offset_* functions produce the same values as the
|
||||
// corresponding PostgreSQL C macros (MXOffsetTo*). These test values
|
||||
// were generated by calling the PostgreSQL macros with a little C
|
||||
// program.
|
||||
assert_eq!(mx_offset_to_member_segment(0), 0);
|
||||
assert_eq!(mx_offset_to_member_page(0), 0);
|
||||
assert_eq!(mx_offset_to_flags_offset(0), 0);
|
||||
assert_eq!(mx_offset_to_flags_bitshift(0), 0);
|
||||
assert_eq!(mx_offset_to_member_offset(0), 4);
|
||||
assert_eq!(mx_offset_to_member_segment(1), 0);
|
||||
assert_eq!(mx_offset_to_member_page(1), 0);
|
||||
assert_eq!(mx_offset_to_flags_offset(1), 0);
|
||||
assert_eq!(mx_offset_to_flags_bitshift(1), 8);
|
||||
assert_eq!(mx_offset_to_member_offset(1), 8);
|
||||
assert_eq!(mx_offset_to_member_segment(123456789), 2358);
|
||||
assert_eq!(mx_offset_to_member_page(123456789), 75462);
|
||||
assert_eq!(mx_offset_to_flags_offset(123456789), 4780);
|
||||
assert_eq!(mx_offset_to_flags_bitshift(123456789), 8);
|
||||
assert_eq!(mx_offset_to_member_offset(123456789), 4788);
|
||||
assert_eq!(mx_offset_to_member_segment(u32::MAX - 1), 82040);
|
||||
assert_eq!(mx_offset_to_member_page(u32::MAX - 1), 2625285);
|
||||
assert_eq!(mx_offset_to_flags_offset(u32::MAX - 1), 5160);
|
||||
assert_eq!(mx_offset_to_flags_bitshift(u32::MAX - 1), 16);
|
||||
assert_eq!(mx_offset_to_member_offset(u32::MAX - 1), 5172);
|
||||
assert_eq!(mx_offset_to_member_segment(u32::MAX), 82040);
|
||||
assert_eq!(mx_offset_to_member_page(u32::MAX), 2625285);
|
||||
assert_eq!(mx_offset_to_flags_offset(u32::MAX), 5160);
|
||||
assert_eq!(mx_offset_to_flags_bitshift(u32::MAX), 24);
|
||||
assert_eq!(mx_offset_to_member_offset(u32::MAX), 5176);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -49,16 +49,14 @@ pub fn forknumber_to_name(forknum: u8) -> Option<&'static str> {
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Parse a filename of a relation file. Returns (relfilenode, forknum, segno) tuple.
|
||||
///
|
||||
/// Formats:
|
||||
///
|
||||
/// ```text
|
||||
/// <oid>
|
||||
/// <oid>_<fork name>
|
||||
/// <oid>.<segment number>
|
||||
/// <oid>_<fork name>.<segment number>
|
||||
/// ```
|
||||
///
|
||||
/// See functions relpath() and _mdfd_segpath() in PostgreSQL sources.
|
||||
///
|
||||
|
||||
@@ -5,11 +5,11 @@
|
||||
//! It is similar to what tokio_util::codec::Framed with appropriate codec
|
||||
//! provides, but `FramedReader` and `FramedWriter` read/write parts can be used
|
||||
//! separately without using split from futures::stream::StreamExt (which
|
||||
//! allocates a [Box] in polling internally). tokio::io::split is used for splitting
|
||||
//! allocates box[1] in polling internally). tokio::io::split is used for splitting
|
||||
//! instead. Plus we customize error messages more than a single type for all io
|
||||
//! calls.
|
||||
//!
|
||||
//! [Box]: https://docs.rs/futures-util/0.3.26/src/futures_util/lock/bilock.rs.html#107
|
||||
//! [1] https://docs.rs/futures-util/0.3.26/src/futures_util/lock/bilock.rs.html#107
|
||||
use bytes::{Buf, BytesMut};
|
||||
use std::{
|
||||
future::Future,
|
||||
@@ -117,7 +117,7 @@ impl<S: AsyncWrite + Unpin> Framed<S> {
|
||||
impl<S: AsyncRead + AsyncWrite + Unpin> Framed<S> {
|
||||
/// Split into owned read and write parts. Beware of potential issues with
|
||||
/// using halves in different tasks on TLS stream:
|
||||
/// <https://github.com/tokio-rs/tls/issues/40>
|
||||
/// https://github.com/tokio-rs/tls/issues/40
|
||||
pub fn split(self) -> (FramedReader<S>, FramedWriter<S>) {
|
||||
let (read_half, write_half) = tokio::io::split(self.stream);
|
||||
let reader = FramedReader {
|
||||
|
||||
@@ -179,7 +179,7 @@ pub struct FeExecuteMessage {
|
||||
#[derive(Debug)]
|
||||
pub struct FeCloseMessage;
|
||||
|
||||
/// An error occurred while parsing or serializing raw stream into Postgres
|
||||
/// An error occured while parsing or serializing raw stream into Postgres
|
||||
/// messages.
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum ProtocolError {
|
||||
@@ -934,15 +934,6 @@ impl<'a> BeMessage<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
fn terminate_code(code: &[u8; 5]) -> [u8; 6] {
|
||||
let mut terminated = [0; 6];
|
||||
for (i, &elem) in code.iter().enumerate() {
|
||||
terminated[i] = elem;
|
||||
}
|
||||
|
||||
terminated
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -974,3 +965,12 @@ mod tests {
|
||||
assert_eq!(split_options(¶ms), ["foo bar", " \\", "baz ", "lol"]);
|
||||
}
|
||||
}
|
||||
|
||||
fn terminate_code(code: &[u8; 5]) -> [u8; 6] {
|
||||
let mut terminated = [0; 6];
|
||||
for (i, &elem) in code.iter().enumerate() {
|
||||
terminated[i] = elem;
|
||||
}
|
||||
|
||||
terminated
|
||||
}
|
||||
|
||||
@@ -20,7 +20,6 @@ tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
|
||||
tokio-util.workspace = true
|
||||
toml_edit.workspace = true
|
||||
tracing.workspace = true
|
||||
scopeguard.workspace = true
|
||||
metrics.workspace = true
|
||||
utils.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
|
||||
@@ -34,12 +34,12 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS: usize = 50;
|
||||
pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
|
||||
/// Currently, sync happens with AWS S3, that has two limits on requests per second:
|
||||
/// ~200 RPS for IAM services
|
||||
/// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
|
||||
/// https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html
|
||||
/// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
|
||||
/// <https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/>
|
||||
/// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/
|
||||
pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
|
||||
/// No limits on the client side, which currenltly means 1000 for AWS S3.
|
||||
/// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
|
||||
/// https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax
|
||||
pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
|
||||
|
||||
const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
|
||||
@@ -50,12 +50,6 @@ const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||
pub struct RemotePath(PathBuf);
|
||||
|
||||
impl std::fmt::Display for RemotePath {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.0.display())
|
||||
}
|
||||
}
|
||||
|
||||
impl RemotePath {
|
||||
pub fn new(relative_path: &Path) -> anyhow::Result<Self> {
|
||||
anyhow::ensure!(
|
||||
@@ -65,10 +59,6 @@ impl RemotePath {
|
||||
Ok(Self(relative_path.to_path_buf()))
|
||||
}
|
||||
|
||||
pub fn from_string(relative_path: &str) -> anyhow::Result<Self> {
|
||||
Self::new(Path::new(relative_path))
|
||||
}
|
||||
|
||||
pub fn with_base(&self, base_path: &Path) -> PathBuf {
|
||||
base_path.join(&self.0)
|
||||
}
|
||||
|
||||
@@ -151,7 +151,10 @@ impl RemoteStorage for LocalFs {
|
||||
let mut files = vec![];
|
||||
let mut directory_queue = vec![full_path.clone()];
|
||||
|
||||
while let Some(cur_folder) = directory_queue.pop() {
|
||||
while !directory_queue.is_empty() {
|
||||
let cur_folder = directory_queue
|
||||
.pop()
|
||||
.expect("queue cannot be empty: we just checked");
|
||||
let mut entries = fs::read_dir(cur_folder.clone()).await?;
|
||||
while let Some(entry) = entries.next_entry().await? {
|
||||
let file_name: PathBuf = entry.file_name().into();
|
||||
|
||||
@@ -10,7 +10,6 @@ use anyhow::Context;
|
||||
use aws_config::{
|
||||
environment::credentials::EnvironmentVariableCredentialsProvider,
|
||||
imds::credentials::ImdsCredentialsProvider, meta::credentials::CredentialsProviderChain,
|
||||
provider_config::ProviderConfig, web_identity_token::WebIdentityTokenCredentialsProvider,
|
||||
};
|
||||
use aws_credential_types::cache::CredentialsCache;
|
||||
use aws_sdk_s3::{
|
||||
@@ -23,7 +22,6 @@ use aws_sdk_s3::{
|
||||
};
|
||||
use aws_smithy_http::body::SdkBody;
|
||||
use hyper::Body;
|
||||
use scopeguard::ScopeGuard;
|
||||
use tokio::{
|
||||
io::{self, AsyncRead},
|
||||
sync::Semaphore,
|
||||
@@ -38,9 +36,82 @@ use crate::{
|
||||
|
||||
const MAX_DELETE_OBJECTS_REQUEST_SIZE: usize = 1000;
|
||||
|
||||
pub(super) mod metrics;
|
||||
pub(super) mod metrics {
|
||||
use metrics::{register_int_counter_vec, IntCounterVec};
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
use self::metrics::{AttemptOutcome, RequestKind};
|
||||
static S3_REQUESTS_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"remote_storage_s3_requests_count",
|
||||
"Number of s3 requests of particular type",
|
||||
&["request_type"],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static S3_REQUESTS_FAIL_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"remote_storage_s3_failures_count",
|
||||
"Number of failed s3 requests of particular type",
|
||||
&["request_type"],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub fn inc_get_object() {
|
||||
S3_REQUESTS_COUNT.with_label_values(&["get_object"]).inc();
|
||||
}
|
||||
|
||||
pub fn inc_get_object_fail() {
|
||||
S3_REQUESTS_FAIL_COUNT
|
||||
.with_label_values(&["get_object"])
|
||||
.inc();
|
||||
}
|
||||
|
||||
pub fn inc_put_object() {
|
||||
S3_REQUESTS_COUNT.with_label_values(&["put_object"]).inc();
|
||||
}
|
||||
|
||||
pub fn inc_put_object_fail() {
|
||||
S3_REQUESTS_FAIL_COUNT
|
||||
.with_label_values(&["put_object"])
|
||||
.inc();
|
||||
}
|
||||
|
||||
pub fn inc_delete_object() {
|
||||
S3_REQUESTS_COUNT
|
||||
.with_label_values(&["delete_object"])
|
||||
.inc();
|
||||
}
|
||||
|
||||
pub fn inc_delete_objects(count: u64) {
|
||||
S3_REQUESTS_COUNT
|
||||
.with_label_values(&["delete_object"])
|
||||
.inc_by(count);
|
||||
}
|
||||
|
||||
pub fn inc_delete_object_fail() {
|
||||
S3_REQUESTS_FAIL_COUNT
|
||||
.with_label_values(&["delete_object"])
|
||||
.inc();
|
||||
}
|
||||
|
||||
pub fn inc_delete_objects_fail(count: u64) {
|
||||
S3_REQUESTS_FAIL_COUNT
|
||||
.with_label_values(&["delete_object"])
|
||||
.inc_by(count);
|
||||
}
|
||||
|
||||
pub fn inc_list_objects() {
|
||||
S3_REQUESTS_COUNT.with_label_values(&["list_objects"]).inc();
|
||||
}
|
||||
|
||||
pub fn inc_list_objects_fail() {
|
||||
S3_REQUESTS_FAIL_COUNT
|
||||
.with_label_values(&["list_objects"])
|
||||
.inc();
|
||||
}
|
||||
}
|
||||
|
||||
/// AWS S3 storage.
|
||||
pub struct S3Bucket {
|
||||
@@ -68,29 +139,18 @@ impl S3Bucket {
|
||||
aws_config.bucket_name
|
||||
);
|
||||
|
||||
let region = Some(Region::new(aws_config.bucket_region.clone()));
|
||||
|
||||
let credentials_provider = {
|
||||
// uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
|
||||
CredentialsProviderChain::first_try(
|
||||
"env",
|
||||
EnvironmentVariableCredentialsProvider::new(),
|
||||
)
|
||||
// uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
|
||||
// needed to access remote extensions bucket
|
||||
.or_else("token", {
|
||||
let provider_conf = ProviderConfig::without_region().with_region(region.clone());
|
||||
|
||||
WebIdentityTokenCredentialsProvider::builder()
|
||||
.configure(&provider_conf)
|
||||
.build()
|
||||
})
|
||||
// uses imds v2
|
||||
.or_else("imds", ImdsCredentialsProvider::builder().build())
|
||||
};
|
||||
|
||||
let mut config_builder = Config::builder()
|
||||
.region(region)
|
||||
.region(Region::new(aws_config.bucket_region.clone()))
|
||||
.credentials_cache(CredentialsCache::lazy())
|
||||
.credentials_provider(credentials_provider);
|
||||
|
||||
@@ -140,59 +200,26 @@ impl S3Bucket {
|
||||
)
|
||||
}
|
||||
|
||||
pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
|
||||
assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||
let path_string = path
|
||||
.get_path()
|
||||
.to_string_lossy()
|
||||
.trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR)
|
||||
.to_string();
|
||||
match &self.prefix_in_bucket {
|
||||
Some(prefix) => prefix.clone() + "/" + &path_string,
|
||||
None => path_string,
|
||||
fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
|
||||
let mut full_path = self.prefix_in_bucket.clone().unwrap_or_default();
|
||||
for segment in path.0.iter() {
|
||||
full_path.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||
full_path.push_str(segment.to_str().unwrap_or_default());
|
||||
}
|
||||
full_path
|
||||
}
|
||||
|
||||
async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
|
||||
let started_at = start_counting_cancelled_wait(kind);
|
||||
let permit = self
|
||||
.concurrency_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.expect("semaphore is never closed");
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
.wait_seconds
|
||||
.observe_elapsed(kind, started_at);
|
||||
|
||||
permit
|
||||
}
|
||||
|
||||
async fn owned_permit(&self, kind: RequestKind) -> tokio::sync::OwnedSemaphorePermit {
|
||||
let started_at = start_counting_cancelled_wait(kind);
|
||||
async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
|
||||
let permit = self
|
||||
.concurrency_limiter
|
||||
.clone()
|
||||
.acquire_owned()
|
||||
.await
|
||||
.expect("semaphore is never closed");
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
.wait_seconds
|
||||
.observe_elapsed(kind, started_at);
|
||||
permit
|
||||
}
|
||||
|
||||
async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
|
||||
let kind = RequestKind::Get;
|
||||
let permit = self.owned_permit(kind).await;
|
||||
.context("Concurrency limiter semaphore got closed during S3 download")
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
metrics::inc_get_object();
|
||||
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let get_object = self
|
||||
.client
|
||||
.get_object()
|
||||
@@ -202,34 +229,26 @@ impl S3Bucket {
|
||||
.send()
|
||||
.await;
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
|
||||
if get_object.is_err() {
|
||||
metrics::inc_get_object_fail();
|
||||
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
kind,
|
||||
AttemptOutcome::Err,
|
||||
started_at,
|
||||
);
|
||||
}
|
||||
|
||||
match get_object {
|
||||
Ok(object_output) => {
|
||||
let metadata = object_output.metadata().cloned().map(StorageMetadata);
|
||||
Ok(Download {
|
||||
metadata,
|
||||
download_stream: Box::pin(io::BufReader::new(TimedDownload::new(
|
||||
started_at,
|
||||
RatelimitedAsyncRead::new(permit, object_output.body.into_async_read()),
|
||||
download_stream: Box::pin(io::BufReader::new(RatelimitedAsyncRead::new(
|
||||
permit,
|
||||
object_output.body.into_async_read(),
|
||||
))),
|
||||
})
|
||||
}
|
||||
Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
|
||||
Err(DownloadError::NotFound)
|
||||
}
|
||||
Err(e) => Err(DownloadError::Other(
|
||||
anyhow::Error::new(e).context("download s3 object"),
|
||||
)),
|
||||
Err(e) => {
|
||||
metrics::inc_get_object_fail();
|
||||
Err(DownloadError::Other(anyhow::anyhow!(
|
||||
"Failed to download S3 object: {e}"
|
||||
)))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -260,54 +279,6 @@ impl<S: AsyncRead> AsyncRead for RatelimitedAsyncRead<S> {
|
||||
}
|
||||
}
|
||||
|
||||
pin_project_lite::pin_project! {
|
||||
/// Times and tracks the outcome of the request.
|
||||
struct TimedDownload<S> {
|
||||
started_at: std::time::Instant,
|
||||
outcome: metrics::AttemptOutcome,
|
||||
#[pin]
|
||||
inner: S
|
||||
}
|
||||
|
||||
impl<S> PinnedDrop for TimedDownload<S> {
|
||||
fn drop(mut this: Pin<&mut Self>) {
|
||||
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: AsyncRead> TimedDownload<S> {
|
||||
fn new(started_at: std::time::Instant, inner: S) -> Self {
|
||||
TimedDownload {
|
||||
started_at,
|
||||
outcome: metrics::AttemptOutcome::Cancelled,
|
||||
inner,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: AsyncRead> AsyncRead for TimedDownload<S> {
|
||||
fn poll_read(
|
||||
self: std::pin::Pin<&mut Self>,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
buf: &mut io::ReadBuf<'_>,
|
||||
) -> std::task::Poll<std::io::Result<()>> {
|
||||
let this = self.project();
|
||||
let before = buf.filled().len();
|
||||
let read = std::task::ready!(this.inner.poll_read(cx, buf));
|
||||
|
||||
let read_eof = buf.filled().len() == before;
|
||||
|
||||
match read {
|
||||
Ok(()) if read_eof => *this.outcome = AttemptOutcome::Ok,
|
||||
Ok(()) => { /* still in progress */ }
|
||||
Err(_) => *this.outcome = AttemptOutcome::Err,
|
||||
}
|
||||
|
||||
std::task::Poll::Ready(read)
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for S3Bucket {
|
||||
/// See the doc for `RemoteStorage::list_prefixes`
|
||||
@@ -316,8 +287,6 @@ impl RemoteStorage for S3Bucket {
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
let kind = RequestKind::List;
|
||||
|
||||
// get the passed prefix or if it is not set use prefix_in_bucket value
|
||||
let list_prefix = prefix
|
||||
.map(|p| self.relative_path_to_s3_object(p))
|
||||
@@ -334,11 +303,15 @@ impl RemoteStorage for S3Bucket {
|
||||
let mut document_keys = Vec::new();
|
||||
|
||||
let mut continuation_token = None;
|
||||
|
||||
loop {
|
||||
let _guard = self.permit(kind).await;
|
||||
let _guard = self
|
||||
.concurrency_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.context("Concurrency limiter semaphore got closed during S3 list")
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
metrics::inc_list_objects();
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let fetch_response = self
|
||||
.client
|
||||
@@ -355,15 +328,7 @@ impl RemoteStorage for S3Bucket {
|
||||
e
|
||||
})
|
||||
.context("Failed to list S3 prefixes")
|
||||
.map_err(DownloadError::Other);
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
|
||||
metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &fetch_response, started_at);
|
||||
|
||||
let fetch_response = fetch_response?;
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
document_keys.extend(
|
||||
fetch_response
|
||||
@@ -373,10 +338,10 @@ impl RemoteStorage for S3Bucket {
|
||||
.filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
|
||||
);
|
||||
|
||||
continuation_token = match fetch_response.next_continuation_token {
|
||||
Some(new_token) => Some(new_token),
|
||||
match fetch_response.next_continuation_token {
|
||||
Some(new_token) => continuation_token = Some(new_token),
|
||||
None => break,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
Ok(document_keys)
|
||||
@@ -384,19 +349,27 @@ impl RemoteStorage for S3Bucket {
|
||||
|
||||
/// See the doc for `RemoteStorage::list_files`
|
||||
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
||||
let kind = RequestKind::List;
|
||||
|
||||
let folder_name = folder
|
||||
let mut folder_name = folder
|
||||
.map(|p| self.relative_path_to_s3_object(p))
|
||||
.or_else(|| self.prefix_in_bucket.clone());
|
||||
|
||||
// remove leading "/" if one exists
|
||||
if let Some(folder_name_slash) = folder_name.clone() {
|
||||
if folder_name_slash.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
|
||||
folder_name = Some(folder_name_slash[1..].to_string());
|
||||
}
|
||||
}
|
||||
|
||||
// AWS may need to break the response into several parts
|
||||
let mut continuation_token = None;
|
||||
let mut all_files = vec![];
|
||||
loop {
|
||||
let _guard = self.permit(kind).await;
|
||||
let _guard = self
|
||||
.concurrency_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.context("Concurrency limiter semaphore got closed during S3 list_files")?;
|
||||
metrics::inc_list_objects();
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let response = self
|
||||
.client
|
||||
@@ -411,14 +384,7 @@ impl RemoteStorage for S3Bucket {
|
||||
metrics::inc_list_objects_fail();
|
||||
e
|
||||
})
|
||||
.context("Failed to list files in S3 bucket");
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &response, started_at);
|
||||
|
||||
let response = response?;
|
||||
.context("Failed to list files in S3 bucket")?;
|
||||
|
||||
for object in response.contents().unwrap_or_default() {
|
||||
let object_path = object.key().expect("response does not contain a key");
|
||||
@@ -440,17 +406,18 @@ impl RemoteStorage for S3Bucket {
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
) -> anyhow::Result<()> {
|
||||
let kind = RequestKind::Put;
|
||||
let _guard = self.permit(kind).await;
|
||||
let _guard = self
|
||||
.concurrency_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.context("Concurrency limiter semaphore got closed during S3 upload")?;
|
||||
|
||||
metrics::inc_put_object();
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let body = Body::wrap_stream(ReaderStream::new(from));
|
||||
let bytes_stream = ByteStream::new(SdkBody::from(body));
|
||||
|
||||
let res = self
|
||||
.client
|
||||
self.client
|
||||
.put_object()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.key(self.relative_path_to_s3_object(to))
|
||||
@@ -462,25 +429,15 @@ impl RemoteStorage for S3Bucket {
|
||||
.map_err(|e| {
|
||||
metrics::inc_put_object_fail();
|
||||
e
|
||||
});
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &res, started_at);
|
||||
|
||||
res?;
|
||||
|
||||
})?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
|
||||
// if prefix is not none then download file `prefix/from`
|
||||
// if prefix is none then download file `from`
|
||||
self.download_object(GetObjectRequest {
|
||||
bucket: self.bucket_name.clone(),
|
||||
key: self.relative_path_to_s3_object(from),
|
||||
range: None,
|
||||
..GetObjectRequest::default()
|
||||
})
|
||||
.await
|
||||
}
|
||||
@@ -507,8 +464,11 @@ impl RemoteStorage for S3Bucket {
|
||||
.await
|
||||
}
|
||||
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
|
||||
let kind = RequestKind::Delete;
|
||||
let _guard = self.permit(kind).await;
|
||||
let _guard = self
|
||||
.concurrency_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.context("Concurrency limiter semaphore got closed during S3 delete")?;
|
||||
|
||||
let mut delete_objects = Vec::with_capacity(paths.len());
|
||||
for path in paths {
|
||||
@@ -520,7 +480,6 @@ impl RemoteStorage for S3Bucket {
|
||||
|
||||
for chunk in delete_objects.chunks(MAX_DELETE_OBJECTS_REQUEST_SIZE) {
|
||||
metrics::inc_delete_objects(chunk.len() as u64);
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let resp = self
|
||||
.client
|
||||
@@ -530,11 +489,6 @@ impl RemoteStorage for S3Bucket {
|
||||
.send()
|
||||
.await;
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &resp, started_at);
|
||||
|
||||
match resp {
|
||||
Ok(resp) => {
|
||||
if let Some(errors) = resp.errors {
|
||||
@@ -555,14 +509,15 @@ impl RemoteStorage for S3Bucket {
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
|
||||
let kind = RequestKind::Delete;
|
||||
let _guard = self.permit(kind).await;
|
||||
let _guard = self
|
||||
.concurrency_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.context("Concurrency limiter semaphore got closed during S3 delete")?;
|
||||
|
||||
metrics::inc_delete_object();
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let res = self
|
||||
.client
|
||||
self.client
|
||||
.delete_object()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.key(self.relative_path_to_s3_object(path))
|
||||
@@ -571,97 +526,7 @@ impl RemoteStorage for S3Bucket {
|
||||
.map_err(|e| {
|
||||
metrics::inc_delete_object_fail();
|
||||
e
|
||||
});
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &res, started_at);
|
||||
|
||||
res?;
|
||||
|
||||
})?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
|
||||
fn start_counting_cancelled_wait(
|
||||
kind: RequestKind,
|
||||
) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
|
||||
scopeguard::guard_on_success(std::time::Instant::now(), move |_| {
|
||||
metrics::BUCKET_METRICS.cancelled_waits.get(kind).inc()
|
||||
})
|
||||
}
|
||||
|
||||
/// On drop (cancellation) add time to [`metrics::BucketMetrics::req_seconds`].
|
||||
fn start_measuring_requests(
|
||||
kind: RequestKind,
|
||||
) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
|
||||
scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| {
|
||||
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
kind,
|
||||
AttemptOutcome::Cancelled,
|
||||
started_at,
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::num::NonZeroUsize;
|
||||
use std::path::Path;
|
||||
|
||||
use crate::{RemotePath, S3Bucket, S3Config};
|
||||
|
||||
#[test]
|
||||
fn relative_path() {
|
||||
let all_paths = vec!["", "some/path", "some/path/"];
|
||||
let all_paths: Vec<RemotePath> = all_paths
|
||||
.iter()
|
||||
.map(|x| RemotePath::new(Path::new(x)).expect("bad path"))
|
||||
.collect();
|
||||
let prefixes = [
|
||||
None,
|
||||
Some(""),
|
||||
Some("test/prefix"),
|
||||
Some("test/prefix/"),
|
||||
Some("/test/prefix/"),
|
||||
];
|
||||
let expected_outputs = vec![
|
||||
vec!["", "some/path", "some/path"],
|
||||
vec!["/", "/some/path", "/some/path"],
|
||||
vec![
|
||||
"test/prefix/",
|
||||
"test/prefix/some/path",
|
||||
"test/prefix/some/path",
|
||||
],
|
||||
vec![
|
||||
"test/prefix/",
|
||||
"test/prefix/some/path",
|
||||
"test/prefix/some/path",
|
||||
],
|
||||
vec![
|
||||
"test/prefix/",
|
||||
"test/prefix/some/path",
|
||||
"test/prefix/some/path",
|
||||
],
|
||||
];
|
||||
|
||||
for (prefix_idx, prefix) in prefixes.iter().enumerate() {
|
||||
let config = S3Config {
|
||||
bucket_name: "bucket".to_owned(),
|
||||
bucket_region: "region".to_owned(),
|
||||
prefix_in_bucket: prefix.map(str::to_string),
|
||||
endpoint: None,
|
||||
concurrency_limit: NonZeroUsize::new(100).unwrap(),
|
||||
max_keys_per_list_response: Some(5),
|
||||
};
|
||||
let storage = S3Bucket::new(&config).expect("remote storage init");
|
||||
for (test_path_idx, test_path) in all_paths.iter().enumerate() {
|
||||
let result = storage.relative_path_to_s3_object(test_path);
|
||||
let expected = expected_outputs[prefix_idx][test_path_idx];
|
||||
assert_eq!(result, expected);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,243 +0,0 @@
|
||||
use metrics::{register_histogram_vec, register_int_counter_vec, Histogram, IntCounter};
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
pub(super) static BUCKET_METRICS: Lazy<BucketMetrics> = Lazy::new(Default::default);
|
||||
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub(super) enum RequestKind {
|
||||
Get = 0,
|
||||
Put = 1,
|
||||
Delete = 2,
|
||||
List = 3,
|
||||
}
|
||||
|
||||
use RequestKind::*;
|
||||
|
||||
impl RequestKind {
|
||||
const fn as_str(&self) -> &'static str {
|
||||
match self {
|
||||
Get => "get_object",
|
||||
Put => "put_object",
|
||||
Delete => "delete_object",
|
||||
List => "list_objects",
|
||||
}
|
||||
}
|
||||
const fn as_index(&self) -> usize {
|
||||
*self as usize
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) struct RequestTyped<C>([C; 4]);
|
||||
|
||||
impl<C> RequestTyped<C> {
|
||||
pub(super) fn get(&self, kind: RequestKind) -> &C {
|
||||
&self.0[kind.as_index()]
|
||||
}
|
||||
|
||||
fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
|
||||
use RequestKind::*;
|
||||
let mut it = [Get, Put, Delete, List].into_iter();
|
||||
let arr = std::array::from_fn::<C, 4, _>(|index| {
|
||||
let next = it.next().unwrap();
|
||||
assert_eq!(index, next.as_index());
|
||||
f(next)
|
||||
});
|
||||
|
||||
if let Some(next) = it.next() {
|
||||
panic!("unexpected {next:?}");
|
||||
}
|
||||
|
||||
RequestTyped(arr)
|
||||
}
|
||||
}
|
||||
|
||||
impl RequestTyped<Histogram> {
|
||||
pub(super) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) {
|
||||
self.get(kind).observe(started_at.elapsed().as_secs_f64())
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) struct PassFailCancelledRequestTyped<C> {
|
||||
success: RequestTyped<C>,
|
||||
fail: RequestTyped<C>,
|
||||
cancelled: RequestTyped<C>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub(super) enum AttemptOutcome {
|
||||
Ok,
|
||||
Err,
|
||||
Cancelled,
|
||||
}
|
||||
|
||||
impl<T, E> From<&Result<T, E>> for AttemptOutcome {
|
||||
fn from(value: &Result<T, E>) -> Self {
|
||||
match value {
|
||||
Ok(_) => AttemptOutcome::Ok,
|
||||
Err(_) => AttemptOutcome::Err,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AttemptOutcome {
|
||||
pub(super) fn as_str(&self) -> &'static str {
|
||||
match self {
|
||||
AttemptOutcome::Ok => "ok",
|
||||
AttemptOutcome::Err => "err",
|
||||
AttemptOutcome::Cancelled => "cancelled",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<C> PassFailCancelledRequestTyped<C> {
|
||||
pub(super) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C {
|
||||
let target = match outcome {
|
||||
AttemptOutcome::Ok => &self.success,
|
||||
AttemptOutcome::Err => &self.fail,
|
||||
AttemptOutcome::Cancelled => &self.cancelled,
|
||||
};
|
||||
target.get(kind)
|
||||
}
|
||||
|
||||
fn build_with(mut f: impl FnMut(RequestKind, AttemptOutcome) -> C) -> Self {
|
||||
let success = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Ok));
|
||||
let fail = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Err));
|
||||
let cancelled = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Cancelled));
|
||||
|
||||
PassFailCancelledRequestTyped {
|
||||
success,
|
||||
fail,
|
||||
cancelled,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PassFailCancelledRequestTyped<Histogram> {
|
||||
pub(super) fn observe_elapsed(
|
||||
&self,
|
||||
kind: RequestKind,
|
||||
outcome: impl Into<AttemptOutcome>,
|
||||
started_at: std::time::Instant,
|
||||
) {
|
||||
self.get(kind, outcome.into())
|
||||
.observe(started_at.elapsed().as_secs_f64())
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) struct BucketMetrics {
|
||||
/// Total requests attempted
|
||||
// TODO: remove after next release and migrate dashboards to `sum by (result) (remote_storage_s3_requests_count)`
|
||||
requests: RequestTyped<IntCounter>,
|
||||
/// Subset of attempted requests failed
|
||||
// TODO: remove after next release and migrate dashboards to `remote_storage_s3_requests_count{result="err"}`
|
||||
failed: RequestTyped<IntCounter>,
|
||||
|
||||
pub(super) req_seconds: PassFailCancelledRequestTyped<Histogram>,
|
||||
pub(super) wait_seconds: RequestTyped<Histogram>,
|
||||
|
||||
/// Track how many semaphore awaits were cancelled per request type.
|
||||
///
|
||||
/// This is in case cancellations are happening more than expected.
|
||||
pub(super) cancelled_waits: RequestTyped<IntCounter>,
|
||||
}
|
||||
|
||||
impl Default for BucketMetrics {
|
||||
fn default() -> Self {
|
||||
let requests = register_int_counter_vec!(
|
||||
"remote_storage_s3_requests_count",
|
||||
"Number of s3 requests of particular type",
|
||||
&["request_type"],
|
||||
)
|
||||
.expect("failed to define a metric");
|
||||
let requests =
|
||||
RequestTyped::build_with(|kind| requests.with_label_values(&[kind.as_str()]));
|
||||
|
||||
let failed = register_int_counter_vec!(
|
||||
"remote_storage_s3_failures_count",
|
||||
"Number of failed s3 requests of particular type",
|
||||
&["request_type"],
|
||||
)
|
||||
.expect("failed to define a metric");
|
||||
let failed = RequestTyped::build_with(|kind| failed.with_label_values(&[kind.as_str()]));
|
||||
|
||||
let buckets = [0.01, 0.10, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0];
|
||||
|
||||
let req_seconds = register_histogram_vec!(
|
||||
"remote_storage_s3_request_seconds",
|
||||
"Seconds to complete a request",
|
||||
&["request_type", "result"],
|
||||
buckets.to_vec(),
|
||||
)
|
||||
.unwrap();
|
||||
let req_seconds = PassFailCancelledRequestTyped::build_with(|kind, outcome| {
|
||||
req_seconds.with_label_values(&[kind.as_str(), outcome.as_str()])
|
||||
});
|
||||
|
||||
let wait_seconds = register_histogram_vec!(
|
||||
"remote_storage_s3_wait_seconds",
|
||||
"Seconds rate limited",
|
||||
&["request_type"],
|
||||
buckets.to_vec(),
|
||||
)
|
||||
.unwrap();
|
||||
let wait_seconds =
|
||||
RequestTyped::build_with(|kind| wait_seconds.with_label_values(&[kind.as_str()]));
|
||||
|
||||
let cancelled_waits = register_int_counter_vec!(
|
||||
"remote_storage_s3_cancelled_waits_total",
|
||||
"Times a semaphore wait has been cancelled per request type",
|
||||
&["request_type"],
|
||||
)
|
||||
.unwrap();
|
||||
let cancelled_waits =
|
||||
RequestTyped::build_with(|kind| cancelled_waits.with_label_values(&[kind.as_str()]));
|
||||
|
||||
Self {
|
||||
requests,
|
||||
failed,
|
||||
req_seconds,
|
||||
wait_seconds,
|
||||
cancelled_waits,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn inc_get_object() {
|
||||
BUCKET_METRICS.requests.get(Get).inc()
|
||||
}
|
||||
|
||||
pub fn inc_get_object_fail() {
|
||||
BUCKET_METRICS.failed.get(Get).inc()
|
||||
}
|
||||
|
||||
pub fn inc_put_object() {
|
||||
BUCKET_METRICS.requests.get(Put).inc()
|
||||
}
|
||||
|
||||
pub fn inc_put_object_fail() {
|
||||
BUCKET_METRICS.failed.get(Put).inc()
|
||||
}
|
||||
|
||||
pub fn inc_delete_object() {
|
||||
BUCKET_METRICS.requests.get(Delete).inc()
|
||||
}
|
||||
|
||||
pub fn inc_delete_objects(count: u64) {
|
||||
BUCKET_METRICS.requests.get(Delete).inc_by(count)
|
||||
}
|
||||
|
||||
pub fn inc_delete_object_fail() {
|
||||
BUCKET_METRICS.failed.get(Delete).inc()
|
||||
}
|
||||
|
||||
pub fn inc_delete_objects_fail(count: u64) {
|
||||
BUCKET_METRICS.failed.get(Delete).inc_by(count)
|
||||
}
|
||||
|
||||
pub fn inc_list_objects() {
|
||||
BUCKET_METRICS.requests.get(List).inc()
|
||||
}
|
||||
|
||||
pub fn inc_list_objects_fail() {
|
||||
BUCKET_METRICS.failed.get(List).inc()
|
||||
}
|
||||
@@ -19,7 +19,7 @@ static LOGGING_DONE: OnceCell<()> = OnceCell::new();
|
||||
|
||||
const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
|
||||
|
||||
const BASE_PREFIX: &str = "test";
|
||||
const BASE_PREFIX: &str = "test/";
|
||||
|
||||
/// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries.
|
||||
/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.
|
||||
|
||||
@@ -21,7 +21,7 @@ use crate::{SegmentMethod, SegmentSizeResult, SizeResult, StorageModel};
|
||||
// 2. D+C+a+b
|
||||
// 3. D+A+B
|
||||
|
||||
/// `Segment` which has had its size calculated.
|
||||
/// [`Segment`] which has had it's size calculated.
|
||||
#[derive(Clone, Debug)]
|
||||
struct SegmentSize {
|
||||
method: SegmentMethod,
|
||||
|
||||
@@ -33,7 +33,7 @@ pub enum OtelName<'a> {
|
||||
/// directly into HTTP servers. However, I couldn't find one for Hyper,
|
||||
/// so I had to write our own. OpenTelemetry website has a registry of
|
||||
/// instrumentation libraries at:
|
||||
/// <https://opentelemetry.io/registry/?language=rust&component=instrumentation>
|
||||
/// https://opentelemetry.io/registry/?language=rust&component=instrumentation
|
||||
/// If a Hyper crate appears, consider switching to that.
|
||||
pub async fn tracing_handler<F, R>(
|
||||
req: Request<Body>,
|
||||
|
||||
@@ -42,10 +42,6 @@ workspace_hack.workspace = true
|
||||
|
||||
const_format.workspace = true
|
||||
|
||||
# to use tokio channels as streams, this is faster to compile than async_stream
|
||||
# why is it only here? no other crate should use it, streams are rarely needed.
|
||||
tokio-stream = { version = "0.1.14" }
|
||||
|
||||
[dev-dependencies]
|
||||
byteorder.workspace = true
|
||||
bytes.workspace = true
|
||||
|
||||
@@ -16,7 +16,7 @@ use crate::id::TenantId;
|
||||
/// Algorithm to use. We require EdDSA.
|
||||
const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq)]
|
||||
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum Scope {
|
||||
// Provides access to all data for a specific tenant (specified in `struct Claims` below)
|
||||
|
||||
@@ -12,13 +12,6 @@ pub struct Completion(mpsc::Sender<()>);
|
||||
#[derive(Clone)]
|
||||
pub struct Barrier(Arc<Mutex<mpsc::Receiver<()>>>);
|
||||
|
||||
impl Default for Barrier {
|
||||
fn default() -> Self {
|
||||
let (_, rx) = channel();
|
||||
rx
|
||||
}
|
||||
}
|
||||
|
||||
impl Barrier {
|
||||
pub async fn wait(self) {
|
||||
self.0.lock().await.recv().await;
|
||||
@@ -31,15 +24,6 @@ impl Barrier {
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq for Barrier {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
// we don't use dyn so this is good
|
||||
Arc::ptr_eq(&self.0, &other.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl Eq for Barrier {}
|
||||
|
||||
/// Create new Guard and Barrier pair.
|
||||
pub fn channel() -> (Completion, Barrier) {
|
||||
let (tx, rx) = mpsc::channel::<()>(1);
|
||||
|
||||
@@ -1,111 +0,0 @@
|
||||
/// Create a reporter for an error that outputs similar to [`anyhow::Error`] with Display with alternative setting.
|
||||
///
|
||||
/// It can be used with `anyhow::Error` as well.
|
||||
///
|
||||
/// Why would one use this instead of converting to `anyhow::Error` on the spot? Because
|
||||
/// anyhow::Error would also capture a stacktrace on the spot, which you would later discard after
|
||||
/// formatting.
|
||||
///
|
||||
/// ## Usage
|
||||
///
|
||||
/// ```rust
|
||||
/// #[derive(Debug, thiserror::Error)]
|
||||
/// enum MyCoolError {
|
||||
/// #[error("should never happen")]
|
||||
/// Bad(#[source] std::io::Error),
|
||||
/// }
|
||||
///
|
||||
/// # fn failing_call() -> Result<(), MyCoolError> { Err(MyCoolError::Bad(std::io::ErrorKind::PermissionDenied.into())) }
|
||||
///
|
||||
/// # fn main() {
|
||||
/// use utils::error::report_compact_sources;
|
||||
///
|
||||
/// if let Err(e) = failing_call() {
|
||||
/// let e = report_compact_sources(&e);
|
||||
/// assert_eq!(format!("{e}"), "should never happen: permission denied");
|
||||
/// }
|
||||
/// # }
|
||||
/// ```
|
||||
///
|
||||
/// ## TODO
|
||||
///
|
||||
/// When we are able to describe return position impl trait in traits, this should of course be an
|
||||
/// extension trait. Until then avoid boxing with this more ackward interface.
|
||||
pub fn report_compact_sources<E: std::error::Error>(e: &E) -> impl std::fmt::Display + '_ {
|
||||
struct AnyhowDisplayAlternateAlike<'a, E>(&'a E);
|
||||
|
||||
impl<E: std::error::Error> std::fmt::Display for AnyhowDisplayAlternateAlike<'_, E> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.0)?;
|
||||
|
||||
// why is E a generic parameter here? hope that rustc will see through a default
|
||||
// Error::source implementation and leave the following out if there cannot be any
|
||||
// sources:
|
||||
Sources(self.0.source()).try_for_each(|src| write!(f, ": {}", src))
|
||||
}
|
||||
}
|
||||
|
||||
struct Sources<'a>(Option<&'a (dyn std::error::Error + 'static)>);
|
||||
|
||||
impl<'a> Iterator for Sources<'a> {
|
||||
type Item = &'a (dyn std::error::Error + 'static);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
let rem = self.0;
|
||||
|
||||
let next = self.0.and_then(|x| x.source());
|
||||
self.0 = next;
|
||||
rem
|
||||
}
|
||||
}
|
||||
|
||||
AnyhowDisplayAlternateAlike(e)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::report_compact_sources;
|
||||
|
||||
#[test]
|
||||
fn report_compact_sources_examples() {
|
||||
use std::fmt::Write;
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
enum EvictionError {
|
||||
#[error("cannot evict a remote layer")]
|
||||
CannotEvictRemoteLayer,
|
||||
#[error("stat failed")]
|
||||
StatFailed(#[source] std::io::Error),
|
||||
#[error("layer was no longer part of LayerMap")]
|
||||
LayerNotFound(#[source] anyhow::Error),
|
||||
}
|
||||
|
||||
let examples = [
|
||||
(
|
||||
line!(),
|
||||
EvictionError::CannotEvictRemoteLayer,
|
||||
"cannot evict a remote layer",
|
||||
),
|
||||
(
|
||||
line!(),
|
||||
EvictionError::StatFailed(std::io::ErrorKind::PermissionDenied.into()),
|
||||
"stat failed: permission denied",
|
||||
),
|
||||
(
|
||||
line!(),
|
||||
EvictionError::LayerNotFound(anyhow::anyhow!("foobar")),
|
||||
"layer was no longer part of LayerMap: foobar",
|
||||
),
|
||||
];
|
||||
|
||||
let mut s = String::new();
|
||||
|
||||
for (line, example, expected) in examples {
|
||||
s.clear();
|
||||
|
||||
write!(s, "{}", report_compact_sources(&example)).expect("string grows");
|
||||
|
||||
assert_eq!(s, expected, "example on line {line}");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -24,29 +24,12 @@ pub async fn is_directory_empty(path: impl AsRef<Path>) -> anyhow::Result<bool>
|
||||
Ok(dir.next_entry().await?.is_none())
|
||||
}
|
||||
|
||||
pub fn ignore_not_found(e: io::Error) -> io::Result<()> {
|
||||
if e.kind() == io::ErrorKind::NotFound {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(e)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn ignore_absent_files<F>(fs_operation: F) -> io::Result<()>
|
||||
where
|
||||
F: Fn() -> io::Result<()>,
|
||||
{
|
||||
fs_operation().or_else(ignore_not_found)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use std::path::PathBuf;
|
||||
|
||||
use crate::fs_ext::is_directory_empty;
|
||||
|
||||
use super::ignore_absent_files;
|
||||
|
||||
#[test]
|
||||
fn is_empty_dir() {
|
||||
use super::PathExt;
|
||||
@@ -92,21 +75,4 @@ mod test {
|
||||
std::fs::remove_file(&file_path).unwrap();
|
||||
assert!(is_directory_empty(file_path).await.is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ignore_absent_files_works() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let dir_path = dir.path();
|
||||
|
||||
let file_path: PathBuf = dir_path.join("testfile");
|
||||
|
||||
ignore_absent_files(|| std::fs::remove_file(&file_path)).expect("should execute normally");
|
||||
|
||||
let f = std::fs::File::create(&file_path).unwrap();
|
||||
drop(f);
|
||||
|
||||
ignore_absent_files(|| std::fs::remove_file(&file_path)).expect("should execute normally");
|
||||
|
||||
assert!(!file_path.exists());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,6 +9,7 @@ use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
|
||||
use once_cell::sync::Lazy;
|
||||
use routerify::ext::RequestExt;
|
||||
use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
|
||||
use tokio::task::JoinError;
|
||||
use tracing::{self, debug, info, info_span, warn, Instrument};
|
||||
|
||||
use std::future::Future;
|
||||
@@ -147,140 +148,26 @@ impl Drop for RequestCancelled {
|
||||
}
|
||||
|
||||
async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use std::io::Write as _;
|
||||
use tokio::sync::mpsc;
|
||||
use tokio_stream::wrappers::ReceiverStream;
|
||||
|
||||
SERVE_METRICS_COUNT.inc();
|
||||
|
||||
/// An [`std::io::Write`] implementation on top of a channel sending [`bytes::Bytes`] chunks.
|
||||
struct ChannelWriter {
|
||||
buffer: BytesMut,
|
||||
tx: mpsc::Sender<std::io::Result<Bytes>>,
|
||||
written: usize,
|
||||
}
|
||||
|
||||
impl ChannelWriter {
|
||||
fn new(buf_len: usize, tx: mpsc::Sender<std::io::Result<Bytes>>) -> Self {
|
||||
assert_ne!(buf_len, 0);
|
||||
ChannelWriter {
|
||||
// split about half off the buffer from the start, because we flush depending on
|
||||
// capacity. first flush will come sooner than without this, but now resizes will
|
||||
// have better chance of picking up the "other" half. not guaranteed of course.
|
||||
buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
|
||||
tx,
|
||||
written: 0,
|
||||
}
|
||||
}
|
||||
|
||||
fn flush0(&mut self) -> std::io::Result<usize> {
|
||||
let n = self.buffer.len();
|
||||
if n == 0 {
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
tracing::trace!(n, "flushing");
|
||||
let ready = self.buffer.split().freeze();
|
||||
|
||||
// not ideal to call from blocking code to block_on, but we are sure that this
|
||||
// operation does not spawn_blocking other tasks
|
||||
let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
|
||||
self.tx.send(Ok(ready)).await.map_err(|_| ())?;
|
||||
|
||||
// throttle sending to allow reuse of our buffer in `write`.
|
||||
self.tx.reserve().await.map_err(|_| ())?;
|
||||
|
||||
// now the response task has picked up the buffer and hopefully started
|
||||
// sending it to the client.
|
||||
Ok(())
|
||||
});
|
||||
if res.is_err() {
|
||||
return Err(std::io::ErrorKind::BrokenPipe.into());
|
||||
}
|
||||
self.written += n;
|
||||
Ok(n)
|
||||
}
|
||||
|
||||
fn flushed_bytes(&self) -> usize {
|
||||
self.written
|
||||
}
|
||||
}
|
||||
|
||||
impl std::io::Write for ChannelWriter {
|
||||
fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
|
||||
let remaining = self.buffer.capacity() - self.buffer.len();
|
||||
|
||||
let out_of_space = remaining < buf.len();
|
||||
|
||||
let original_len = buf.len();
|
||||
|
||||
if out_of_space {
|
||||
let can_still_fit = buf.len() - remaining;
|
||||
self.buffer.extend_from_slice(&buf[..can_still_fit]);
|
||||
buf = &buf[can_still_fit..];
|
||||
self.flush0()?;
|
||||
}
|
||||
|
||||
// assume that this will often under normal operation just move the pointer back to the
|
||||
// beginning of allocation, because previous split off parts are already sent and
|
||||
// dropped.
|
||||
self.buffer.extend_from_slice(buf);
|
||||
Ok(original_len)
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> std::io::Result<()> {
|
||||
self.flush0().map(|_| ())
|
||||
}
|
||||
}
|
||||
|
||||
let started_at = std::time::Instant::now();
|
||||
|
||||
let (tx, rx) = mpsc::channel(1);
|
||||
|
||||
let body = Body::wrap_stream(ReceiverStream::new(rx));
|
||||
|
||||
let mut writer = ChannelWriter::new(128 * 1024, tx);
|
||||
|
||||
let mut buffer = vec![];
|
||||
let encoder = TextEncoder::new();
|
||||
|
||||
let metrics = tokio::task::spawn_blocking(move || {
|
||||
// Currently we take a lot of mutexes while collecting metrics, so it's
|
||||
// better to spawn a blocking task to avoid blocking the event loop.
|
||||
metrics::gather()
|
||||
})
|
||||
.await
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
|
||||
encoder.encode(&metrics, &mut buffer).unwrap();
|
||||
|
||||
let response = Response::builder()
|
||||
.status(200)
|
||||
.header(CONTENT_TYPE, encoder.format_type())
|
||||
.body(body)
|
||||
.body(Body::from(buffer))
|
||||
.unwrap();
|
||||
|
||||
let span = info_span!("blocking");
|
||||
tokio::task::spawn_blocking(move || {
|
||||
let _span = span.entered();
|
||||
let metrics = metrics::gather();
|
||||
let res = encoder
|
||||
.encode(&metrics, &mut writer)
|
||||
.and_then(|_| writer.flush().map_err(|e| e.into()));
|
||||
|
||||
match res {
|
||||
Ok(()) => {
|
||||
tracing::info!(
|
||||
bytes = writer.flushed_bytes(),
|
||||
elapsed_ms = started_at.elapsed().as_millis(),
|
||||
"responded /metrics"
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!("failed to write out /metrics response: {e:#}");
|
||||
// semantics of this error are quite... unclear. we want to error the stream out to
|
||||
// abort the response to somehow notify the client that we failed.
|
||||
//
|
||||
// though, most likely the reason for failure is that the receiver is already gone.
|
||||
drop(
|
||||
writer
|
||||
.tx
|
||||
.blocking_send(Err(std::io::ErrorKind::BrokenPipe.into())),
|
||||
);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
|
||||
@@ -14,7 +14,7 @@ pub async fn json_request<T: for<'de> Deserialize<'de>>(
|
||||
.map_err(ApiError::BadRequest)
|
||||
}
|
||||
|
||||
/// Will be removed as part of <https://github.com/neondatabase/neon/issues/4282>
|
||||
/// Will be removed as part of https://github.com/neondatabase/neon/issues/4282
|
||||
pub async fn json_request_or_empty_body<T: for<'de> Deserialize<'de>>(
|
||||
request: &mut Request<Body>,
|
||||
) -> Result<Option<T>, ApiError> {
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
use std::ffi::OsStr;
|
||||
use std::{fmt, str::FromStr};
|
||||
|
||||
use anyhow::Context;
|
||||
use hex::FromHex;
|
||||
use rand::Rng;
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -215,18 +213,6 @@ pub struct TimelineId(Id);
|
||||
|
||||
id_newtype!(TimelineId);
|
||||
|
||||
impl TryFrom<Option<&OsStr>> for TimelineId {
|
||||
type Error = anyhow::Error;
|
||||
|
||||
fn try_from(value: Option<&OsStr>) -> Result<Self, Self::Error> {
|
||||
value
|
||||
.and_then(OsStr::to_str)
|
||||
.unwrap_or_default()
|
||||
.parse::<TimelineId>()
|
||||
.with_context(|| format!("Could not parse timeline id from {:?}", value))
|
||||
}
|
||||
}
|
||||
|
||||
/// Neon Tenant Id represents identifiar of a particular tenant.
|
||||
/// Is used for distinguishing requests and data belonging to different users.
|
||||
///
|
||||
|
||||
@@ -63,9 +63,6 @@ pub mod rate_limit;
|
||||
/// Simple once-barrier and a guard which keeps barrier awaiting.
|
||||
pub mod completion;
|
||||
|
||||
/// Reporting utilities
|
||||
pub mod error;
|
||||
|
||||
mod failpoint_macro_helpers {
|
||||
|
||||
/// use with fail::cfg("$name", "return(2000)")
|
||||
@@ -133,8 +130,8 @@ pub use failpoint_macro_helpers::failpoint_sleep_helper;
|
||||
/// Note that with git_version prefix is `git:` and in case of git version from env its `git-env:`.
|
||||
///
|
||||
/// #############################################################################################
|
||||
/// TODO this macro is not the way the library is intended to be used, see <https://github.com/neondatabase/neon/issues/1565> for details.
|
||||
/// We use `cachepot` to reduce our current CI build times: <https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036>
|
||||
/// TODO this macro is not the way the library is intended to be used, see https://github.com/neondatabase/neon/issues/1565 for details.
|
||||
/// We use `cachepot` to reduce our current CI build times: https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036
|
||||
/// Yet, it seems to ignore the GIT_VERSION env variable, passed to Docker build, even with build.rs that contains
|
||||
/// `println!("cargo:rerun-if-env-changed=GIT_VERSION");` code for cachepot cache invalidation.
|
||||
/// The problem needs further investigation and regular `const` declaration instead of a macro.
|
||||
|
||||
@@ -1,10 +1,9 @@
|
||||
//! A module to create and read lock files.
|
||||
//!
|
||||
//! File locking is done using [`fcntl::flock`] exclusive locks.
|
||||
//! The only consumer of this module is currently
|
||||
//! [`pid_file`](crate::pid_file). See the module-level comment
|
||||
//! there for potential pitfalls with lock files that are used
|
||||
//! to store PIDs (pidfiles).
|
||||
//! The only consumer of this module is currently [`pid_file`].
|
||||
//! See the module-level comment there for potential pitfalls
|
||||
//! with lock files that are used to store PIDs (pidfiles).
|
||||
|
||||
use std::{
|
||||
fs,
|
||||
@@ -82,7 +81,7 @@ pub fn create_exclusive(lock_file_path: &Path) -> anyhow::Result<UnwrittenLockFi
|
||||
}
|
||||
|
||||
/// Returned by [`read_and_hold_lock_file`].
|
||||
/// Check out the [`pid_file`](crate::pid_file) module for what the variants mean
|
||||
/// Check out the [`pid_file`] module for what the variants mean
|
||||
/// and potential caveats if the lock files that are used to store PIDs.
|
||||
pub enum LockFileRead {
|
||||
/// No file exists at the given path.
|
||||
|
||||
@@ -112,7 +112,7 @@ pub fn init(
|
||||
///
|
||||
/// When the return value is dropped, the hook is reverted to std default hook (prints to stderr).
|
||||
/// If the assumptions about the initialization order are not held, use
|
||||
/// [`TracingPanicHookGuard::forget`] but keep in mind, if tracing is stopped, then panics will be
|
||||
/// [`TracingPanicHookGuard::disarm`] but keep in mind, if tracing is stopped, then panics will be
|
||||
/// lost.
|
||||
#[must_use]
|
||||
pub fn replace_panic_hook_with_tracing_panic_hook() -> TracingPanicHookGuard {
|
||||
|
||||
@@ -23,9 +23,9 @@ pub enum SeqWaitError {
|
||||
|
||||
/// Monotonically increasing value
|
||||
///
|
||||
/// It is handy to store some other fields under the same mutex in `SeqWait<S>`
|
||||
/// It is handy to store some other fields under the same mutex in SeqWait<S>
|
||||
/// (e.g. store prev_record_lsn). So we allow SeqWait to be parametrized with
|
||||
/// any type that can expose counter. `V` is the type of exposed counter.
|
||||
/// any type that can expose counter. <V> is the type of exposed counter.
|
||||
pub trait MonotonicCounter<V> {
|
||||
/// Bump counter value and check that it goes forward
|
||||
/// N.B.: new_val is an actual new value, not a difference.
|
||||
@@ -90,7 +90,7 @@ impl<T: Ord> Eq for Waiter<T> {}
|
||||
/// [`wait_for`]: SeqWait::wait_for
|
||||
/// [`advance`]: SeqWait::advance
|
||||
///
|
||||
/// `S` means Storage, `V` is type of counter that this storage exposes.
|
||||
/// <S> means Storage, <V> is type of counter that this storage exposes.
|
||||
///
|
||||
pub struct SeqWait<S, V>
|
||||
where
|
||||
|
||||
@@ -1,15 +1,8 @@
|
||||
//! Assert that the current [`tracing::Span`] has a given set of fields.
|
||||
//!
|
||||
//! Can only produce meaningful positive results when tracing has been configured as in example.
|
||||
//! Absence of `tracing_error::ErrorLayer` is not detected yet.
|
||||
//!
|
||||
//! `#[cfg(test)]` code will get a pass when using the `check_fields_present` macro in case tracing
|
||||
//! is completly unconfigured.
|
||||
//!
|
||||
//! # Usage
|
||||
//!
|
||||
//! ```rust
|
||||
//! # fn main() {
|
||||
//! ```
|
||||
//! use tracing_subscriber::prelude::*;
|
||||
//! let registry = tracing_subscriber::registry()
|
||||
//! .with(tracing_error::ErrorLayer::default());
|
||||
@@ -27,18 +20,23 @@
|
||||
//!
|
||||
//! use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor};
|
||||
//! let extractor = MultiNameExtractor::new("TestExtractor", ["test", "test_id"]);
|
||||
//! if let Err(missing) = check_fields_present!([&extractor]) {
|
||||
//! // if you copypaste this to a custom assert method, remember to add #[track_caller]
|
||||
//! // to get the "user" code location for the panic.
|
||||
//! panic!("Missing fields: {missing:?}");
|
||||
//! match check_fields_present([&extractor]) {
|
||||
//! Ok(()) => {},
|
||||
//! Err(missing) => {
|
||||
//! panic!("Missing fields: {:?}", missing.into_iter().map(|f| f.name() ).collect::<Vec<_>>());
|
||||
//! }
|
||||
//! }
|
||||
//! # }
|
||||
//! ```
|
||||
//!
|
||||
//! Recommended reading: <https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering>
|
||||
//! Recommended reading: https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering
|
||||
//!
|
||||
|
||||
#[derive(Debug)]
|
||||
use std::{
|
||||
collections::HashSet,
|
||||
fmt::{self},
|
||||
hash::{Hash, Hasher},
|
||||
};
|
||||
|
||||
pub enum ExtractionResult {
|
||||
Present,
|
||||
Absent,
|
||||
@@ -73,101 +71,49 @@ impl<const L: usize> Extractor for MultiNameExtractor<L> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Checks that the given extractors are satisfied with the current span hierarchy.
|
||||
///
|
||||
/// This should not be called directly, but used through [`check_fields_present`] which allows
|
||||
/// `Summary::Unconfigured` only when the calling crate is being `#[cfg(test)]` as a conservative default.
|
||||
#[doc(hidden)]
|
||||
pub fn check_fields_present0<const L: usize>(
|
||||
must_be_present: [&dyn Extractor; L],
|
||||
) -> Result<Summary, Vec<&dyn Extractor>> {
|
||||
let mut missing = must_be_present.into_iter().collect::<Vec<_>>();
|
||||
let trace = tracing_error::SpanTrace::capture();
|
||||
trace.with_spans(|md, _formatted_fields| {
|
||||
// when trying to understand the inner workings of how does the matching work, note that
|
||||
// this closure might be called zero times if the span is disabled. normally it is called
|
||||
// once per span hierarchy level.
|
||||
missing.retain(|extractor| match extractor.extract(md.fields()) {
|
||||
ExtractionResult::Present => false,
|
||||
ExtractionResult::Absent => true,
|
||||
});
|
||||
struct MemoryIdentity<'a>(&'a dyn Extractor);
|
||||
|
||||
// continue walking up until we've found all missing
|
||||
!missing.is_empty()
|
||||
});
|
||||
if missing.is_empty() {
|
||||
Ok(Summary::FoundEverything)
|
||||
} else if !tracing_subscriber_configured() {
|
||||
Ok(Summary::Unconfigured)
|
||||
} else {
|
||||
// we can still hit here if a tracing subscriber has been configured but the ErrorLayer is
|
||||
// missing, which can be annoying. for this case, we could probably use
|
||||
// SpanTrace::status().
|
||||
//
|
||||
// another way to end up here is with RUST_LOG=pageserver=off while configuring the
|
||||
// logging, though I guess in that case the SpanTrace::status() == EMPTY would be valid.
|
||||
// this case is covered by test `not_found_if_tracing_error_subscriber_has_wrong_filter`.
|
||||
Err(missing)
|
||||
impl<'a> MemoryIdentity<'a> {
|
||||
fn as_ptr(&self) -> *const () {
|
||||
self.0 as *const _ as *const ()
|
||||
}
|
||||
}
|
||||
impl<'a> PartialEq for MemoryIdentity<'a> {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.as_ptr() == other.as_ptr()
|
||||
}
|
||||
}
|
||||
impl<'a> Eq for MemoryIdentity<'a> {}
|
||||
impl<'a> Hash for MemoryIdentity<'a> {
|
||||
fn hash<H: Hasher>(&self, state: &mut H) {
|
||||
self.as_ptr().hash(state);
|
||||
}
|
||||
}
|
||||
impl<'a> fmt::Debug for MemoryIdentity<'a> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{:p}: {}", self.as_ptr(), self.0.name())
|
||||
}
|
||||
}
|
||||
|
||||
/// Checks that the given extractors are satisfied with the current span hierarchy.
|
||||
///
|
||||
/// The macro is the preferred way of checking if fields exist while passing checks if a test does
|
||||
/// not have tracing configured.
|
||||
///
|
||||
/// Why mangled name? Because #[macro_export] will expose it at utils::__check_fields_present.
|
||||
/// However we can game a module namespaced macro for `use` purposes by re-exporting the
|
||||
/// #[macro_export] exported name with an alias (below).
|
||||
#[doc(hidden)]
|
||||
#[macro_export]
|
||||
macro_rules! __check_fields_present {
|
||||
($extractors:expr) => {{
|
||||
{
|
||||
use $crate::tracing_span_assert::{check_fields_present0, Summary::*, Extractor};
|
||||
|
||||
match check_fields_present0($extractors) {
|
||||
Ok(FoundEverything) => Ok(()),
|
||||
Ok(Unconfigured) if cfg!(test) => {
|
||||
// allow unconfigured in tests
|
||||
Ok(())
|
||||
},
|
||||
Ok(Unconfigured) => {
|
||||
panic!("utils::tracing_span_assert: outside of #[cfg(test)] expected tracing to be configured with tracing_error::ErrorLayer")
|
||||
},
|
||||
Err(missing) => Err(missing)
|
||||
}
|
||||
}
|
||||
}}
|
||||
}
|
||||
|
||||
pub use crate::__check_fields_present as check_fields_present;
|
||||
|
||||
/// Explanation for why the check was deemed ok.
|
||||
///
|
||||
/// Mainly useful for testing, or configuring per-crate behaviour as in with
|
||||
/// [`check_fields_present`].
|
||||
#[derive(Debug)]
|
||||
pub enum Summary {
|
||||
/// All extractors were found.
|
||||
///
|
||||
/// Should only happen when tracing is properly configured.
|
||||
FoundEverything,
|
||||
|
||||
/// Tracing has not been configured at all. This is ok for tests running without tracing set
|
||||
/// up.
|
||||
Unconfigured,
|
||||
}
|
||||
|
||||
fn tracing_subscriber_configured() -> bool {
|
||||
let mut noop_configured = false;
|
||||
tracing::dispatcher::get_default(|d| {
|
||||
// it is possible that this closure will not be invoked, but the current implementation
|
||||
// always invokes it
|
||||
noop_configured = d.is::<tracing::subscriber::NoSubscriber>();
|
||||
/// The extractor names passed as keys to [`new`].
|
||||
pub fn check_fields_present<const L: usize>(
|
||||
must_be_present: [&dyn Extractor; L],
|
||||
) -> Result<(), Vec<&dyn Extractor>> {
|
||||
let mut missing: HashSet<MemoryIdentity> =
|
||||
HashSet::from_iter(must_be_present.into_iter().map(|r| MemoryIdentity(r)));
|
||||
let trace = tracing_error::SpanTrace::capture();
|
||||
trace.with_spans(|md, _formatted_fields| {
|
||||
missing.retain(|extractor| match extractor.0.extract(md.fields()) {
|
||||
ExtractionResult::Present => false,
|
||||
ExtractionResult::Absent => true,
|
||||
});
|
||||
!missing.is_empty() // continue walking up until we've found all missing
|
||||
});
|
||||
|
||||
!noop_configured
|
||||
if missing.is_empty() {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(missing.into_iter().map(|mi| mi.0).collect())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -177,36 +123,6 @@ mod tests {
|
||||
|
||||
use super::*;
|
||||
|
||||
use std::{
|
||||
collections::HashSet,
|
||||
fmt::{self},
|
||||
hash::{Hash, Hasher},
|
||||
};
|
||||
|
||||
struct MemoryIdentity<'a>(&'a dyn Extractor);
|
||||
|
||||
impl<'a> MemoryIdentity<'a> {
|
||||
fn as_ptr(&self) -> *const () {
|
||||
self.0 as *const _ as *const ()
|
||||
}
|
||||
}
|
||||
impl<'a> PartialEq for MemoryIdentity<'a> {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.as_ptr() == other.as_ptr()
|
||||
}
|
||||
}
|
||||
impl<'a> Eq for MemoryIdentity<'a> {}
|
||||
impl<'a> Hash for MemoryIdentity<'a> {
|
||||
fn hash<H: Hasher>(&self, state: &mut H) {
|
||||
self.as_ptr().hash(state);
|
||||
}
|
||||
}
|
||||
impl<'a> fmt::Debug for MemoryIdentity<'a> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{:p}: {}", self.as_ptr(), self.0.name())
|
||||
}
|
||||
}
|
||||
|
||||
struct Setup {
|
||||
_current_thread_subscriber_guard: tracing::subscriber::DefaultGuard,
|
||||
tenant_extractor: MultiNameExtractor<2>,
|
||||
@@ -243,8 +159,7 @@ mod tests {
|
||||
let setup = setup_current_thread();
|
||||
let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1");
|
||||
let _guard = span.enter();
|
||||
let res = check_fields_present0([&setup.tenant_extractor, &setup.timeline_extractor]);
|
||||
assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
|
||||
check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -252,8 +167,8 @@ mod tests {
|
||||
let setup = setup_current_thread();
|
||||
let span = tracing::info_span!("root", timeline_id = "timeline-1");
|
||||
let _guard = span.enter();
|
||||
let missing = check_fields_present0([&setup.tenant_extractor, &setup.timeline_extractor])
|
||||
.unwrap_err();
|
||||
let missing =
|
||||
check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap_err();
|
||||
assert_missing(missing, vec![&setup.tenant_extractor]);
|
||||
}
|
||||
|
||||
@@ -270,8 +185,7 @@ mod tests {
|
||||
let span = tracing::info_span!("grandchild", timeline_id = "timeline-1");
|
||||
let _guard = span.enter();
|
||||
|
||||
let res = check_fields_present0([&setup.tenant_extractor, &setup.timeline_extractor]);
|
||||
assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
|
||||
check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -284,7 +198,7 @@ mod tests {
|
||||
let span = tracing::info_span!("child", timeline_id = "timeline-1");
|
||||
let _guard = span.enter();
|
||||
|
||||
let missing = check_fields_present0([&setup.tenant_extractor]).unwrap_err();
|
||||
let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
|
||||
assert_missing(missing, vec![&setup.tenant_extractor]);
|
||||
}
|
||||
|
||||
@@ -293,8 +207,7 @@ mod tests {
|
||||
let setup = setup_current_thread();
|
||||
let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1");
|
||||
let _guard = span.enter();
|
||||
let res = check_fields_present0([&setup.tenant_extractor]);
|
||||
assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
|
||||
check_fields_present([&setup.tenant_extractor]).unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -310,8 +223,7 @@ mod tests {
|
||||
let span = tracing::info_span!("grandchild", timeline_id = "timeline-1");
|
||||
let _guard = span.enter();
|
||||
|
||||
let res = check_fields_present0([&setup.tenant_extractor]);
|
||||
assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
|
||||
check_fields_present([&setup.tenant_extractor]).unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -319,7 +231,7 @@ mod tests {
|
||||
let setup = setup_current_thread();
|
||||
let span = tracing::info_span!("root", timeline_id = "timeline-1");
|
||||
let _guard = span.enter();
|
||||
let missing = check_fields_present0([&setup.tenant_extractor]).unwrap_err();
|
||||
let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
|
||||
assert_missing(missing, vec![&setup.tenant_extractor]);
|
||||
}
|
||||
|
||||
@@ -333,107 +245,43 @@ mod tests {
|
||||
let span = tracing::info_span!("child", timeline_id = "timeline-1");
|
||||
let _guard = span.enter();
|
||||
|
||||
let missing = check_fields_present0([&setup.tenant_extractor]).unwrap_err();
|
||||
let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
|
||||
assert_missing(missing, vec![&setup.tenant_extractor]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn tracing_error_subscriber_not_set_up_straight_line() {
|
||||
fn tracing_error_subscriber_not_set_up() {
|
||||
// no setup
|
||||
|
||||
let span = tracing::info_span!("foo", e = "some value");
|
||||
let _guard = span.enter();
|
||||
|
||||
let extractor = MultiNameExtractor::new("E", ["e"]);
|
||||
let res = check_fields_present0([&extractor]);
|
||||
assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
|
||||
|
||||
// similarly for a not found key
|
||||
let extractor = MultiNameExtractor::new("F", ["foobar"]);
|
||||
let res = check_fields_present0([&extractor]);
|
||||
assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
|
||||
let missing = check_fields_present([&extractor]).unwrap_err();
|
||||
assert_missing(missing, vec![&extractor]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn tracing_error_subscriber_not_set_up_with_instrument() {
|
||||
// no setup
|
||||
|
||||
// demo a case where span entering is used to establish a parent child connection, but
|
||||
// when we re-enter the subspan SpanTrace::with_spans iterates over nothing.
|
||||
let span = tracing::info_span!("foo", e = "some value");
|
||||
let _guard = span.enter();
|
||||
|
||||
let subspan = tracing::info_span!("bar", f = "foobar");
|
||||
drop(_guard);
|
||||
|
||||
// normally this would work, but without any tracing-subscriber configured, both
|
||||
// check_field_present find nothing
|
||||
let _guard = subspan.enter();
|
||||
let extractors: [&dyn Extractor; 2] = [
|
||||
&MultiNameExtractor::new("E", ["e"]),
|
||||
&MultiNameExtractor::new("F", ["f"]),
|
||||
];
|
||||
|
||||
let res = check_fields_present0(extractors);
|
||||
assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
|
||||
|
||||
// similarly for a not found key
|
||||
let extractor = MultiNameExtractor::new("G", ["g"]);
|
||||
let res = check_fields_present0([&extractor]);
|
||||
assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn tracing_subscriber_configured() {
|
||||
// this will fail if any utils::logging::init callers appear, but let's hope they do not
|
||||
// appear.
|
||||
assert!(!super::tracing_subscriber_configured());
|
||||
|
||||
let _g = setup_current_thread();
|
||||
|
||||
assert!(super::tracing_subscriber_configured());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn not_found_when_disabled_by_filter() {
|
||||
#[should_panic]
|
||||
fn panics_if_tracing_error_subscriber_has_wrong_filter() {
|
||||
let r = tracing_subscriber::registry().with({
|
||||
tracing_error::ErrorLayer::default().with_filter(tracing_subscriber::filter::filter_fn(
|
||||
|md| !(md.is_span() && *md.level() == tracing::Level::INFO),
|
||||
))
|
||||
tracing_error::ErrorLayer::default().with_filter(
|
||||
tracing_subscriber::filter::dynamic_filter_fn(|md, _| {
|
||||
if md.is_span() && *md.level() == tracing::Level::INFO {
|
||||
return false;
|
||||
}
|
||||
true
|
||||
}),
|
||||
)
|
||||
});
|
||||
|
||||
let _guard = tracing::subscriber::set_default(r);
|
||||
|
||||
// this test is a rather tricky one, it has a number of possible outcomes depending on the
|
||||
// execution order when executed with other tests even if no test sets the global default
|
||||
// subscriber.
|
||||
|
||||
let span = tracing::info_span!("foo", e = "some value");
|
||||
let _guard = span.enter();
|
||||
|
||||
let extractors: [&dyn Extractor; 1] = [&MultiNameExtractor::new("E", ["e"])];
|
||||
|
||||
if span.is_disabled() {
|
||||
// the tests are running single threaded, or we got lucky and no other tests subscriber
|
||||
// was got to register their per-CALLSITE::META interest between `set_default` and
|
||||
// creation of the span, thus the filter got to apply and registered interest of Never,
|
||||
// so the span was never created.
|
||||
//
|
||||
// as the span is disabled, no keys were recorded to it, leading check_fields_present0
|
||||
// to find an error.
|
||||
|
||||
let missing = check_fields_present0(extractors).unwrap_err();
|
||||
assert_missing(missing, vec![extractors[0]]);
|
||||
} else {
|
||||
// when the span is enabled, it is because some other test is running at the same time,
|
||||
// and that tests registry has filters which are interested in our above span.
|
||||
//
|
||||
// because the span is now enabled, all keys will be found for it. the
|
||||
// tracing_error::SpanTrace does not consider layer filters during the span hierarchy
|
||||
// walk (SpanTrace::with_spans), nor is the SpanTrace::status a reliable indicator in
|
||||
// this test-induced issue.
|
||||
|
||||
let res = check_fields_present0(extractors);
|
||||
assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
|
||||
}
|
||||
let extractor = MultiNameExtractor::new("E", ["e"]);
|
||||
let missing = check_fields_present([&extractor]).unwrap_err();
|
||||
assert_missing(missing, vec![&extractor]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -35,8 +35,6 @@ humantime-serde.workspace = true
|
||||
hyper.workspace = true
|
||||
itertools.workspace = true
|
||||
nix.workspace = true
|
||||
# hack to get the number of worker threads tokio uses
|
||||
num_cpus = { version = "1.15" }
|
||||
num-traits.workspace = true
|
||||
once_cell.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
@@ -84,7 +82,6 @@ strum_macros.workspace = true
|
||||
criterion.workspace = true
|
||||
hex-literal.workspace = true
|
||||
tempfile.workspace = true
|
||||
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }
|
||||
|
||||
[[bench]]
|
||||
name = "bench_layer_map"
|
||||
|
||||
@@ -13,7 +13,6 @@ clap = { workspace = true, features = ["string"] }
|
||||
git-version.workspace = true
|
||||
pageserver = { path = ".." }
|
||||
postgres_ffi.workspace = true
|
||||
tokio.workspace = true
|
||||
utils.workspace = true
|
||||
svg_fmt.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -7,10 +7,10 @@
|
||||
//! - The y axis represents LSN, growing upwards.
|
||||
//!
|
||||
//! Coordinates in both axis are compressed for better readability.
|
||||
//! (see <https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb>)
|
||||
//! (see https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb)
|
||||
//!
|
||||
//! Example use:
|
||||
//! ```bash
|
||||
//! ```
|
||||
//! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
|
||||
//! $ grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
|
||||
//! $ firefox out.svg
|
||||
@@ -20,10 +20,9 @@
|
||||
//! or from pageserver log files.
|
||||
//!
|
||||
//! TODO Consider shipping this as a grafana panel plugin:
|
||||
//! <https://grafana.com/tutorials/build-a-panel-plugin/>
|
||||
//! https://grafana.com/tutorials/build-a-panel-plugin/
|
||||
use anyhow::Result;
|
||||
use pageserver::repository::Key;
|
||||
use pageserver::METADATA_FILE_NAME;
|
||||
use std::cmp::Ordering;
|
||||
use std::io::{self, BufRead};
|
||||
use std::path::PathBuf;
|
||||
@@ -72,10 +71,6 @@ pub fn main() -> Result<()> {
|
||||
let line = PathBuf::from_str(&line).unwrap();
|
||||
let filename = line.file_name().unwrap();
|
||||
let filename = filename.to_str().unwrap();
|
||||
if filename == METADATA_FILE_NAME {
|
||||
// Don't try and parse "metadata" like a key-lsn range
|
||||
continue;
|
||||
}
|
||||
let range = parse_filename(filename);
|
||||
ranges.push(range);
|
||||
}
|
||||
|
||||
@@ -95,7 +95,7 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
|
||||
}
|
||||
|
||||
// Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH"
|
||||
async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
|
||||
fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
|
||||
let file = FileBlockReader::new(VirtualFile::open(path)?);
|
||||
let summary_blk = file.read_blk(0)?;
|
||||
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
|
||||
@@ -107,31 +107,29 @@ async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
|
||||
// min-heap (reserve space for one more element added before eviction)
|
||||
let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
|
||||
let mut prev_key: Option<Key> = None;
|
||||
tree_reader
|
||||
.visit(
|
||||
&[0u8; DELTA_KEY_SIZE],
|
||||
VisitDirection::Forwards,
|
||||
|key, _value| {
|
||||
let curr = Key::from_slice(&key[..KEY_SIZE]);
|
||||
if let Some(prev) = prev_key {
|
||||
if curr.to_i128() - prev.to_i128() >= MIN_HOLE_LENGTH {
|
||||
heap.push(Hole(prev..curr));
|
||||
if heap.len() > max_holes {
|
||||
heap.pop(); // remove smallest hole
|
||||
}
|
||||
tree_reader.visit(
|
||||
&[0u8; DELTA_KEY_SIZE],
|
||||
VisitDirection::Forwards,
|
||||
|key, _value| {
|
||||
let curr = Key::from_slice(&key[..KEY_SIZE]);
|
||||
if let Some(prev) = prev_key {
|
||||
if curr.to_i128() - prev.to_i128() >= MIN_HOLE_LENGTH {
|
||||
heap.push(Hole(prev..curr));
|
||||
if heap.len() > max_holes {
|
||||
heap.pop(); // remove smallest hole
|
||||
}
|
||||
}
|
||||
prev_key = Some(curr.next());
|
||||
true
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
prev_key = Some(curr.next());
|
||||
true
|
||||
},
|
||||
)?;
|
||||
let mut holes = heap.into_vec();
|
||||
holes.sort_by_key(|hole| hole.0.start);
|
||||
Ok(holes)
|
||||
}
|
||||
|
||||
pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
|
||||
pub(crate) fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
|
||||
let storage_path = &cmd.path;
|
||||
let max_holes = cmd.max_holes.unwrap_or(DEFAULT_MAX_HOLES);
|
||||
|
||||
@@ -162,7 +160,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
|
||||
parse_filename(&layer.file_name().into_string().unwrap())
|
||||
{
|
||||
if layer_file.is_delta {
|
||||
layer_file.holes = get_holes(&layer.path(), max_holes).await?;
|
||||
layer_file.holes = get_holes(&layer.path(), max_holes)?;
|
||||
n_deltas += 1;
|
||||
}
|
||||
layers.push(layer_file);
|
||||
|
||||
@@ -43,7 +43,8 @@ pub(crate) enum LayerCmd {
|
||||
},
|
||||
}
|
||||
|
||||
async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
|
||||
fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
|
||||
use pageserver::tenant::blob_io::BlobCursor;
|
||||
use pageserver::tenant::block_io::BlockReader;
|
||||
|
||||
let path = path.as_ref();
|
||||
@@ -59,18 +60,16 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
|
||||
);
|
||||
// TODO(chi): dedup w/ `delta_layer.rs` by exposing the API.
|
||||
let mut all = vec![];
|
||||
tree_reader
|
||||
.visit(
|
||||
&[0u8; DELTA_KEY_SIZE],
|
||||
VisitDirection::Forwards,
|
||||
|key, value_offset| {
|
||||
let curr = Key::from_slice(&key[..KEY_SIZE]);
|
||||
all.push((curr, BlobRef(value_offset)));
|
||||
true
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
let cursor = BlockCursor::new(&file);
|
||||
tree_reader.visit(
|
||||
&[0u8; DELTA_KEY_SIZE],
|
||||
VisitDirection::Forwards,
|
||||
|key, value_offset| {
|
||||
let curr = Key::from_slice(&key[..KEY_SIZE]);
|
||||
all.push((curr, BlobRef(value_offset)));
|
||||
true
|
||||
},
|
||||
)?;
|
||||
let mut cursor = BlockCursor::new(&file);
|
||||
for (k, v) in all {
|
||||
let value = cursor.read_blob(v.pos())?;
|
||||
println!("key:{} value_len:{}", k, value.len());
|
||||
@@ -79,7 +78,7 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
pub(crate) fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
match cmd {
|
||||
LayerCmd::List { path } => {
|
||||
for tenant in fs::read_dir(path.join("tenants"))? {
|
||||
@@ -154,7 +153,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
);
|
||||
|
||||
if layer_file.is_delta {
|
||||
read_delta_file(layer.path()).await?;
|
||||
read_delta_file(layer.path())?;
|
||||
} else {
|
||||
anyhow::bail!("not supported yet :(");
|
||||
}
|
||||
|
||||
@@ -72,13 +72,12 @@ struct AnalyzeLayerMapCmd {
|
||||
max_holes: Option<usize>,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
fn main() -> anyhow::Result<()> {
|
||||
let cli = CliOpts::parse();
|
||||
|
||||
match cli.command {
|
||||
Commands::Layer(cmd) => {
|
||||
layers::main(&cmd).await?;
|
||||
layers::main(&cmd)?;
|
||||
}
|
||||
Commands::Metadata(cmd) => {
|
||||
handle_metadata(&cmd)?;
|
||||
@@ -87,7 +86,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
draw_timeline_dir::main()?;
|
||||
}
|
||||
Commands::AnalyzeLayerMap(cmd) => {
|
||||
layer_map_analyzer::main(&cmd).await?;
|
||||
layer_map_analyzer::main(&cmd)?;
|
||||
}
|
||||
Commands::PrintLayerFile(cmd) => {
|
||||
if let Err(e) = read_pg_control_file(&cmd.path) {
|
||||
@@ -95,7 +94,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
"Failed to read input file as a pg control one: {e:#}\n\
|
||||
Attempting to read it as layer file"
|
||||
);
|
||||
print_layerfile(&cmd.path).await?;
|
||||
print_layerfile(&cmd.path)?;
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -114,12 +113,12 @@ fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn print_layerfile(path: &Path) -> anyhow::Result<()> {
|
||||
fn print_layerfile(path: &Path) -> anyhow::Result<()> {
|
||||
// Basic initialization of things that don't change after startup
|
||||
virtual_file::init(10);
|
||||
page_cache::init(100);
|
||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||
dump_layerfile_from_path(path, true, &ctx).await
|
||||
dump_layerfile_from_path(path, true, &ctx)
|
||||
}
|
||||
|
||||
fn handle_metadata(
|
||||
|
||||
@@ -19,6 +19,12 @@ use tokio::io;
|
||||
use tokio::io::AsyncWrite;
|
||||
use tracing::*;
|
||||
|
||||
/// NB: This relies on a modified version of tokio_tar that does *not* write the
|
||||
/// end-of-archive marker (1024 zero bytes), when the Builder struct is dropped
|
||||
/// without explicitly calling 'finish' or 'into_inner'!
|
||||
///
|
||||
/// See https://github.com/neondatabase/tokio-tar/pull/1
|
||||
///
|
||||
use tokio_tar::{Builder, EntryType, Header};
|
||||
|
||||
use crate::context::RequestContext;
|
||||
|
||||
@@ -9,10 +9,8 @@ use clap::{Arg, ArgAction, Command};
|
||||
use fail::FailScenario;
|
||||
use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
|
||||
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
|
||||
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
|
||||
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use tokio::time::Instant;
|
||||
use tracing::*;
|
||||
|
||||
use metrics::set_build_info_metric;
|
||||
@@ -40,6 +38,8 @@ const PID_FILE_NAME: &str = "pageserver.pid";
|
||||
const FEATURES: &[&str] = &[
|
||||
#[cfg(feature = "testing")]
|
||||
"testing",
|
||||
#[cfg(feature = "fail/failpoints")]
|
||||
"fail/failpoints",
|
||||
];
|
||||
|
||||
fn version() -> String {
|
||||
@@ -226,19 +226,6 @@ fn start_pageserver(
|
||||
launch_ts: &'static LaunchTimestamp,
|
||||
conf: &'static PageServerConf,
|
||||
) -> anyhow::Result<()> {
|
||||
// Monotonic time for later calculating startup duration
|
||||
let started_startup_at = Instant::now();
|
||||
|
||||
let startup_checkpoint = move |phase: &str, human_phase: &str| {
|
||||
let elapsed = started_startup_at.elapsed();
|
||||
let secs = elapsed.as_secs_f64();
|
||||
STARTUP_DURATION.with_label_values(&[phase]).set(secs);
|
||||
info!(
|
||||
elapsed_ms = elapsed.as_millis(),
|
||||
"{human_phase} ({secs:.3}s since start)"
|
||||
)
|
||||
};
|
||||
|
||||
// Print version and launch timestamp to the log,
|
||||
// and expose them as prometheus metrics.
|
||||
// A changed version string indicates changed software.
|
||||
@@ -348,11 +335,6 @@ fn start_pageserver(
|
||||
// Set up remote storage client
|
||||
let remote_storage = create_remote_storage_client(conf)?;
|
||||
|
||||
// Up to this point no significant I/O has been done: this should have been fast. Record
|
||||
// duration prior to starting I/O intensive phase of startup.
|
||||
startup_checkpoint("initial", "Starting loading tenants");
|
||||
STARTUP_IS_LOADING.set(1);
|
||||
|
||||
// Startup staging or optimizing:
|
||||
//
|
||||
// We want to minimize downtime for `page_service` connections, and trying not to overload
|
||||
@@ -378,6 +360,7 @@ fn start_pageserver(
|
||||
};
|
||||
|
||||
// Scan the local 'tenants/' directory and start loading the tenants
|
||||
let init_started_at = std::time::Instant::now();
|
||||
let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
|
||||
|
||||
BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
|
||||
@@ -395,25 +378,35 @@ fn start_pageserver(
|
||||
let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial load completed"));
|
||||
|
||||
init_done_rx.wait().await;
|
||||
startup_checkpoint("initial_tenant_load", "Initial load completed");
|
||||
STARTUP_IS_LOADING.set(0);
|
||||
|
||||
// initial logical sizes can now start, as they were waiting on init_done_rx.
|
||||
|
||||
scopeguard::ScopeGuard::into_inner(guard);
|
||||
|
||||
let init_done = std::time::Instant::now();
|
||||
let elapsed = init_done - init_started_at;
|
||||
|
||||
tracing::info!(
|
||||
elapsed_millis = elapsed.as_millis(),
|
||||
"Initial load completed"
|
||||
);
|
||||
|
||||
let mut init_sizes_done = std::pin::pin!(init_logical_size_done_rx.wait());
|
||||
|
||||
let timeout = conf.background_task_maximum_delay;
|
||||
|
||||
let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
|
||||
|
||||
let init_sizes_done = match tokio::time::timeout(timeout, &mut init_sizes_done).await {
|
||||
Ok(_) => {
|
||||
startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed");
|
||||
let init_sizes_done = tokio::select! {
|
||||
_ = &mut init_sizes_done => {
|
||||
let now = std::time::Instant::now();
|
||||
tracing::info!(
|
||||
from_init_done_millis = (now - init_done).as_millis(),
|
||||
from_init_millis = (now - init_started_at).as_millis(),
|
||||
"Initial logical sizes completed"
|
||||
);
|
||||
None
|
||||
}
|
||||
Err(_) => {
|
||||
_ = tokio::time::sleep(timeout) => {
|
||||
tracing::info!(
|
||||
timeout_millis = timeout.as_millis(),
|
||||
"Initial logical size timeout elapsed; starting background jobs"
|
||||
@@ -426,7 +419,6 @@ fn start_pageserver(
|
||||
|
||||
// allow background jobs to start
|
||||
drop(background_jobs_can_start);
|
||||
startup_checkpoint("background_jobs_can_start", "Starting background jobs");
|
||||
|
||||
if let Some(init_sizes_done) = init_sizes_done {
|
||||
// ending up here is not a bug; at the latest logical sizes will be queried by
|
||||
@@ -436,11 +428,14 @@ fn start_pageserver(
|
||||
|
||||
scopeguard::ScopeGuard::into_inner(guard);
|
||||
|
||||
startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed after timeout (background jobs already started)");
|
||||
let now = std::time::Instant::now();
|
||||
tracing::info!(
|
||||
from_init_done_millis = (now - init_done).as_millis(),
|
||||
from_init_millis = (now - init_started_at).as_millis(),
|
||||
"Initial logical sizes completed after timeout (background jobs already started)"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
startup_checkpoint("complete", "Startup complete");
|
||||
};
|
||||
|
||||
async move {
|
||||
|
||||
@@ -33,8 +33,7 @@ use crate::tenant::config::TenantConf;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME};
|
||||
use crate::{
|
||||
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
|
||||
TIMELINE_UNINIT_MARK_SUFFIX,
|
||||
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_UNINIT_MARK_SUFFIX,
|
||||
};
|
||||
|
||||
pub mod defaults {
|
||||
@@ -172,13 +171,11 @@ pub struct PageServerConf {
|
||||
|
||||
pub log_format: LogFormat,
|
||||
|
||||
/// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed.
|
||||
/// Number of concurrent [`Tenant::gather_size_inputs`] allowed.
|
||||
pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
|
||||
/// Limit of concurrent [`Tenant::gather_size_inputs`] issued by module `eviction_task`.
|
||||
/// The number of permits is the same as `concurrent_tenant_size_logical_size_queries`.
|
||||
/// See the comment in `eviction_task` for details.
|
||||
///
|
||||
/// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs
|
||||
pub eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore,
|
||||
|
||||
// How often to collect metrics and send them to the metrics endpoint.
|
||||
@@ -602,17 +599,6 @@ impl PageServerConf {
|
||||
)
|
||||
}
|
||||
|
||||
pub fn timeline_delete_mark_file_path(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> PathBuf {
|
||||
path_with_suffix_extension(
|
||||
self.timeline_path(&tenant_id, &timeline_id),
|
||||
TIMELINE_DELETE_MARK_SUFFIX,
|
||||
)
|
||||
}
|
||||
|
||||
pub fn traces_path(&self) -> PathBuf {
|
||||
self.workdir.join("traces")
|
||||
}
|
||||
@@ -1007,8 +993,6 @@ impl ConfigurableSemaphore {
|
||||
/// Require a non-zero initial permits, because using permits == 0 is a crude way to disable a
|
||||
/// feature such as [`Tenant::gather_size_inputs`]. Otherwise any semaphore using future will
|
||||
/// behave like [`futures::future::pending`], just waiting until new permits are added.
|
||||
///
|
||||
/// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs
|
||||
pub fn new(initial_permits: NonZeroUsize) -> Self {
|
||||
ConfigurableSemaphore {
|
||||
initial_permits,
|
||||
|
||||
@@ -7,23 +7,27 @@ use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
|
||||
use crate::tenant::{mgr, LogicalSizeCalculationCause};
|
||||
use anyhow;
|
||||
use chrono::{DateTime, Utc};
|
||||
use chrono::Utc;
|
||||
use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
|
||||
use pageserver_api::models::TenantState;
|
||||
use reqwest::Url;
|
||||
use serde::Serialize;
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, SystemTime};
|
||||
use std::time::Duration;
|
||||
use tracing::*;
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
const WRITTEN_SIZE: &str = "written_size";
|
||||
const SYNTHETIC_STORAGE_SIZE: &str = "synthetic_storage_size";
|
||||
const RESIDENT_SIZE: &str = "resident_size";
|
||||
const REMOTE_STORAGE_SIZE: &str = "remote_storage_size";
|
||||
const TIMELINE_LOGICAL_SIZE: &str = "timeline_logical_size";
|
||||
|
||||
const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Debug, Clone, Copy)]
|
||||
#[derive(Serialize, Debug)]
|
||||
struct Ids {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
tenant_id: TenantId,
|
||||
@@ -34,142 +38,10 @@ struct Ids {
|
||||
|
||||
/// Key that uniquely identifies the object, this metric describes.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||
struct MetricsKey {
|
||||
tenant_id: TenantId,
|
||||
timeline_id: Option<TimelineId>,
|
||||
metric: &'static str,
|
||||
}
|
||||
|
||||
impl MetricsKey {
|
||||
const fn absolute_values(self) -> AbsoluteValueFactory {
|
||||
AbsoluteValueFactory(self)
|
||||
}
|
||||
const fn incremental_values(self) -> IncrementalValueFactory {
|
||||
IncrementalValueFactory(self)
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper type which each individual metric kind can return to produce only absolute values.
|
||||
struct AbsoluteValueFactory(MetricsKey);
|
||||
|
||||
impl AbsoluteValueFactory {
|
||||
fn at(self, time: DateTime<Utc>, val: u64) -> (MetricsKey, (EventType, u64)) {
|
||||
let key = self.0;
|
||||
(key, (EventType::Absolute { time }, val))
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper type which each individual metric kind can return to produce only incremental values.
|
||||
struct IncrementalValueFactory(MetricsKey);
|
||||
|
||||
impl IncrementalValueFactory {
|
||||
#[allow(clippy::wrong_self_convention)]
|
||||
fn from_previous_up_to(
|
||||
self,
|
||||
prev_end: DateTime<Utc>,
|
||||
up_to: DateTime<Utc>,
|
||||
val: u64,
|
||||
) -> (MetricsKey, (EventType, u64)) {
|
||||
let key = self.0;
|
||||
// cannot assert prev_end < up_to because these are realtime clock based
|
||||
(
|
||||
key,
|
||||
(
|
||||
EventType::Incremental {
|
||||
start_time: prev_end,
|
||||
stop_time: up_to,
|
||||
},
|
||||
val,
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
fn key(&self) -> &MetricsKey {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
// the static part of a MetricsKey
|
||||
impl MetricsKey {
|
||||
/// Absolute value of [`Timeline::get_last_record_lsn`].
|
||||
///
|
||||
/// [`Timeline::get_last_record_lsn`]: crate::tenant::Timeline::get_last_record_lsn
|
||||
const fn written_size(tenant_id: TenantId, timeline_id: TimelineId) -> AbsoluteValueFactory {
|
||||
MetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: Some(timeline_id),
|
||||
metric: "written_size",
|
||||
}
|
||||
.absolute_values()
|
||||
}
|
||||
|
||||
/// Values will be the difference of the latest [`MetricsKey::written_size`] to what we
|
||||
/// previously sent, starting from the previously sent incremental time range ending at the
|
||||
/// latest absolute measurement.
|
||||
const fn written_size_delta(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> IncrementalValueFactory {
|
||||
MetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: Some(timeline_id),
|
||||
// the name here is correctly about data not size, because that is what is wanted by
|
||||
// downstream pipeline
|
||||
metric: "written_data_bytes_delta",
|
||||
}
|
||||
.incremental_values()
|
||||
}
|
||||
|
||||
/// Exact [`Timeline::get_current_logical_size`].
|
||||
///
|
||||
/// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
|
||||
const fn timeline_logical_size(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> AbsoluteValueFactory {
|
||||
MetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: Some(timeline_id),
|
||||
metric: "timeline_logical_size",
|
||||
}
|
||||
.absolute_values()
|
||||
}
|
||||
|
||||
/// [`Tenant::remote_size`]
|
||||
///
|
||||
/// [`Tenant::remote_size`]: crate::tenant::Tenant::remote_size
|
||||
const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory {
|
||||
MetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: None,
|
||||
metric: "remote_storage_size",
|
||||
}
|
||||
.absolute_values()
|
||||
}
|
||||
|
||||
/// Sum of [`Timeline::resident_physical_size`] for each `Tenant`.
|
||||
///
|
||||
/// [`Timeline::resident_physical_size`]: crate::tenant::Timeline::resident_physical_size
|
||||
const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory {
|
||||
MetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: None,
|
||||
metric: "resident_size",
|
||||
}
|
||||
.absolute_values()
|
||||
}
|
||||
|
||||
/// [`Tenant::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`].
|
||||
///
|
||||
/// [`Tenant::cached_synthetic_size`]: crate::tenant::Tenant::cached_synthetic_size
|
||||
const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory {
|
||||
MetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: None,
|
||||
metric: "synthetic_storage_size",
|
||||
}
|
||||
.absolute_values()
|
||||
}
|
||||
pub struct PageserverConsumptionMetricsKey {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: Option<TimelineId>,
|
||||
pub metric: &'static str,
|
||||
}
|
||||
|
||||
/// Main thread that serves metrics collection
|
||||
@@ -207,7 +79,7 @@ pub async fn collect_metrics(
|
||||
.timeout(DEFAULT_HTTP_REPORTING_TIMEOUT)
|
||||
.build()
|
||||
.expect("Failed to create http client with timeout");
|
||||
let mut cached_metrics = HashMap::new();
|
||||
let mut cached_metrics: HashMap<PageserverConsumptionMetricsKey, u64> = HashMap::new();
|
||||
let mut prev_iteration_time: std::time::Instant = std::time::Instant::now();
|
||||
|
||||
loop {
|
||||
@@ -247,15 +119,15 @@ pub async fn collect_metrics(
|
||||
///
|
||||
/// TODO
|
||||
/// - refactor this function (chunking+sending part) to reuse it in proxy module;
|
||||
async fn collect_metrics_iteration(
|
||||
pub async fn collect_metrics_iteration(
|
||||
client: &reqwest::Client,
|
||||
cached_metrics: &mut HashMap<MetricsKey, (EventType, u64)>,
|
||||
cached_metrics: &mut HashMap<PageserverConsumptionMetricsKey, u64>,
|
||||
metric_collection_endpoint: &reqwest::Url,
|
||||
node_id: NodeId,
|
||||
ctx: &RequestContext,
|
||||
send_cached: bool,
|
||||
) {
|
||||
let mut current_metrics: Vec<(MetricsKey, (EventType, u64))> = Vec::new();
|
||||
let mut current_metrics: Vec<(PageserverConsumptionMetricsKey, u64)> = Vec::new();
|
||||
trace!(
|
||||
"starting collect_metrics_iteration. metric_collection_endpoint: {}",
|
||||
metric_collection_endpoint
|
||||
@@ -289,65 +161,99 @@ async fn collect_metrics_iteration(
|
||||
let mut tenant_resident_size = 0;
|
||||
|
||||
// iterate through list of timelines in tenant
|
||||
for timeline in tenant.list_timelines() {
|
||||
for timeline in tenant.list_timelines().iter() {
|
||||
// collect per-timeline metrics only for active timelines
|
||||
if timeline.is_active() {
|
||||
let timeline_written_size = u64::from(timeline.get_last_record_lsn());
|
||||
|
||||
let timeline_id = timeline.timeline_id;
|
||||
|
||||
match TimelineSnapshot::collect(&timeline, ctx) {
|
||||
Ok(Some(snap)) => {
|
||||
snap.to_metrics(
|
||||
current_metrics.push((
|
||||
PageserverConsumptionMetricsKey {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
Utc::now(),
|
||||
&mut current_metrics,
|
||||
cached_metrics,
|
||||
);
|
||||
}
|
||||
Ok(None) => {}
|
||||
Err(e) => {
|
||||
error!(
|
||||
"failed to get metrics values for tenant {tenant_id} timeline {}: {e:#?}",
|
||||
timeline.timeline_id
|
||||
);
|
||||
continue;
|
||||
}
|
||||
timeline_id: Some(timeline.timeline_id),
|
||||
metric: WRITTEN_SIZE,
|
||||
},
|
||||
timeline_written_size,
|
||||
));
|
||||
|
||||
let span = info_span!("collect_metrics_iteration", tenant_id = %timeline.tenant_id, timeline_id = %timeline.timeline_id);
|
||||
match span.in_scope(|| timeline.get_current_logical_size(ctx)) {
|
||||
// Only send timeline logical size when it is fully calculated.
|
||||
Ok((size, is_exact)) if is_exact => {
|
||||
current_metrics.push((
|
||||
PageserverConsumptionMetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: Some(timeline.timeline_id),
|
||||
metric: TIMELINE_LOGICAL_SIZE,
|
||||
},
|
||||
size,
|
||||
));
|
||||
}
|
||||
Ok((_, _)) => {}
|
||||
Err(err) => {
|
||||
error!(
|
||||
"failed to get current logical size for timeline {}: {err:?}",
|
||||
timeline.timeline_id
|
||||
);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
tenant_resident_size += timeline.resident_physical_size();
|
||||
let timeline_resident_size = timeline.get_resident_physical_size();
|
||||
tenant_resident_size += timeline_resident_size;
|
||||
}
|
||||
|
||||
current_metrics
|
||||
.push(MetricsKey::remote_storage_size(tenant_id).at(Utc::now(), tenant.remote_size()));
|
||||
match tenant.get_remote_size().await {
|
||||
Ok(tenant_remote_size) => {
|
||||
current_metrics.push((
|
||||
PageserverConsumptionMetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: None,
|
||||
metric: REMOTE_STORAGE_SIZE,
|
||||
},
|
||||
tenant_remote_size,
|
||||
));
|
||||
}
|
||||
Err(err) => {
|
||||
error!(
|
||||
"failed to get remote size for tenant {}: {err:?}",
|
||||
tenant_id
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
current_metrics
|
||||
.push(MetricsKey::resident_size(tenant_id).at(Utc::now(), tenant_resident_size));
|
||||
current_metrics.push((
|
||||
PageserverConsumptionMetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: None,
|
||||
metric: RESIDENT_SIZE,
|
||||
},
|
||||
tenant_resident_size,
|
||||
));
|
||||
|
||||
// Note that this metric is calculated in a separate bgworker
|
||||
// Here we only use cached value, which may lag behind the real latest one
|
||||
let synthetic_size = tenant.cached_synthetic_size();
|
||||
let tenant_synthetic_size = tenant.get_cached_synthetic_size();
|
||||
|
||||
if synthetic_size != 0 {
|
||||
if tenant_synthetic_size != 0 {
|
||||
// only send non-zeroes because otherwise these show up as errors in logs
|
||||
current_metrics
|
||||
.push(MetricsKey::synthetic_size(tenant_id).at(Utc::now(), synthetic_size));
|
||||
current_metrics.push((
|
||||
PageserverConsumptionMetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: None,
|
||||
metric: SYNTHETIC_STORAGE_SIZE,
|
||||
},
|
||||
tenant_synthetic_size,
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
// Filter metrics, unless we want to send all metrics, including cached ones.
|
||||
// See: https://github.com/neondatabase/neon/issues/3485
|
||||
if !send_cached {
|
||||
current_metrics.retain(|(curr_key, (kind, curr_val))| {
|
||||
if kind.is_incremental() {
|
||||
// incremental values (currently only written_size_delta) should not get any cache
|
||||
// deduplication because they will be used by upstream for "is still alive."
|
||||
true
|
||||
} else {
|
||||
match cached_metrics.get(curr_key) {
|
||||
Some((_, val)) => val != curr_val,
|
||||
None => true,
|
||||
}
|
||||
}
|
||||
current_metrics.retain(|(curr_key, curr_val)| match cached_metrics.get(curr_key) {
|
||||
Some(val) => val != curr_val,
|
||||
None => true,
|
||||
});
|
||||
}
|
||||
|
||||
@@ -362,16 +268,14 @@ async fn collect_metrics_iteration(
|
||||
|
||||
let mut chunk_to_send: Vec<Event<Ids>> = Vec::with_capacity(CHUNK_SIZE);
|
||||
|
||||
let node_id = node_id.to_string();
|
||||
|
||||
for chunk in chunks {
|
||||
chunk_to_send.clear();
|
||||
|
||||
// enrich metrics with type,timestamp and idempotency key before sending
|
||||
chunk_to_send.extend(chunk.iter().map(|(curr_key, (when, curr_val))| Event {
|
||||
kind: *when,
|
||||
chunk_to_send.extend(chunk.iter().map(|(curr_key, curr_val)| Event {
|
||||
kind: EventType::Absolute { time: Utc::now() },
|
||||
metric: curr_key.metric,
|
||||
idempotency_key: idempotency_key(&node_id),
|
||||
idempotency_key: idempotency_key(node_id.to_string()),
|
||||
value: *curr_val,
|
||||
extra: Ids {
|
||||
tenant_id: curr_key.tenant_id,
|
||||
@@ -379,14 +283,17 @@ async fn collect_metrics_iteration(
|
||||
},
|
||||
}));
|
||||
|
||||
let chunk_json = serde_json::value::to_raw_value(&EventChunk {
|
||||
events: &chunk_to_send,
|
||||
})
|
||||
.expect("PageserverConsumptionMetric should not fail serialization");
|
||||
|
||||
const MAX_RETRIES: u32 = 3;
|
||||
|
||||
for attempt in 0..MAX_RETRIES {
|
||||
let res = client
|
||||
.post(metric_collection_endpoint.clone())
|
||||
.json(&EventChunk {
|
||||
events: (&chunk_to_send).into(),
|
||||
})
|
||||
.json(&chunk_json)
|
||||
.send()
|
||||
.await;
|
||||
|
||||
@@ -422,130 +329,6 @@ async fn collect_metrics_iteration(
|
||||
}
|
||||
}
|
||||
|
||||
/// Internal type to make timeline metric production testable.
|
||||
///
|
||||
/// As this value type contains all of the information needed from a timeline to produce the
|
||||
/// metrics, it can easily be created with different values in test.
|
||||
struct TimelineSnapshot {
|
||||
loaded_at: (Lsn, SystemTime),
|
||||
last_record_lsn: Lsn,
|
||||
current_exact_logical_size: Option<u64>,
|
||||
}
|
||||
|
||||
impl TimelineSnapshot {
|
||||
/// Collect the metrics from an actual timeline.
|
||||
///
|
||||
/// Fails currently only when [`Timeline::get_current_logical_size`] fails.
|
||||
///
|
||||
/// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
|
||||
fn collect(
|
||||
t: &Arc<crate::tenant::Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Option<Self>> {
|
||||
use anyhow::Context;
|
||||
|
||||
if !t.is_active() {
|
||||
// no collection for broken or stopping needed, we will still keep the cached values
|
||||
// though at the caller.
|
||||
Ok(None)
|
||||
} else {
|
||||
let loaded_at = t.loaded_at;
|
||||
let last_record_lsn = t.get_last_record_lsn();
|
||||
|
||||
let current_exact_logical_size = {
|
||||
let span = info_span!("collect_metrics_iteration", tenant_id = %t.tenant_id, timeline_id = %t.timeline_id);
|
||||
let res = span
|
||||
.in_scope(|| t.get_current_logical_size(ctx))
|
||||
.context("get_current_logical_size");
|
||||
match res? {
|
||||
// Only send timeline logical size when it is fully calculated.
|
||||
(size, is_exact) if is_exact => Some(size),
|
||||
(_, _) => None,
|
||||
}
|
||||
};
|
||||
|
||||
Ok(Some(TimelineSnapshot {
|
||||
loaded_at,
|
||||
last_record_lsn,
|
||||
current_exact_logical_size,
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
/// Produce the timeline consumption metrics into the `metrics` argument.
|
||||
fn to_metrics(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
now: DateTime<Utc>,
|
||||
metrics: &mut Vec<(MetricsKey, (EventType, u64))>,
|
||||
cache: &HashMap<MetricsKey, (EventType, u64)>,
|
||||
) {
|
||||
let timeline_written_size = u64::from(self.last_record_lsn);
|
||||
|
||||
let (key, written_size_now) =
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(now, timeline_written_size);
|
||||
|
||||
// last_record_lsn can only go up, right now at least, TODO: #2592 or related
|
||||
// features might change this.
|
||||
|
||||
let written_size_delta_key = MetricsKey::written_size_delta(tenant_id, timeline_id);
|
||||
|
||||
// use this when available, because in a stream of incremental values, it will be
|
||||
// accurate where as when last_record_lsn stops moving, we will only cache the last
|
||||
// one of those.
|
||||
let last_stop_time = cache
|
||||
.get(written_size_delta_key.key())
|
||||
.map(|(until, _val)| {
|
||||
until
|
||||
.incremental_timerange()
|
||||
.expect("never create EventType::Absolute for written_size_delta")
|
||||
.end
|
||||
});
|
||||
|
||||
// by default, use the last sent written_size as the basis for
|
||||
// calculating the delta. if we don't yet have one, use the load time value.
|
||||
let prev = cache
|
||||
.get(&key)
|
||||
.map(|(prev_at, prev)| {
|
||||
// use the prev time from our last incremental update, or default to latest
|
||||
// absolute update on the first round.
|
||||
let prev_at = prev_at
|
||||
.absolute_time()
|
||||
.expect("never create EventType::Incremental for written_size");
|
||||
let prev_at = last_stop_time.unwrap_or(prev_at);
|
||||
(*prev_at, *prev)
|
||||
})
|
||||
.unwrap_or_else(|| {
|
||||
// if we don't have a previous point of comparison, compare to the load time
|
||||
// lsn.
|
||||
let (disk_consistent_lsn, loaded_at) = &self.loaded_at;
|
||||
(DateTime::from(*loaded_at), disk_consistent_lsn.0)
|
||||
});
|
||||
|
||||
// written_size_bytes_delta
|
||||
metrics.extend(
|
||||
if let Some(delta) = written_size_now.1.checked_sub(prev.1) {
|
||||
let up_to = written_size_now
|
||||
.0
|
||||
.absolute_time()
|
||||
.expect("never create EventType::Incremental for written_size");
|
||||
let key_value = written_size_delta_key.from_previous_up_to(prev.0, *up_to, delta);
|
||||
Some(key_value)
|
||||
} else {
|
||||
None
|
||||
},
|
||||
);
|
||||
|
||||
// written_size
|
||||
metrics.push((key, written_size_now));
|
||||
|
||||
if let Some(size) = self.current_exact_logical_size {
|
||||
metrics.push(MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, size));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Caclculate synthetic size for each active tenant
|
||||
pub async fn calculate_synthetic_size_worker(
|
||||
synthetic_size_calculation_interval: Duration,
|
||||
@@ -560,7 +343,7 @@ pub async fn calculate_synthetic_size_worker(
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
return Ok(());
|
||||
},
|
||||
tick_at = ticker.tick() => {
|
||||
tick_at = ticker.tick() => {
|
||||
|
||||
let tenants = match mgr::list_tenants().await {
|
||||
Ok(tenants) => tenants,
|
||||
@@ -596,149 +379,3 @@ pub async fn calculate_synthetic_size_worker(
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::HashMap;
|
||||
|
||||
use std::time::SystemTime;
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use crate::consumption_metrics::MetricsKey;
|
||||
|
||||
use super::TimelineSnapshot;
|
||||
use chrono::{DateTime, Utc};
|
||||
|
||||
#[test]
|
||||
fn startup_collected_timeline_metrics_before_advancing() {
|
||||
let tenant_id = TenantId::generate();
|
||||
let timeline_id = TimelineId::generate();
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
let cache = HashMap::new();
|
||||
|
||||
let initdb_lsn = Lsn(0x10000);
|
||||
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
|
||||
|
||||
let snap = TimelineSnapshot {
|
||||
loaded_at: (disk_consistent_lsn, SystemTime::now()),
|
||||
last_record_lsn: disk_consistent_lsn,
|
||||
current_exact_logical_size: Some(0x42000),
|
||||
};
|
||||
|
||||
let now = DateTime::<Utc>::from(SystemTime::now());
|
||||
|
||||
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
|
||||
|
||||
assert_eq!(
|
||||
metrics,
|
||||
&[
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
|
||||
snap.loaded_at.1.into(),
|
||||
now,
|
||||
0
|
||||
),
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
|
||||
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn startup_collected_timeline_metrics_second_round() {
|
||||
let tenant_id = TenantId::generate();
|
||||
let timeline_id = TimelineId::generate();
|
||||
|
||||
let [now, before, init] = time_backwards();
|
||||
|
||||
let now = DateTime::<Utc>::from(now);
|
||||
let before = DateTime::<Utc>::from(before);
|
||||
|
||||
let initdb_lsn = Lsn(0x10000);
|
||||
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
let cache = HashMap::from([
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0)
|
||||
]);
|
||||
|
||||
let snap = TimelineSnapshot {
|
||||
loaded_at: (disk_consistent_lsn, init),
|
||||
last_record_lsn: disk_consistent_lsn,
|
||||
current_exact_logical_size: Some(0x42000),
|
||||
};
|
||||
|
||||
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
|
||||
|
||||
assert_eq!(
|
||||
metrics,
|
||||
&[
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id)
|
||||
.from_previous_up_to(before, now, 0),
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
|
||||
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
|
||||
let tenant_id = TenantId::generate();
|
||||
let timeline_id = TimelineId::generate();
|
||||
|
||||
let [now, just_before, before, init] = time_backwards();
|
||||
|
||||
let now = DateTime::<Utc>::from(now);
|
||||
let just_before = DateTime::<Utc>::from(just_before);
|
||||
let before = DateTime::<Utc>::from(before);
|
||||
|
||||
let initdb_lsn = Lsn(0x10000);
|
||||
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
let cache = HashMap::from([
|
||||
// at t=before was the last time the last_record_lsn changed
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0),
|
||||
// end time of this event is used for the next ones
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
|
||||
before,
|
||||
just_before,
|
||||
0,
|
||||
),
|
||||
]);
|
||||
|
||||
let snap = TimelineSnapshot {
|
||||
loaded_at: (disk_consistent_lsn, init),
|
||||
last_record_lsn: disk_consistent_lsn,
|
||||
current_exact_logical_size: Some(0x42000),
|
||||
};
|
||||
|
||||
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
|
||||
|
||||
assert_eq!(
|
||||
metrics,
|
||||
&[
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
|
||||
just_before,
|
||||
now,
|
||||
0
|
||||
),
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
|
||||
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
fn time_backwards<const N: usize>() -> [std::time::SystemTime; N] {
|
||||
let mut times = [std::time::SystemTime::UNIX_EPOCH; N];
|
||||
times[0] = std::time::SystemTime::now();
|
||||
for behind in 1..N {
|
||||
times[behind] = times[0] - std::time::Duration::from_secs(behind as u64);
|
||||
}
|
||||
|
||||
times
|
||||
}
|
||||
}
|
||||
|
||||
@@ -179,9 +179,6 @@ impl RequestContext {
|
||||
/// a context and you are unwilling to change all callers to provide one.
|
||||
///
|
||||
/// Before we add cancellation, we should get rid of this method.
|
||||
///
|
||||
/// [`attached_child`]: Self::attached_child
|
||||
/// [`detached_child`]: Self::detached_child
|
||||
pub fn todo_child(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
|
||||
Self::new(task_kind, download_behavior)
|
||||
}
|
||||
|
||||
@@ -60,7 +60,7 @@ use utils::serde_percent::Percent;
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
||||
tenant::{self, storage_layer::PersistentLayer, timeline::EvictionError, Timeline},
|
||||
tenant::{self, storage_layer::PersistentLayer, Timeline},
|
||||
};
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
@@ -166,11 +166,11 @@ async fn disk_usage_eviction_task(
|
||||
.await;
|
||||
|
||||
let sleep_until = start + task_config.period;
|
||||
if tokio::time::timeout_at(sleep_until, cancel.cancelled())
|
||||
.await
|
||||
.is_ok()
|
||||
{
|
||||
break;
|
||||
tokio::select! {
|
||||
_ = tokio::time::sleep_until(sleep_until) => {},
|
||||
_ = cancel.cancelled() => {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -304,18 +304,17 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
// Debug-log the list of candidates
|
||||
let now = SystemTime::now();
|
||||
for (i, (partition, candidate)) in candidates.iter().enumerate() {
|
||||
let desc = candidate.layer.layer_desc();
|
||||
debug!(
|
||||
"cand {}/{}: size={}, no_access_for={}us, partition={:?}, {}/{}/{}",
|
||||
i + 1,
|
||||
candidates.len(),
|
||||
desc.file_size,
|
||||
candidate.layer.file_size(),
|
||||
now.duration_since(candidate.last_activity_ts)
|
||||
.unwrap()
|
||||
.as_micros(),
|
||||
partition,
|
||||
desc.tenant_id,
|
||||
desc.timeline_id,
|
||||
candidate.layer.get_tenant_id(),
|
||||
candidate.layer.get_timeline_id(),
|
||||
candidate.layer,
|
||||
);
|
||||
}
|
||||
@@ -347,7 +346,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
warned = Some(usage_planned);
|
||||
}
|
||||
|
||||
usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);
|
||||
usage_planned.add_available_bytes(candidate.layer.file_size());
|
||||
|
||||
batched
|
||||
.entry(TimelineKey(candidate.timeline))
|
||||
@@ -390,31 +389,25 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
Ok(results) => {
|
||||
assert_eq!(results.len(), batch.len());
|
||||
for (result, layer) in results.into_iter().zip(batch.iter()) {
|
||||
let file_size = layer.layer_desc().file_size;
|
||||
match result {
|
||||
Some(Ok(())) => {
|
||||
usage_assumed.add_available_bytes(file_size);
|
||||
Some(Ok(true)) => {
|
||||
usage_assumed.add_available_bytes(layer.file_size());
|
||||
}
|
||||
Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
|
||||
unreachable!("get_local_layers_for_disk_usage_eviction finds only local layers")
|
||||
}
|
||||
Some(Err(EvictionError::FileNotFound)) => {
|
||||
evictions_failed.file_sizes += file_size;
|
||||
evictions_failed.count += 1;
|
||||
}
|
||||
Some(Err(
|
||||
e @ EvictionError::LayerNotFound(_)
|
||||
| e @ EvictionError::StatFailed(_),
|
||||
)) => {
|
||||
let e = utils::error::report_compact_sources(&e);
|
||||
warn!(%layer, "failed to evict layer: {e}");
|
||||
evictions_failed.file_sizes += file_size;
|
||||
Some(Ok(false)) => {
|
||||
// this is:
|
||||
// - Replacement::{NotFound, Unexpected}
|
||||
// - it cannot be is_remote_layer, filtered already
|
||||
evictions_failed.file_sizes += layer.file_size();
|
||||
evictions_failed.count += 1;
|
||||
}
|
||||
None => {
|
||||
assert!(cancel.is_cancelled());
|
||||
return;
|
||||
}
|
||||
Some(Err(e)) => {
|
||||
// we really shouldn't be getting this, precondition failure
|
||||
error!("failed to evict layer: {:#}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -547,12 +540,12 @@ async fn collect_eviction_candidates(
|
||||
// We could be better here, e.g., sum of all L0 layers + most recent L1 layer.
|
||||
// That's what's typically used by the various background loops.
|
||||
//
|
||||
// The default can be overridden with a fixed value in the tenant conf.
|
||||
// The default can be overriden with a fixed value in the tenant conf.
|
||||
// A default override can be put in the default tenant conf in the pageserver.toml.
|
||||
let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() {
|
||||
debug!(
|
||||
tenant_id=%tenant.tenant_id(),
|
||||
overridden_size=s,
|
||||
overriden_size=s,
|
||||
"using overridden min resident size for tenant"
|
||||
);
|
||||
s
|
||||
|
||||
@@ -994,29 +994,31 @@ async fn timeline_gc_handler(
|
||||
// Run compaction immediately on given timeline.
|
||||
async fn timeline_compact_handler(
|
||||
request: Request<Body>,
|
||||
cancel: CancellationToken,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
async {
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
||||
timeline
|
||||
.compact(&cancel, &ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
.instrument(info_span!("manual_compaction", %tenant_id, %timeline_id))
|
||||
.await
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let result_receiver = mgr::immediate_compact(tenant_id, timeline_id, &ctx)
|
||||
.await
|
||||
.context("spawn compaction task")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
let result: anyhow::Result<()> = result_receiver
|
||||
.await
|
||||
.context("receive compaction result")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
result.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
// Run checkpoint immediately on given timeline.
|
||||
async fn timeline_checkpoint_handler(
|
||||
request: Request<Body>,
|
||||
cancel: CancellationToken,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
@@ -1029,13 +1031,13 @@ async fn timeline_checkpoint_handler(
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
timeline
|
||||
.compact(&cancel, &ctx)
|
||||
.compact(&ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
.instrument(info_span!("manual_checkpoint", %tenant_id, %timeline_id))
|
||||
.instrument(info_span!("manual_checkpoint", tenant_id = %tenant_id, timeline_id = %timeline_id))
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -1141,7 +1143,7 @@ async fn disk_usage_eviction_run(
|
||||
let Some(storage) = state.remote_storage.clone() else {
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"remote storage not configured, cannot run eviction iteration"
|
||||
)));
|
||||
)))
|
||||
};
|
||||
|
||||
let state = state.disk_usage_eviction_state.clone();
|
||||
|
||||
@@ -7,7 +7,7 @@ pub mod disk_usage_eviction_task;
|
||||
pub mod http;
|
||||
pub mod import_datadir;
|
||||
pub mod keyspace;
|
||||
pub mod metrics;
|
||||
pub(crate) mod metrics;
|
||||
pub mod page_cache;
|
||||
pub mod page_service;
|
||||
pub mod pgdatadir_mapping;
|
||||
@@ -47,50 +47,24 @@ pub use crate::metrics::preinitialize_metrics;
|
||||
|
||||
#[tracing::instrument]
|
||||
pub async fn shutdown_pageserver(exit_code: i32) {
|
||||
use std::time::Duration;
|
||||
// Shut down the libpq endpoint task. This prevents new connections from
|
||||
// being accepted.
|
||||
timed(
|
||||
task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None),
|
||||
"shutdown LibpqEndpointListener",
|
||||
Duration::from_secs(1),
|
||||
)
|
||||
.await;
|
||||
task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None).await;
|
||||
|
||||
// Shut down any page service tasks.
|
||||
timed(
|
||||
task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
|
||||
"shutdown PageRequestHandlers",
|
||||
Duration::from_secs(1),
|
||||
)
|
||||
.await;
|
||||
task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None).await;
|
||||
|
||||
// Shut down all the tenants. This flushes everything to disk and kills
|
||||
// the checkpoint and GC tasks.
|
||||
timed(
|
||||
tenant::mgr::shutdown_all_tenants(),
|
||||
"shutdown all tenants",
|
||||
Duration::from_secs(5),
|
||||
)
|
||||
.await;
|
||||
tenant::mgr::shutdown_all_tenants().await;
|
||||
|
||||
// Shut down the HTTP endpoint last, so that you can still check the server's
|
||||
// status while it's shutting down.
|
||||
// FIXME: We should probably stop accepting commands like attach/detach earlier.
|
||||
timed(
|
||||
task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None),
|
||||
"shutdown http",
|
||||
Duration::from_secs(1),
|
||||
)
|
||||
.await;
|
||||
task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None).await;
|
||||
|
||||
// There should be nothing left, but let's be sure
|
||||
timed(
|
||||
task_mgr::shutdown_tasks(None, None, None),
|
||||
"shutdown leftovers",
|
||||
Duration::from_secs(1),
|
||||
)
|
||||
.await;
|
||||
task_mgr::shutdown_tasks(None, None, None).await;
|
||||
info!("Shut down successfully completed");
|
||||
std::process::exit(exit_code);
|
||||
}
|
||||
@@ -135,8 +109,6 @@ pub const TEMP_FILE_SUFFIX: &str = "___temp";
|
||||
/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>___uninit`.
|
||||
pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
|
||||
|
||||
pub const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";
|
||||
|
||||
/// A marker file to prevent pageserver from loading a certain tenant on restart.
|
||||
/// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding
|
||||
/// `ignore` management API command, that expects the ignored tenant to be properly loaded
|
||||
@@ -151,30 +123,15 @@ pub fn is_temporary(path: &Path) -> bool {
|
||||
}
|
||||
}
|
||||
|
||||
fn ends_with_suffix(path: &Path, suffix: &str) -> bool {
|
||||
pub fn is_uninit_mark(path: &Path) -> bool {
|
||||
match path.file_name() {
|
||||
Some(name) => name.to_string_lossy().ends_with(suffix),
|
||||
Some(name) => name
|
||||
.to_string_lossy()
|
||||
.ends_with(TIMELINE_UNINIT_MARK_SUFFIX),
|
||||
None => false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_uninit_mark(path: &Path) -> bool {
|
||||
ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
|
||||
}
|
||||
|
||||
pub fn is_delete_mark(path: &Path) -> bool {
|
||||
ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX)
|
||||
}
|
||||
|
||||
fn is_walkdir_io_not_found(e: &walkdir::Error) -> bool {
|
||||
if let Some(e) = e.io_error() {
|
||||
if e.kind() == std::io::ErrorKind::NotFound {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
/// During pageserver startup, we need to order operations not to exhaust tokio worker threads by
|
||||
/// blocking.
|
||||
///
|
||||
@@ -198,45 +155,6 @@ pub struct InitializationOrder {
|
||||
pub background_jobs_can_start: utils::completion::Barrier,
|
||||
}
|
||||
|
||||
/// Time the future with a warning when it exceeds a threshold.
|
||||
async fn timed<Fut: std::future::Future>(
|
||||
fut: Fut,
|
||||
name: &str,
|
||||
warn_at: std::time::Duration,
|
||||
) -> <Fut as std::future::Future>::Output {
|
||||
let started = std::time::Instant::now();
|
||||
|
||||
let mut fut = std::pin::pin!(fut);
|
||||
|
||||
match tokio::time::timeout(warn_at, &mut fut).await {
|
||||
Ok(ret) => {
|
||||
tracing::info!(
|
||||
task = name,
|
||||
elapsed_ms = started.elapsed().as_millis(),
|
||||
"completed"
|
||||
);
|
||||
ret
|
||||
}
|
||||
Err(_) => {
|
||||
tracing::info!(
|
||||
task = name,
|
||||
elapsed_ms = started.elapsed().as_millis(),
|
||||
"still waiting, taking longer than expected..."
|
||||
);
|
||||
|
||||
let ret = fut.await;
|
||||
|
||||
tracing::warn!(
|
||||
task = name,
|
||||
elapsed_ms = started.elapsed().as_millis(),
|
||||
"completed, took longer than expected"
|
||||
);
|
||||
|
||||
ret
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod backoff_defaults_tests {
|
||||
use super::*;
|
||||
@@ -267,36 +185,3 @@ mod backoff_defaults_tests {
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod timed_tests {
|
||||
use super::timed;
|
||||
use std::time::Duration;
|
||||
|
||||
#[tokio::test]
|
||||
async fn timed_completes_when_inner_future_completes() {
|
||||
// A future that completes on time should have its result returned
|
||||
let r1 = timed(
|
||||
async move {
|
||||
tokio::time::sleep(Duration::from_millis(10)).await;
|
||||
123
|
||||
},
|
||||
"test 1",
|
||||
Duration::from_millis(50),
|
||||
)
|
||||
.await;
|
||||
assert_eq!(r1, 123);
|
||||
|
||||
// A future that completes too slowly should also have its result returned
|
||||
let r1 = timed(
|
||||
async move {
|
||||
tokio::time::sleep(Duration::from_millis(50)).await;
|
||||
456
|
||||
},
|
||||
"test 1",
|
||||
Duration::from_millis(10),
|
||||
)
|
||||
.await;
|
||||
assert_eq!(r1, 456);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
use metrics::metric_vec_duration::DurationResultObserver;
|
||||
use metrics::{
|
||||
register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
|
||||
register_int_counter, register_int_counter_vec, register_int_gauge, register_int_gauge_vec,
|
||||
register_uint_gauge, register_uint_gauge_vec, Counter, CounterVec, GaugeVec, Histogram,
|
||||
HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
|
||||
register_counter_vec, register_histogram, register_histogram_vec, register_int_counter,
|
||||
register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge,
|
||||
register_uint_gauge_vec, Counter, CounterVec, Histogram, HistogramVec, IntCounter,
|
||||
IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::models::TenantState;
|
||||
use strum::VariantNames;
|
||||
use strum_macros::{EnumVariantNames, IntoStaticStr};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
@@ -73,7 +74,7 @@ pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
// Buckets for background operations like compaction, GC, size calculation
|
||||
const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0];
|
||||
|
||||
pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
pub static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_storage_operations_seconds_global",
|
||||
"Time spent on storage operations",
|
||||
@@ -83,17 +84,18 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static READ_NUM_FS_LAYERS: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
static READ_NUM_FS_LAYERS: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_read_num_fs_layers",
|
||||
"Number of persistent layers accessed for processing a read request, including those in the cache",
|
||||
&["tenant_id", "timeline_id"],
|
||||
vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
// Metrics collected on operations on the storage repository.
|
||||
pub(crate) static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
pub static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_getpage_reconstruct_seconds",
|
||||
"Time spent in reconstruct_value (reconstruct a page from deltas)",
|
||||
@@ -102,7 +104,7 @@ pub(crate) static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
|
||||
pub static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_materialized_cache_hits_direct_total",
|
||||
"Number of cache hits from materialized page cache without redo",
|
||||
@@ -110,16 +112,17 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::n
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
static GET_RECONSTRUCT_DATA_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_getpage_get_reconstruct_data_seconds",
|
||||
"Time spent in get_reconstruct_value_data",
|
||||
&["tenant_id", "timeline_id"],
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
|
||||
pub static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_materialized_cache_hits_total",
|
||||
"Number of cache hits from materialized page cache",
|
||||
@@ -243,10 +246,11 @@ pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheS
|
||||
},
|
||||
});
|
||||
|
||||
pub(crate) static WAIT_LSN_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
static WAIT_LSN_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_wait_lsn_seconds",
|
||||
"Time spent waiting for WAL to arrive",
|
||||
&["tenant_id", "timeline_id"],
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
@@ -280,7 +284,7 @@ static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
pub static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_remote_ondemand_downloaded_layers_total",
|
||||
"Total on-demand downloaded layers"
|
||||
@@ -288,7 +292,7 @@ pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::ne
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
|
||||
pub static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_remote_ondemand_downloaded_bytes_total",
|
||||
"Total bytes of layers on-demand downloaded",
|
||||
@@ -305,29 +309,16 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define current logical size metric")
|
||||
});
|
||||
|
||||
pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
pub static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_tenant_states_count",
|
||||
"Count of tenants per state",
|
||||
&["state"]
|
||||
&["tenant_id", "state"]
|
||||
)
|
||||
.expect("Failed to register pageserver_tenant_states_count metric")
|
||||
});
|
||||
|
||||
/// A set of broken tenants.
|
||||
///
|
||||
/// These are expected to be so rare that a set is fine. Set as in a new timeseries per each broken
|
||||
/// tenant.
|
||||
pub(crate) static BROKEN_TENANTS_SET: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_broken_tenants_count",
|
||||
"Set of broken tenants",
|
||||
&["tenant_id"]
|
||||
)
|
||||
.expect("Failed to register pageserver_tenant_states_count metric")
|
||||
});
|
||||
|
||||
pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
pub static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_tenant_synthetic_cached_size_bytes",
|
||||
"Synthetic size of each tenant in bytes",
|
||||
@@ -385,7 +376,7 @@ static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy<IntCounterVec> = Lazy::new(||
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
pub static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_unexpected_ondemand_downloads_count",
|
||||
"Number of unexpected on-demand downloads. \
|
||||
@@ -394,36 +385,7 @@ pub(crate) static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(||
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
/// How long did we take to start up? Broken down by labels to describe
|
||||
/// different phases of startup.
|
||||
pub static STARTUP_DURATION: Lazy<GaugeVec> = Lazy::new(|| {
|
||||
register_gauge_vec!(
|
||||
"pageserver_startup_duration_seconds",
|
||||
"Time taken by phases of pageserver startup, in seconds",
|
||||
&["phase"]
|
||||
)
|
||||
.expect("Failed to register pageserver_startup_duration_seconds metric")
|
||||
});
|
||||
|
||||
pub static STARTUP_IS_LOADING: Lazy<UIntGauge> = Lazy::new(|| {
|
||||
register_uint_gauge!(
|
||||
"pageserver_startup_is_loading",
|
||||
"1 while in initial startup load of tenants, 0 at other times"
|
||||
)
|
||||
.expect("Failed to register pageserver_startup_is_loading")
|
||||
});
|
||||
|
||||
/// How long did tenants take to go from construction to active state?
|
||||
pub(crate) static TENANT_ACTIVATION: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_tenant_activation_seconds",
|
||||
"Time taken by tenants to activate, in seconds",
|
||||
CRITICAL_OP_BUCKETS.into()
|
||||
)
|
||||
.expect("Failed to register pageserver_tenant_activation_seconds metric")
|
||||
});
|
||||
|
||||
/// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
|
||||
/// Each [`Timeline`]'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
|
||||
#[derive(Debug)]
|
||||
pub struct EvictionsWithLowResidenceDuration {
|
||||
data_source: &'static str,
|
||||
@@ -537,31 +499,23 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
|
||||
30.000, // 30000 ms
|
||||
];
|
||||
|
||||
/// Tracks time taken by fs operations near VirtualFile.
|
||||
///
|
||||
/// Operations:
|
||||
/// - open ([`std::fs::OpenOptions::open`])
|
||||
/// - close (dropping [`std::fs::File`])
|
||||
/// - close-by-replace (close by replacement algorithm)
|
||||
/// - read (`read_at`)
|
||||
/// - write (`write_at`)
|
||||
/// - seek (modify internal position or file length query)
|
||||
/// - fsync ([`std::fs::File::sync_all`])
|
||||
/// - metadata ([`std::fs::File::metadata`])
|
||||
pub(crate) static STORAGE_IO_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[
|
||||
"open", "close", "read", "write", "seek", "fsync", "gc", "metadata",
|
||||
];
|
||||
|
||||
const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
|
||||
|
||||
pub static STORAGE_IO_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_io_operations_seconds",
|
||||
"Time spent in IO operations",
|
||||
&["operation"],
|
||||
&["operation", "tenant_id", "timeline_id"],
|
||||
STORAGE_IO_TIME_BUCKETS.into()
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
|
||||
|
||||
// Needed for the https://neonprod.grafana.net/d/5uK9tHL4k/picking-tenant-for-relocation?orgId=1
|
||||
pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
pub static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
"pageserver_io_operations_bytes_total",
|
||||
"Total amount of bytes read/written in IO operations",
|
||||
@@ -587,17 +541,6 @@ pub static SMGR_QUERY_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
// keep in sync with control plane Go code so that we can validate
|
||||
// compute's basebackup_ms metric with our perspective in the context of SLI/SLO.
|
||||
static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
|
||||
// Go code uses milliseconds. Variable is called `computeStartupBuckets`
|
||||
[
|
||||
5, 10, 20, 30, 50, 70, 100, 120, 150, 200, 250, 300, 350, 400, 450, 500, 600, 800, 1000,
|
||||
1500, 2000, 2500, 3000, 5000, 10000, 20000, 40000, 60000,
|
||||
]
|
||||
.map(|ms| (ms as f64) / 1000.0)
|
||||
});
|
||||
|
||||
pub struct BasebackupQueryTime(HistogramVec);
|
||||
pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
|
||||
BasebackupQueryTime({
|
||||
@@ -605,7 +548,7 @@ pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
|
||||
"pageserver_basebackup_query_seconds",
|
||||
"Histogram of basebackup queries durations, by result type",
|
||||
&["result"],
|
||||
COMPUTE_STARTUP_BUCKETS.to_vec(),
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
})
|
||||
@@ -651,7 +594,7 @@ static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new
|
||||
at a given instant. It gives you a better idea of the queue depth \
|
||||
than plotting the gauge directly, since operations may complete faster \
|
||||
than the sampling interval.",
|
||||
&["file_kind", "op_kind"],
|
||||
&["tenant_id", "timeline_id", "file_kind", "op_kind"],
|
||||
// The calls_unfinished gauge is an integer gauge, hence we have integer buckets.
|
||||
vec![0.0, 1.0, 2.0, 4.0, 6.0, 8.0, 10.0, 15.0, 20.0, 40.0, 60.0, 80.0, 100.0, 500.0],
|
||||
)
|
||||
@@ -708,18 +651,18 @@ impl RemoteOpFileKind {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
pub static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_remote_operation_seconds",
|
||||
"Time spent on remote storage operations. \
|
||||
Grouped by tenant, timeline, operation_kind and status. \
|
||||
Does not account for time spent waiting in remote timeline client's queues.",
|
||||
&["file_kind", "op_kind", "status"]
|
||||
&["tenant_id", "timeline_id", "file_kind", "op_kind", "status"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
pub static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_tenant_task_events",
|
||||
"Number of task start/stop/fail events.",
|
||||
@@ -728,7 +671,7 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
.expect("Failed to register tenant_task_events metric")
|
||||
});
|
||||
|
||||
pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
pub static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_background_loop_period_overrun_count",
|
||||
"Incremented whenever warn_when_period_overrun() logs a warning.",
|
||||
@@ -739,7 +682,7 @@ pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = La
|
||||
|
||||
// walreceiver metrics
|
||||
|
||||
pub(crate) static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
pub static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_walreceiver_started_connections_total",
|
||||
"Number of started walreceiver connections"
|
||||
@@ -747,7 +690,7 @@ pub(crate) static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
|
||||
pub static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
|
||||
register_int_gauge!(
|
||||
"pageserver_walreceiver_active_managers",
|
||||
"Number of active walreceiver managers"
|
||||
@@ -755,7 +698,7 @@ pub(crate) static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
pub static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_walreceiver_switches_total",
|
||||
"Number of walreceiver manager change_connection calls",
|
||||
@@ -764,7 +707,7 @@ pub(crate) static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
|
||||
pub static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_walreceiver_broker_updates_total",
|
||||
"Number of received broker updates in walreceiver"
|
||||
@@ -772,7 +715,7 @@ pub(crate) static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
pub static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_walreceiver_candidates_events_total",
|
||||
"Number of walreceiver candidate events",
|
||||
@@ -781,10 +724,10 @@ pub(crate) static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
|
||||
pub static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
|
||||
Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["add"]));
|
||||
|
||||
pub(crate) static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
|
||||
pub static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
|
||||
Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"]));
|
||||
|
||||
// Metrics collected on WAL redo operations
|
||||
@@ -831,7 +774,7 @@ macro_rules! redo_bytes_histogram_count_buckets {
|
||||
};
|
||||
}
|
||||
|
||||
pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
pub static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_wal_redo_seconds",
|
||||
"Time spent on WAL redo",
|
||||
@@ -840,7 +783,7 @@ pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
pub static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_wal_redo_wait_seconds",
|
||||
"Time spent waiting for access to the Postgres WAL redo process",
|
||||
@@ -849,7 +792,7 @@ pub(crate) static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
||||
pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_wal_redo_records_histogram",
|
||||
"Histogram of number of records replayed per redo in the Postgres WAL redo process",
|
||||
@@ -858,7 +801,7 @@ pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
||||
pub static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_wal_redo_bytes_histogram",
|
||||
"Histogram of number of records replayed per redo sent to Postgres",
|
||||
@@ -867,8 +810,7 @@ pub(crate) static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
// FIXME: isn't this already included by WAL_REDO_RECORDS_HISTOGRAM which has _count?
|
||||
pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
|
||||
pub static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_replayed_wal_records_total",
|
||||
"Number of WAL records replayed in WAL redo process"
|
||||
@@ -876,7 +818,7 @@ pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
/// Similar to `prometheus::HistogramTimer` but does not record on drop.
|
||||
/// Similar to [`prometheus::HistogramTimer`] but does not record on drop.
|
||||
pub struct StorageTimeMetricsTimer {
|
||||
metrics: StorageTimeMetrics,
|
||||
start: Instant,
|
||||
@@ -934,7 +876,7 @@ impl StorageTimeMetrics {
|
||||
|
||||
/// Starts timing a new operation.
|
||||
///
|
||||
/// Note: unlike `prometheus::HistogramTimer` the returned timer does not record on drop.
|
||||
/// Note: unlike [`prometheus::HistogramTimer`] the returned timer does not record on drop.
|
||||
pub fn start_timer(&self) -> StorageTimeMetricsTimer {
|
||||
StorageTimeMetricsTimer::new(self.clone())
|
||||
}
|
||||
@@ -944,6 +886,7 @@ impl StorageTimeMetrics {
|
||||
pub struct TimelineMetrics {
|
||||
tenant_id: String,
|
||||
timeline_id: String,
|
||||
pub get_reconstruct_data_time_histo: Histogram,
|
||||
pub flush_time_histo: StorageTimeMetrics,
|
||||
pub compact_time_histo: StorageTimeMetrics,
|
||||
pub create_images_time_histo: StorageTimeMetrics,
|
||||
@@ -952,7 +895,9 @@ pub struct TimelineMetrics {
|
||||
pub load_layer_map_histo: StorageTimeMetrics,
|
||||
pub garbage_collect_histo: StorageTimeMetrics,
|
||||
pub last_record_gauge: IntGauge,
|
||||
pub wait_lsn_time_histo: Histogram,
|
||||
pub resident_physical_size_gauge: UIntGauge,
|
||||
pub read_num_fs_layers: Histogram,
|
||||
/// copy of LayeredTimeline.current_logical_size
|
||||
pub current_logical_size_gauge: UIntGauge,
|
||||
pub num_persistent_files_created: IntCounter,
|
||||
@@ -969,6 +914,9 @@ impl TimelineMetrics {
|
||||
) -> Self {
|
||||
let tenant_id = tenant_id.to_string();
|
||||
let timeline_id = timeline_id.to_string();
|
||||
let get_reconstruct_data_time_histo = GET_RECONSTRUCT_DATA_TIME
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let flush_time_histo =
|
||||
StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
|
||||
let compact_time_histo =
|
||||
@@ -989,6 +937,9 @@ impl TimelineMetrics {
|
||||
let last_record_gauge = LAST_RECORD_LSN
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let wait_lsn_time_histo = WAIT_LSN_TIME
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
@@ -1004,12 +955,16 @@ impl TimelineMetrics {
|
||||
let evictions = EVICTIONS
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let read_num_fs_layers = READ_NUM_FS_LAYERS
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let evictions_with_low_residence_duration =
|
||||
evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);
|
||||
|
||||
TimelineMetrics {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
get_reconstruct_data_time_histo,
|
||||
flush_time_histo,
|
||||
compact_time_histo,
|
||||
create_images_time_histo,
|
||||
@@ -1018,6 +973,7 @@ impl TimelineMetrics {
|
||||
garbage_collect_histo,
|
||||
load_layer_map_histo,
|
||||
last_record_gauge,
|
||||
wait_lsn_time_histo,
|
||||
resident_physical_size_gauge,
|
||||
current_logical_size_gauge,
|
||||
num_persistent_files_created,
|
||||
@@ -1026,6 +982,7 @@ impl TimelineMetrics {
|
||||
evictions_with_low_residence_duration: std::sync::RwLock::new(
|
||||
evictions_with_low_residence_duration,
|
||||
),
|
||||
read_num_fs_layers,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1034,12 +991,15 @@ impl Drop for TimelineMetrics {
|
||||
fn drop(&mut self) {
|
||||
let tenant_id = &self.tenant_id;
|
||||
let timeline_id = &self.timeline_id;
|
||||
let _ = GET_RECONSTRUCT_DATA_TIME.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = READ_NUM_FS_LAYERS.remove_label_values(&[tenant_id, timeline_id]);
|
||||
|
||||
self.evictions_with_low_residence_duration
|
||||
.write()
|
||||
@@ -1051,6 +1011,9 @@ impl Drop for TimelineMetrics {
|
||||
let _ =
|
||||
STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
|
||||
}
|
||||
for op in STORAGE_IO_TIME_OPERATIONS {
|
||||
let _ = STORAGE_IO_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
|
||||
}
|
||||
|
||||
for op in STORAGE_IO_SIZE_OPERATIONS {
|
||||
let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, timeline_id]);
|
||||
@@ -1065,7 +1028,9 @@ impl Drop for TimelineMetrics {
|
||||
pub fn remove_tenant_metrics(tenant_id: &TenantId) {
|
||||
let tid = tenant_id.to_string();
|
||||
let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
|
||||
// we leave the BROKEN_TENANTS_SET entry if any
|
||||
for state in TenantState::VARIANTS {
|
||||
let _ = TENANT_STATE_METRIC.remove_label_values(&[&tid, state]);
|
||||
}
|
||||
}
|
||||
|
||||
use futures::Future;
|
||||
@@ -1080,7 +1045,9 @@ pub struct RemoteTimelineClientMetrics {
|
||||
tenant_id: String,
|
||||
timeline_id: String,
|
||||
remote_physical_size_gauge: Mutex<Option<UIntGauge>>,
|
||||
remote_operation_time: Mutex<HashMap<(&'static str, &'static str, &'static str), Histogram>>,
|
||||
calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
|
||||
calls_started_hist: Mutex<HashMap<(&'static str, &'static str), Histogram>>,
|
||||
bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
|
||||
bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
|
||||
}
|
||||
@@ -1090,13 +1057,14 @@ impl RemoteTimelineClientMetrics {
|
||||
RemoteTimelineClientMetrics {
|
||||
tenant_id: tenant_id.to_string(),
|
||||
timeline_id: timeline_id.to_string(),
|
||||
remote_operation_time: Mutex::new(HashMap::default()),
|
||||
calls_unfinished_gauge: Mutex::new(HashMap::default()),
|
||||
calls_started_hist: Mutex::new(HashMap::default()),
|
||||
bytes_started_counter: Mutex::new(HashMap::default()),
|
||||
bytes_finished_counter: Mutex::new(HashMap::default()),
|
||||
remote_physical_size_gauge: Mutex::new(None),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn remote_physical_size_gauge(&self) -> UIntGauge {
|
||||
let mut guard = self.remote_physical_size_gauge.lock().unwrap();
|
||||
guard
|
||||
@@ -1110,17 +1078,26 @@ impl RemoteTimelineClientMetrics {
|
||||
})
|
||||
.clone()
|
||||
}
|
||||
|
||||
pub fn remote_operation_time(
|
||||
&self,
|
||||
file_kind: &RemoteOpFileKind,
|
||||
op_kind: &RemoteOpKind,
|
||||
status: &'static str,
|
||||
) -> Histogram {
|
||||
let mut guard = self.remote_operation_time.lock().unwrap();
|
||||
let key = (file_kind.as_str(), op_kind.as_str(), status);
|
||||
REMOTE_OPERATION_TIME
|
||||
.get_metric_with_label_values(&[key.0, key.1, key.2])
|
||||
.unwrap()
|
||||
let metric = guard.entry(key).or_insert_with(move || {
|
||||
REMOTE_OPERATION_TIME
|
||||
.get_metric_with_label_values(&[
|
||||
&self.tenant_id.to_string(),
|
||||
&self.timeline_id.to_string(),
|
||||
key.0,
|
||||
key.1,
|
||||
key.2,
|
||||
])
|
||||
.unwrap()
|
||||
});
|
||||
metric.clone()
|
||||
}
|
||||
|
||||
fn calls_unfinished_gauge(
|
||||
@@ -1148,10 +1125,19 @@ impl RemoteTimelineClientMetrics {
|
||||
file_kind: &RemoteOpFileKind,
|
||||
op_kind: &RemoteOpKind,
|
||||
) -> Histogram {
|
||||
let mut guard = self.calls_started_hist.lock().unwrap();
|
||||
let key = (file_kind.as_str(), op_kind.as_str());
|
||||
REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST
|
||||
.get_metric_with_label_values(&[key.0, key.1])
|
||||
.unwrap()
|
||||
let metric = guard.entry(key).or_insert_with(move || {
|
||||
REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST
|
||||
.get_metric_with_label_values(&[
|
||||
&self.tenant_id.to_string(),
|
||||
&self.timeline_id.to_string(),
|
||||
key.0,
|
||||
key.1,
|
||||
])
|
||||
.unwrap()
|
||||
});
|
||||
metric.clone()
|
||||
}
|
||||
|
||||
fn bytes_started_counter(
|
||||
@@ -1270,7 +1256,7 @@ impl RemoteTimelineClientMetrics {
|
||||
/// Update the metrics that change when a call to the remote timeline client instance starts.
|
||||
///
|
||||
/// Drop the returned guard object once the operation is finished to updates corresponding metrics that track completions.
|
||||
/// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`](Self::call_end) if that
|
||||
/// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`] if that
|
||||
/// is more suitable.
|
||||
/// Never do both.
|
||||
pub(crate) fn call_begin(
|
||||
@@ -1303,7 +1289,7 @@ impl RemoteTimelineClientMetrics {
|
||||
|
||||
/// Manually udpate the metrics that track completions, instead of using the guard object.
|
||||
/// Using the guard object is generally preferable.
|
||||
/// See [`call_begin`](Self::call_begin) for more context.
|
||||
/// See [`call_begin`] for more context.
|
||||
pub(crate) fn call_end(
|
||||
&self,
|
||||
file_kind: &RemoteOpFileKind,
|
||||
@@ -1331,10 +1317,15 @@ impl Drop for RemoteTimelineClientMetrics {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
remote_physical_size_gauge,
|
||||
remote_operation_time,
|
||||
calls_unfinished_gauge,
|
||||
calls_started_hist,
|
||||
bytes_started_counter,
|
||||
bytes_finished_counter,
|
||||
} = self;
|
||||
for ((a, b, c), _) in remote_operation_time.get_mut().unwrap().drain() {
|
||||
let _ = REMOTE_OPERATION_TIME.remove_label_values(&[tenant_id, timeline_id, a, b, c]);
|
||||
}
|
||||
for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() {
|
||||
let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[
|
||||
tenant_id,
|
||||
@@ -1343,6 +1334,14 @@ impl Drop for RemoteTimelineClientMetrics {
|
||||
b,
|
||||
]);
|
||||
}
|
||||
for ((a, b), _) in calls_started_hist.get_mut().unwrap().drain() {
|
||||
let _ = REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST.remove_label_values(&[
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
a,
|
||||
b,
|
||||
]);
|
||||
}
|
||||
for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() {
|
||||
let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[
|
||||
tenant_id,
|
||||
@@ -1424,51 +1423,15 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
|
||||
}
|
||||
|
||||
pub fn preinitialize_metrics() {
|
||||
// Python tests need these and on some we do alerting.
|
||||
//
|
||||
// FIXME(4813): make it so that we have no top level metrics as this fn will easily fall out of
|
||||
// order:
|
||||
// - global metrics reside in a Lazy<PageserverMetrics>
|
||||
// - access via crate::metrics::PS_METRICS.materialized_page_cache_hit.inc()
|
||||
// - could move the statics into TimelineMetrics::new()?
|
||||
// We want to alert on this metric increasing.
|
||||
// Initialize it eagerly, so that our alert rule can distinguish absence of the metric from metric value 0.
|
||||
assert_eq!(UNEXPECTED_ONDEMAND_DOWNLOADS.get(), 0);
|
||||
UNEXPECTED_ONDEMAND_DOWNLOADS.reset();
|
||||
|
||||
// counters
|
||||
[
|
||||
&MATERIALIZED_PAGE_CACHE_HIT,
|
||||
&MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
|
||||
&UNEXPECTED_ONDEMAND_DOWNLOADS,
|
||||
&WALRECEIVER_STARTED_CONNECTIONS,
|
||||
&WALRECEIVER_BROKER_UPDATES,
|
||||
&WALRECEIVER_CANDIDATES_ADDED,
|
||||
&WALRECEIVER_CANDIDATES_REMOVED,
|
||||
]
|
||||
.into_iter()
|
||||
.for_each(|c| {
|
||||
Lazy::force(c);
|
||||
});
|
||||
// Same as above for this metric, but, it's a Vec-type metric for which we don't know all the labels.
|
||||
BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT.reset();
|
||||
|
||||
// countervecs
|
||||
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
|
||||
.into_iter()
|
||||
.for_each(|c| {
|
||||
Lazy::force(c);
|
||||
});
|
||||
|
||||
// gauges
|
||||
WALRECEIVER_ACTIVE_MANAGERS.get();
|
||||
|
||||
// histograms
|
||||
[
|
||||
&READ_NUM_FS_LAYERS,
|
||||
&RECONSTRUCT_TIME,
|
||||
&WAIT_LSN_TIME,
|
||||
&WAL_REDO_TIME,
|
||||
&WAL_REDO_WAIT_TIME,
|
||||
&WAL_REDO_RECORDS_HISTOGRAM,
|
||||
&WAL_REDO_BYTES_HISTOGRAM,
|
||||
]
|
||||
.into_iter()
|
||||
.for_each(|h| {
|
||||
Lazy::force(h);
|
||||
});
|
||||
// Python tests need these.
|
||||
MATERIALIZED_PAGE_CACHE_HIT_DIRECT.get();
|
||||
MATERIALIZED_PAGE_CACHE_HIT.get();
|
||||
}
|
||||
|
||||
@@ -1131,7 +1131,7 @@ impl<'a> DatadirModification<'a> {
|
||||
/// context, breaking the atomicity is OK. If the import is interrupted, the
|
||||
/// whole import fails and the timeline will be deleted anyway.
|
||||
/// (Or to be precise, it will be left behind for debugging purposes and
|
||||
/// ignored, see <https://github.com/neondatabase/neon/pull/1809>)
|
||||
/// ignored, see https://github.com/neondatabase/neon/pull/1809)
|
||||
///
|
||||
/// Note: A consequence of flushing the pending operations is that they
|
||||
/// won't be visible to subsequent operations until `commit`. The function
|
||||
|
||||
@@ -130,25 +130,11 @@ pub static WALRECEIVER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
|
||||
pub static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
|
||||
tokio::runtime::Builder::new_multi_thread()
|
||||
.thread_name("background op worker")
|
||||
// if you change the number of worker threads please change the constant below
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("Failed to create background op runtime")
|
||||
});
|
||||
|
||||
pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy<usize> = Lazy::new(|| {
|
||||
// force init and thus panics
|
||||
let _ = BACKGROUND_RUNTIME.handle();
|
||||
// replicates tokio-1.28.1::loom::sys::num_cpus which is not available publicly
|
||||
// tokio would had already panicked for parsing errors or NotUnicode
|
||||
//
|
||||
// this will be wrong if any of the runtimes gets their worker threads configured to something
|
||||
// else, but that has not been needed in a long time.
|
||||
std::env::var("TOKIO_WORKER_THREADS")
|
||||
.map(|s| s.parse::<usize>().unwrap())
|
||||
.unwrap_or_else(|_e| usize::max(1, num_cpus::get()))
|
||||
});
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct PageserverTaskId(u64);
|
||||
|
||||
@@ -219,7 +205,7 @@ pub enum TaskKind {
|
||||
///
|
||||
/// Walreceiver uses its own abstraction called `TaskHandle` to represent the activity of establishing and handling a connection.
|
||||
/// That abstraction doesn't use `task_mgr`.
|
||||
/// The `WalReceiverManager` task ensures that this `TaskHandle` task does not outlive the `WalReceiverManager` task.
|
||||
/// The [`WalReceiverManager`] task ensures that this `TaskHandle` task does not outlive the [`WalReceiverManager`] task.
|
||||
/// For the `RequestContext` that we hand to the TaskHandle, we use the [`WalReceiverConnectionHandler`] task kind.
|
||||
///
|
||||
/// Once the connection is established, the `TaskHandle` task creates a
|
||||
@@ -227,21 +213,16 @@ pub enum TaskKind {
|
||||
/// the `Connection` object.
|
||||
/// A `CancellationToken` created by the `TaskHandle` task ensures
|
||||
/// that the [`WalReceiverConnectionPoller`] task will cancel soon after as the `TaskHandle` is dropped.
|
||||
///
|
||||
/// [`WalReceiverConnectionHandler`]: Self::WalReceiverConnectionHandler
|
||||
/// [`WalReceiverConnectionPoller`]: Self::WalReceiverConnectionPoller
|
||||
WalReceiverManager,
|
||||
|
||||
/// The `TaskHandle` task that executes `handle_walreceiver_connection`.
|
||||
/// The `TaskHandle` task that executes [`walreceiver_connection::handle_walreceiver_connection`].
|
||||
/// Not a `task_mgr` task, but we use this `TaskKind` for its `RequestContext`.
|
||||
/// See the comment on [`WalReceiverManager`].
|
||||
///
|
||||
/// [`WalReceiverManager`]: Self::WalReceiverManager
|
||||
WalReceiverConnectionHandler,
|
||||
|
||||
/// The task that polls the `tokio-postgres::Connection` object.
|
||||
/// Spawned by task [`WalReceiverConnectionHandler`](Self::WalReceiverConnectionHandler).
|
||||
/// See the comment on [`WalReceiverManager`](Self::WalReceiverManager).
|
||||
/// Spawned by task [`WalReceiverConnectionHandler`].
|
||||
/// See the comment on [`WalReceiverManager`].
|
||||
WalReceiverConnectionPoller,
|
||||
|
||||
// Garbage collection worker. One per tenant
|
||||
@@ -525,13 +506,17 @@ pub async fn shutdown_tasks(
|
||||
warn!(name = task.name, tenant_id = ?tenant_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
|
||||
}
|
||||
}
|
||||
if tokio::time::timeout(std::time::Duration::from_secs(1), &mut join_handle)
|
||||
.await
|
||||
.is_err()
|
||||
{
|
||||
// allow some time to elapse before logging to cut down the number of log
|
||||
// lines.
|
||||
info!("waiting for {} to shut down", task.name);
|
||||
let join_handle = tokio::select! {
|
||||
biased;
|
||||
_ = &mut join_handle => { None },
|
||||
_ = tokio::time::sleep(std::time::Duration::from_secs(1)) => {
|
||||
// allow some time to elapse before logging to cut down the number of log
|
||||
// lines.
|
||||
info!("waiting for {} to shut down", task.name);
|
||||
Some(join_handle)
|
||||
}
|
||||
};
|
||||
if let Some(join_handle) = join_handle {
|
||||
// we never handled this return value, but:
|
||||
// - we don't deschedule which would lead to is_cancelled
|
||||
// - panics are already logged (is_panicked)
|
||||
@@ -559,7 +544,7 @@ pub fn current_task_id() -> Option<PageserverTaskId> {
|
||||
pub async fn shutdown_watcher() {
|
||||
let token = SHUTDOWN_TOKEN
|
||||
.try_with(|t| t.clone())
|
||||
.expect("shutdown_watcher() called in an unexpected task or thread");
|
||||
.expect("shutdown_requested() called in an unexpected task or thread");
|
||||
|
||||
token.cancelled().await;
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -16,20 +16,30 @@ use crate::tenant::block_io::{BlockCursor, BlockReader};
|
||||
use std::cmp::min;
|
||||
use std::io::{Error, ErrorKind};
|
||||
|
||||
impl<R> BlockCursor<R>
|
||||
where
|
||||
R: BlockReader,
|
||||
{
|
||||
/// For reading
|
||||
pub trait BlobCursor {
|
||||
/// Read a blob into a new buffer.
|
||||
pub fn read_blob(&self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
|
||||
fn read_blob(&mut self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
|
||||
let mut buf = Vec::new();
|
||||
self.read_blob_into_buf(offset, &mut buf)?;
|
||||
Ok(buf)
|
||||
}
|
||||
|
||||
/// Read blob into the given buffer. Any previous contents in the buffer
|
||||
/// are overwritten.
|
||||
pub fn read_blob_into_buf(
|
||||
&self,
|
||||
fn read_blob_into_buf(
|
||||
&mut self,
|
||||
offset: u64,
|
||||
dstbuf: &mut Vec<u8>,
|
||||
) -> Result<(), std::io::Error>;
|
||||
}
|
||||
|
||||
impl<R> BlobCursor for BlockCursor<R>
|
||||
where
|
||||
R: BlockReader,
|
||||
{
|
||||
fn read_blob_into_buf(
|
||||
&mut self,
|
||||
offset: u64,
|
||||
dstbuf: &mut Vec<u8>,
|
||||
) -> Result<(), std::io::Error> {
|
||||
|
||||
@@ -80,7 +80,7 @@ where
|
||||
BlockCursor { reader }
|
||||
}
|
||||
|
||||
pub fn read_blk(&self, blknum: u32) -> Result<R::BlockLease, std::io::Error> {
|
||||
pub fn read_blk(&mut self, blknum: u32) -> Result<R::BlockLease, std::io::Error> {
|
||||
self.reader.read_blk(blknum)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -230,15 +230,14 @@ where
|
||||
///
|
||||
/// Read the value for given key. Returns the value, or None if it doesn't exist.
|
||||
///
|
||||
pub async fn get(&self, search_key: &[u8; L]) -> Result<Option<u64>> {
|
||||
pub fn get(&self, search_key: &[u8; L]) -> Result<Option<u64>> {
|
||||
let mut result: Option<u64> = None;
|
||||
self.visit(search_key, VisitDirection::Forwards, |key, value| {
|
||||
if key == search_key {
|
||||
result = Some(value);
|
||||
}
|
||||
false
|
||||
})
|
||||
.await?;
|
||||
})?;
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
@@ -247,7 +246,7 @@ where
|
||||
/// will be called for every key >= 'search_key' (or <= 'search_key', if scanning
|
||||
/// backwards)
|
||||
///
|
||||
pub async fn visit<V>(
|
||||
pub fn visit<V>(
|
||||
&self,
|
||||
search_key: &[u8; L],
|
||||
dir: VisitDirection,
|
||||
@@ -270,9 +269,23 @@ where
|
||||
V: FnMut(&[u8], u64) -> bool,
|
||||
{
|
||||
// Locate the node.
|
||||
let node_buf = self.reader.read_blk(self.start_blk + node_blknum)?;
|
||||
let blk = self.reader.read_blk(self.start_blk + node_blknum)?;
|
||||
|
||||
let node = OnDiskNode::deparse(node_buf.as_ref())?;
|
||||
// Search all entries on this node
|
||||
self.search_node(blk.as_ref(), search_key, dir, visitor)
|
||||
}
|
||||
|
||||
fn search_node<V>(
|
||||
&self,
|
||||
node_buf: &[u8],
|
||||
search_key: &[u8; L],
|
||||
dir: VisitDirection,
|
||||
visitor: &mut V,
|
||||
) -> Result<bool>
|
||||
where
|
||||
V: FnMut(&[u8], u64) -> bool,
|
||||
{
|
||||
let node = OnDiskNode::deparse(node_buf)?;
|
||||
let prefix_len = node.prefix_len as usize;
|
||||
let suffix_len = node.suffix_len as usize;
|
||||
|
||||
@@ -377,42 +390,39 @@ where
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub async fn dump(&self) -> Result<()> {
|
||||
let mut stack = Vec::new();
|
||||
pub fn dump(&self) -> Result<()> {
|
||||
self.dump_recurse(self.root_blk, &[], 0)
|
||||
}
|
||||
|
||||
stack.push((self.root_blk, String::new(), 0, 0, 0));
|
||||
fn dump_recurse(&self, blknum: u32, path: &[u8], depth: usize) -> Result<()> {
|
||||
let blk = self.reader.read_blk(self.start_blk + blknum)?;
|
||||
let buf: &[u8] = blk.as_ref();
|
||||
|
||||
while let Some((blknum, path, depth, child_idx, key_off)) = stack.pop() {
|
||||
let blk = self.reader.read_blk(self.start_blk + blknum)?;
|
||||
let buf: &[u8] = blk.as_ref();
|
||||
let node = OnDiskNode::<L>::deparse(buf)?;
|
||||
let node = OnDiskNode::<L>::deparse(buf)?;
|
||||
|
||||
if child_idx == 0 {
|
||||
print!("{:indent$}", "", indent = depth * 2);
|
||||
let path_prefix = stack
|
||||
.iter()
|
||||
.map(|(_blknum, path, ..)| path.as_str())
|
||||
.collect::<String>();
|
||||
println!(
|
||||
"blk #{blknum}: path {path_prefix}{path}: prefix {}, suffix_len {}",
|
||||
hex::encode(node.prefix),
|
||||
node.suffix_len
|
||||
);
|
||||
}
|
||||
print!("{:indent$}", "", indent = depth * 2);
|
||||
println!(
|
||||
"blk #{}: path {}: prefix {}, suffix_len {}",
|
||||
blknum,
|
||||
hex::encode(path),
|
||||
hex::encode(node.prefix),
|
||||
node.suffix_len
|
||||
);
|
||||
|
||||
if child_idx + 1 < node.num_children {
|
||||
let key_off = key_off + node.suffix_len as usize;
|
||||
stack.push((blknum, path.clone(), depth, child_idx + 1, key_off));
|
||||
}
|
||||
let mut idx = 0;
|
||||
let mut key_off = 0;
|
||||
while idx < node.num_children {
|
||||
let key = &node.keys[key_off..key_off + node.suffix_len as usize];
|
||||
let val = node.value(child_idx as usize);
|
||||
|
||||
let val = node.value(idx as usize);
|
||||
print!("{:indent$}", "", indent = depth * 2 + 2);
|
||||
println!("{}: {}", hex::encode(key), hex::encode(val.0));
|
||||
|
||||
if node.level > 0 {
|
||||
stack.push((val.to_blknum(), hex::encode(node.prefix), depth + 1, 0, 0));
|
||||
let child_path = [path, node.prefix].concat();
|
||||
self.dump_recurse(val.to_blknum(), &child_path, depth + 1)?;
|
||||
}
|
||||
idx += 1;
|
||||
key_off += node.suffix_len as usize;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -432,7 +442,7 @@ where
|
||||
writer: W,
|
||||
|
||||
///
|
||||
/// `stack[0]` is the current root page, `stack.last()` is the leaf.
|
||||
/// stack[0] is the current root page, stack.last() is the leaf.
|
||||
///
|
||||
/// We maintain the length of the stack to be always greater than zero.
|
||||
/// Two exceptions are:
|
||||
@@ -744,8 +754,8 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn basic() -> Result<()> {
|
||||
#[test]
|
||||
fn basic() -> Result<()> {
|
||||
let mut disk = TestDisk::new();
|
||||
let mut writer = DiskBtreeBuilder::<_, 6>::new(&mut disk);
|
||||
|
||||
@@ -765,16 +775,16 @@ mod tests {
|
||||
|
||||
let reader = DiskBtreeReader::new(0, root_offset, disk);
|
||||
|
||||
reader.dump().await?;
|
||||
reader.dump()?;
|
||||
|
||||
// Test the `get` function on all the keys.
|
||||
for (key, val) in all_data.iter() {
|
||||
assert_eq!(reader.get(key).await?, Some(*val));
|
||||
assert_eq!(reader.get(key)?, Some(*val));
|
||||
}
|
||||
// And on some keys that don't exist
|
||||
assert_eq!(reader.get(b"aaaaaa").await?, None);
|
||||
assert_eq!(reader.get(b"zzzzzz").await?, None);
|
||||
assert_eq!(reader.get(b"xaaabx").await?, None);
|
||||
assert_eq!(reader.get(b"aaaaaa")?, None);
|
||||
assert_eq!(reader.get(b"zzzzzz")?, None);
|
||||
assert_eq!(reader.get(b"xaaabx")?, None);
|
||||
|
||||
// Test search with `visit` function
|
||||
let search_key = b"xabaaa";
|
||||
@@ -785,12 +795,10 @@ mod tests {
|
||||
.collect();
|
||||
|
||||
let mut data = Vec::new();
|
||||
reader
|
||||
.visit(search_key, VisitDirection::Forwards, |key, value| {
|
||||
data.push((key.to_vec(), value));
|
||||
true
|
||||
})
|
||||
.await?;
|
||||
reader.visit(search_key, VisitDirection::Forwards, |key, value| {
|
||||
data.push((key.to_vec(), value));
|
||||
true
|
||||
})?;
|
||||
assert_eq!(data, expected);
|
||||
|
||||
// Test a backwards scan
|
||||
@@ -801,20 +809,16 @@ mod tests {
|
||||
.collect();
|
||||
expected.reverse();
|
||||
let mut data = Vec::new();
|
||||
reader
|
||||
.visit(search_key, VisitDirection::Backwards, |key, value| {
|
||||
data.push((key.to_vec(), value));
|
||||
true
|
||||
})
|
||||
.await?;
|
||||
reader.visit(search_key, VisitDirection::Backwards, |key, value| {
|
||||
data.push((key.to_vec(), value));
|
||||
true
|
||||
})?;
|
||||
assert_eq!(data, expected);
|
||||
|
||||
// Backward scan where nothing matches
|
||||
reader
|
||||
.visit(b"aaaaaa", VisitDirection::Backwards, |key, value| {
|
||||
panic!("found unexpected key {}: {}", hex::encode(key), value);
|
||||
})
|
||||
.await?;
|
||||
reader.visit(b"aaaaaa", VisitDirection::Backwards, |key, value| {
|
||||
panic!("found unexpected key {}: {}", hex::encode(key), value);
|
||||
})?;
|
||||
|
||||
// Full scan
|
||||
let expected: Vec<(Vec<u8>, u64)> = all_data
|
||||
@@ -822,19 +826,17 @@ mod tests {
|
||||
.map(|(key, value)| (key.to_vec(), *value))
|
||||
.collect();
|
||||
let mut data = Vec::new();
|
||||
reader
|
||||
.visit(&[0u8; 6], VisitDirection::Forwards, |key, value| {
|
||||
data.push((key.to_vec(), value));
|
||||
true
|
||||
})
|
||||
.await?;
|
||||
reader.visit(&[0u8; 6], VisitDirection::Forwards, |key, value| {
|
||||
data.push((key.to_vec(), value));
|
||||
true
|
||||
})?;
|
||||
assert_eq!(data, expected);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn lots_of_keys() -> Result<()> {
|
||||
#[test]
|
||||
fn lots_of_keys() -> Result<()> {
|
||||
let mut disk = TestDisk::new();
|
||||
let mut writer = DiskBtreeBuilder::<_, 8>::new(&mut disk);
|
||||
|
||||
@@ -854,7 +856,7 @@ mod tests {
|
||||
|
||||
let reader = DiskBtreeReader::new(0, root_offset, disk);
|
||||
|
||||
reader.dump().await?;
|
||||
reader.dump()?;
|
||||
|
||||
use std::sync::Mutex;
|
||||
|
||||
@@ -875,15 +877,13 @@ mod tests {
|
||||
for search_key_int in 0..(NUM_KEYS * 2 + 10) {
|
||||
let search_key = u64::to_be_bytes(search_key_int);
|
||||
assert_eq!(
|
||||
reader.get(&search_key).await?,
|
||||
reader.get(&search_key)?,
|
||||
all_data.get(&search_key_int).cloned()
|
||||
);
|
||||
|
||||
// Test a forward scan starting with this key
|
||||
result.lock().unwrap().clear();
|
||||
reader
|
||||
.visit(&search_key, VisitDirection::Forwards, take_ten)
|
||||
.await?;
|
||||
reader.visit(&search_key, VisitDirection::Forwards, take_ten)?;
|
||||
let expected = all_data
|
||||
.range(search_key_int..)
|
||||
.take(10)
|
||||
@@ -893,9 +893,7 @@ mod tests {
|
||||
|
||||
// And a backwards scan
|
||||
result.lock().unwrap().clear();
|
||||
reader
|
||||
.visit(&search_key, VisitDirection::Backwards, take_ten)
|
||||
.await?;
|
||||
reader.visit(&search_key, VisitDirection::Backwards, take_ten)?;
|
||||
let expected = all_data
|
||||
.range(..=search_key_int)
|
||||
.rev()
|
||||
@@ -909,9 +907,7 @@ mod tests {
|
||||
let search_key = u64::to_be_bytes(0);
|
||||
limit.store(usize::MAX, Ordering::Relaxed);
|
||||
result.lock().unwrap().clear();
|
||||
reader
|
||||
.visit(&search_key, VisitDirection::Forwards, take_ten)
|
||||
.await?;
|
||||
reader.visit(&search_key, VisitDirection::Forwards, take_ten)?;
|
||||
let expected = all_data
|
||||
.iter()
|
||||
.map(|(&key, &val)| (key, val))
|
||||
@@ -922,9 +918,7 @@ mod tests {
|
||||
let search_key = u64::to_be_bytes(u64::MAX);
|
||||
limit.store(usize::MAX, Ordering::Relaxed);
|
||||
result.lock().unwrap().clear();
|
||||
reader
|
||||
.visit(&search_key, VisitDirection::Backwards, take_ten)
|
||||
.await?;
|
||||
reader.visit(&search_key, VisitDirection::Backwards, take_ten)?;
|
||||
let expected = all_data
|
||||
.iter()
|
||||
.rev()
|
||||
@@ -935,8 +929,8 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn random_data() -> Result<()> {
|
||||
#[test]
|
||||
fn random_data() -> Result<()> {
|
||||
// Generate random keys with exponential distribution, to
|
||||
// exercise the prefix compression
|
||||
const NUM_KEYS: usize = 100000;
|
||||
@@ -963,23 +957,19 @@ mod tests {
|
||||
// Test get() operation on all the keys
|
||||
for (&key, &val) in all_data.iter() {
|
||||
let search_key = u128::to_be_bytes(key);
|
||||
assert_eq!(reader.get(&search_key).await?, Some(val));
|
||||
assert_eq!(reader.get(&search_key)?, Some(val));
|
||||
}
|
||||
|
||||
// Test get() operations on random keys, most of which will not exist
|
||||
for _ in 0..100000 {
|
||||
let key_int = rand::thread_rng().gen::<u128>();
|
||||
let search_key = u128::to_be_bytes(key_int);
|
||||
assert!(reader.get(&search_key).await? == all_data.get(&key_int).cloned());
|
||||
assert!(reader.get(&search_key)? == all_data.get(&key_int).cloned());
|
||||
}
|
||||
|
||||
// Test boundary cases
|
||||
assert!(
|
||||
reader.get(&u128::to_be_bytes(u128::MIN)).await? == all_data.get(&u128::MIN).cloned()
|
||||
);
|
||||
assert!(
|
||||
reader.get(&u128::to_be_bytes(u128::MAX)).await? == all_data.get(&u128::MAX).cloned()
|
||||
);
|
||||
assert!(reader.get(&u128::to_be_bytes(u128::MIN))? == all_data.get(&u128::MIN).cloned());
|
||||
assert!(reader.get(&u128::to_be_bytes(u128::MAX))? == all_data.get(&u128::MAX).cloned());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1004,8 +994,8 @@ mod tests {
|
||||
///
|
||||
/// This test contains a particular data set, see disk_btree_test_data.rs
|
||||
///
|
||||
#[tokio::test]
|
||||
async fn particular_data() -> Result<()> {
|
||||
#[test]
|
||||
fn particular_data() -> Result<()> {
|
||||
// Build a tree from it
|
||||
let mut disk = TestDisk::new();
|
||||
let mut writer = DiskBtreeBuilder::<_, 26>::new(&mut disk);
|
||||
@@ -1021,20 +1011,18 @@ mod tests {
|
||||
|
||||
// Test get() operation on all the keys
|
||||
for (key, val) in disk_btree_test_data::TEST_DATA {
|
||||
assert_eq!(reader.get(&key).await?, Some(val));
|
||||
assert_eq!(reader.get(&key)?, Some(val));
|
||||
}
|
||||
|
||||
// Test full scan
|
||||
let mut count = 0;
|
||||
reader
|
||||
.visit(&[0u8; 26], VisitDirection::Forwards, |_key, _value| {
|
||||
count += 1;
|
||||
true
|
||||
})
|
||||
.await?;
|
||||
reader.visit(&[0u8; 26], VisitDirection::Forwards, |_key, _value| {
|
||||
count += 1;
|
||||
true
|
||||
})?;
|
||||
assert_eq!(count, disk_btree_test_data::TEST_DATA.len());
|
||||
|
||||
reader.dump().await?;
|
||||
reader.dump()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -266,17 +266,11 @@ impl Drop for EphemeralFile {
|
||||
// unlink the file
|
||||
let res = std::fs::remove_file(&self.file.path);
|
||||
if let Err(e) = res {
|
||||
if e.kind() != std::io::ErrorKind::NotFound {
|
||||
// just never log the not found errors, we cannot do anything for them; on detach
|
||||
// the tenant directory is already gone.
|
||||
//
|
||||
// not found files might also be related to https://github.com/neondatabase/neon/issues/2442
|
||||
error!(
|
||||
"could not remove ephemeral file '{}': {}",
|
||||
self.file.path.display(),
|
||||
e
|
||||
);
|
||||
}
|
||||
warn!(
|
||||
"could not remove ephemeral file '{}': {}",
|
||||
self.file.path.display(),
|
||||
e
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -334,7 +328,7 @@ fn to_io_error(e: anyhow::Error, context: &str) -> io::Error {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::blob_io::{BlobCursor, BlobWriter};
|
||||
use crate::tenant::block_io::BlockCursor;
|
||||
use rand::{seq::SliceRandom, thread_rng, RngCore};
|
||||
use std::fs;
|
||||
@@ -426,7 +420,7 @@ mod tests {
|
||||
blobs.push((pos, data));
|
||||
}
|
||||
|
||||
let cursor = BlockCursor::new(&file);
|
||||
let mut cursor = BlockCursor::new(&file);
|
||||
for (pos, expected) in blobs {
|
||||
let actual = cursor.read_blob(pos)?;
|
||||
assert_eq!(actual, expected);
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
//! Other read methods are less critical but still impact performance of background tasks.
|
||||
//!
|
||||
//! This data structure relies on a persistent/immutable binary search tree. See the
|
||||
//! following lecture for an introduction <https://www.youtube.com/watch?v=WqCWghETNDc&t=581s>
|
||||
//! following lecture for an introduction https://www.youtube.com/watch?v=WqCWghETNDc&t=581s
|
||||
//! Summary: A persistent/immutable BST (and persistent data structures in general) allows
|
||||
//! you to modify the tree in such a way that each modification creates a new "version"
|
||||
//! of the tree. When you modify it, you get a new version, but all previous versions are
|
||||
@@ -40,7 +40,7 @@
|
||||
//! afterwards. We can add layers as long as they have larger LSNs than any previous layer in
|
||||
//! the map, but if we need to remove a layer, or insert anything with an older LSN, we need
|
||||
//! to throw away most of the persistent BST and build a new one, starting from the oldest
|
||||
//! LSN. See [`LayerMap::flush_updates()`].
|
||||
//! LSN. See `LayerMap::flush_updates()`.
|
||||
//!
|
||||
|
||||
mod historic_layer_coverage;
|
||||
@@ -121,7 +121,7 @@ impl BatchedUpdates<'_> {
|
||||
///
|
||||
/// This should be called when the corresponding file on disk has been deleted.
|
||||
///
|
||||
pub fn remove_historic(&mut self, layer_desc: &PersistentLayerDesc) {
|
||||
pub fn remove_historic(&mut self, layer_desc: PersistentLayerDesc) {
|
||||
self.layer_map.remove_historic_noflush(layer_desc)
|
||||
}
|
||||
|
||||
@@ -253,11 +253,11 @@ impl LayerMap {
|
||||
///
|
||||
/// Helper function for BatchedUpdates::remove_historic
|
||||
///
|
||||
pub fn remove_historic_noflush(&mut self, layer_desc: &PersistentLayerDesc) {
|
||||
pub fn remove_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
|
||||
self.historic
|
||||
.remove(historic_layer_coverage::LayerKey::from(layer_desc));
|
||||
.remove(historic_layer_coverage::LayerKey::from(&layer_desc));
|
||||
let layer_key = layer_desc.key();
|
||||
if Self::is_l0(layer_desc) {
|
||||
if Self::is_l0(&layer_desc) {
|
||||
let len_before = self.l0_delta_layers.len();
|
||||
let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
|
||||
l0_delta_layers.retain(|other| other.key() != layer_key);
|
||||
@@ -626,17 +626,17 @@ impl LayerMap {
|
||||
|
||||
/// debugging function to print out the contents of the layer map
|
||||
#[allow(unused)]
|
||||
pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
pub fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
println!("Begin dump LayerMap");
|
||||
|
||||
println!("open_layer:");
|
||||
if let Some(open_layer) = &self.open_layer {
|
||||
open_layer.dump(verbose, ctx).await?;
|
||||
open_layer.dump(verbose, ctx)?;
|
||||
}
|
||||
|
||||
println!("frozen_layers:");
|
||||
for frozen_layer in self.frozen_layers.iter() {
|
||||
frozen_layer.dump(verbose, ctx).await?;
|
||||
frozen_layer.dump(verbose, ctx)?;
|
||||
}
|
||||
|
||||
println!("historic_layers:");
|
||||
@@ -766,7 +766,8 @@ mod tests {
|
||||
expected_in_counts
|
||||
);
|
||||
|
||||
map.batch_update().remove_historic(downloaded.layer_desc());
|
||||
map.batch_update()
|
||||
.remove_historic(downloaded.layer_desc().clone());
|
||||
assert_eq!(count_layer_in(&map, downloaded.layer_desc()), (0, 0));
|
||||
}
|
||||
|
||||
|
||||
@@ -122,7 +122,8 @@ impl<Value: Clone> HistoricLayerCoverage<Value> {
|
||||
self.head = self
|
||||
.historic
|
||||
.iter()
|
||||
.next_back()
|
||||
.rev()
|
||||
.next()
|
||||
.map(|(_, v)| v.clone())
|
||||
.unwrap_or_default();
|
||||
}
|
||||
@@ -411,7 +412,7 @@ fn test_persistent_overlapping() {
|
||||
/// still be more critical.
|
||||
///
|
||||
/// See this for more on persistent and retroactive techniques:
|
||||
/// <https://www.youtube.com/watch?v=WqCWghETNDc&t=581s>
|
||||
/// https://www.youtube.com/watch?v=WqCWghETNDc&t=581s
|
||||
pub struct BufferedHistoricLayerCoverage<Value> {
|
||||
/// A persistent layer map that we rebuild when we need to retroactively update
|
||||
historic_coverage: HistoricLayerCoverage<Value>,
|
||||
|
||||
@@ -2,7 +2,7 @@ use std::ops::Range;
|
||||
|
||||
// NOTE the `im` crate has 20x more downloads and also has
|
||||
// persistent/immutable BTree. But it's bugged so rpds is a
|
||||
// better choice <https://github.com/neondatabase/neon/issues/3395>
|
||||
// better choice https://github.com/neondatabase/neon/issues/3395
|
||||
use rpds::RedBlackTreeMapSync;
|
||||
|
||||
/// Data structure that can efficiently:
|
||||
@@ -11,7 +11,7 @@ use rpds::RedBlackTreeMapSync;
|
||||
/// - insert layers in non-decreasing lsn.start order
|
||||
///
|
||||
/// For a detailed explanation and justification of this approach, see:
|
||||
/// <https://neon.tech/blog/persistent-structures-in-neons-wal-indexing>
|
||||
/// https://neon.tech/blog/persistent-structures-in-neons-wal-indexing
|
||||
///
|
||||
/// NOTE The struct is parameterized over Value for easier
|
||||
/// testing, but in practice it's some sort of layer.
|
||||
@@ -113,7 +113,8 @@ impl<Value: Clone> LayerCoverage<Value> {
|
||||
pub fn query(&self, key: i128) -> Option<Value> {
|
||||
self.nodes
|
||||
.range(..=key)
|
||||
.next_back()?
|
||||
.rev()
|
||||
.next()?
|
||||
.1
|
||||
.as_ref()
|
||||
.map(|(_, v)| v.clone())
|
||||
|
||||
@@ -24,7 +24,7 @@
|
||||
//! Currently, this is not used in the system. Future refactors will ensure
|
||||
//! the storage state will be recorded in this file, and the system can be
|
||||
//! recovered from this file. This is tracked in
|
||||
//! <https://github.com/neondatabase/neon/issues/4418>
|
||||
//! https://github.com/neondatabase/neon/issues/4418
|
||||
|
||||
use std::io::{self, Read, Write};
|
||||
|
||||
|
||||
@@ -1,19 +1,16 @@
|
||||
//! Every image of a certain timeline from [`crate::tenant::Tenant`]
|
||||
//! has a metadata that needs to be stored persistently.
|
||||
//!
|
||||
//! Later, the file gets used in [`remote_timeline_client`] as a part of
|
||||
//! Later, the file gets is used in [`crate::remote_storage::storage_sync`] as a part of
|
||||
//! external storage import and export operations.
|
||||
//!
|
||||
//! The module contains all structs and related helper methods related to timeline metadata.
|
||||
//!
|
||||
//! [`remote_timeline_client`]: super::remote_timeline_client
|
||||
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::io::{self, Write};
|
||||
use std::io::Write;
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use thiserror::Error;
|
||||
use tracing::info_span;
|
||||
use utils::bin_ser::SerializeError;
|
||||
use utils::{
|
||||
@@ -268,24 +265,24 @@ pub fn save_metadata(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum LoadMetadataError {
|
||||
#[error(transparent)]
|
||||
Read(#[from] io::Error),
|
||||
|
||||
#[error(transparent)]
|
||||
Decode(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
pub fn load_metadata(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
) -> Result<TimelineMetadata, LoadMetadataError> {
|
||||
) -> anyhow::Result<TimelineMetadata> {
|
||||
let metadata_path = conf.metadata_path(tenant_id, timeline_id);
|
||||
let metadata_bytes = std::fs::read(metadata_path)?;
|
||||
|
||||
Ok(TimelineMetadata::from_bytes(&metadata_bytes)?)
|
||||
let metadata_bytes = std::fs::read(&metadata_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to read metadata bytes from path {}",
|
||||
metadata_path.display()
|
||||
)
|
||||
})?;
|
||||
TimelineMetadata::from_bytes(&metadata_bytes).with_context(|| {
|
||||
format!(
|
||||
"Failed to parse metadata bytes from path {}",
|
||||
metadata_path.display()
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -26,8 +26,6 @@ use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};
|
||||
use utils::fs_ext::PathExt;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use super::timeline::delete::DeleteTimelineFlow;
|
||||
|
||||
/// The tenants known to the pageserver.
|
||||
/// The enum variants are used to distinguish the different states that the pageserver can be in.
|
||||
enum TenantsMap {
|
||||
@@ -235,17 +233,11 @@ pub fn schedule_local_tenant_processing(
|
||||
/// That could be easily misinterpreted by control plane, the consumer of the
|
||||
/// management API. For example, it could attach the tenant on a different pageserver.
|
||||
/// We would then be in split-brain once this pageserver restarts.
|
||||
#[instrument(skip_all)]
|
||||
#[instrument]
|
||||
pub async fn shutdown_all_tenants() {
|
||||
shutdown_all_tenants0(&TENANTS).await
|
||||
}
|
||||
|
||||
async fn shutdown_all_tenants0(tenants: &tokio::sync::RwLock<TenantsMap>) {
|
||||
use utils::completion;
|
||||
|
||||
// Prevent new tenants from being created.
|
||||
let tenants_to_shut_down = {
|
||||
let mut m = tenants.write().await;
|
||||
let mut m = TENANTS.write().await;
|
||||
match &mut *m {
|
||||
TenantsMap::Initializing => {
|
||||
*m = TenantsMap::ShuttingDown(HashMap::default());
|
||||
@@ -266,77 +258,44 @@ async fn shutdown_all_tenants0(tenants: &tokio::sync::RwLock<TenantsMap>) {
|
||||
}
|
||||
};
|
||||
|
||||
let started_at = std::time::Instant::now();
|
||||
let mut join_set = JoinSet::new();
|
||||
for (tenant_id, tenant) in tenants_to_shut_down {
|
||||
join_set.spawn(
|
||||
async move {
|
||||
let freeze_and_flush = true;
|
||||
|
||||
let res = {
|
||||
let (_guard, shutdown_progress) = completion::channel();
|
||||
tenant.shutdown(shutdown_progress, freeze_and_flush).await
|
||||
};
|
||||
|
||||
if let Err(other_progress) = res {
|
||||
// join the another shutdown in progress
|
||||
other_progress.wait().await;
|
||||
match tenant.shutdown(freeze_and_flush).await {
|
||||
Ok(()) => debug!("tenant successfully stopped"),
|
||||
Err(super::ShutdownError::AlreadyStopping) => {
|
||||
warn!("tenant was already shutting down")
|
||||
}
|
||||
}
|
||||
|
||||
// we cannot afford per tenant logging here, because if s3 is degraded, we are
|
||||
// going to log too many lines
|
||||
|
||||
debug!("tenant successfully stopped");
|
||||
}
|
||||
.instrument(info_span!("shutdown", %tenant_id)),
|
||||
);
|
||||
}
|
||||
|
||||
let total = join_set.len();
|
||||
let mut panicked = 0;
|
||||
let mut buffering = true;
|
||||
const BUFFER_FOR: std::time::Duration = std::time::Duration::from_millis(500);
|
||||
let mut buffered = std::pin::pin!(tokio::time::sleep(BUFFER_FOR));
|
||||
|
||||
while !join_set.is_empty() {
|
||||
tokio::select! {
|
||||
Some(joined) = join_set.join_next() => {
|
||||
match joined {
|
||||
Ok(()) => {}
|
||||
Err(join_error) if join_error.is_cancelled() => {
|
||||
unreachable!("we are not cancelling any of the futures");
|
||||
}
|
||||
Err(join_error) if join_error.is_panic() => {
|
||||
// cannot really do anything, as this panic is likely a bug
|
||||
panicked += 1;
|
||||
}
|
||||
Err(join_error) => {
|
||||
warn!("unknown kind of JoinError: {join_error}");
|
||||
}
|
||||
}
|
||||
if !buffering {
|
||||
// buffer so that every 500ms since the first update (or starting) we'll log
|
||||
// how far away we are; this is because we will get SIGKILL'd at 10s, and we
|
||||
// are not able to log *then*.
|
||||
buffering = true;
|
||||
buffered.as_mut().reset(tokio::time::Instant::now() + BUFFER_FOR);
|
||||
}
|
||||
},
|
||||
_ = &mut buffered, if buffering => {
|
||||
buffering = false;
|
||||
info!(remaining = join_set.len(), total, elapsed_ms = started_at.elapsed().as_millis(), "waiting for tenants to shutdown");
|
||||
while let Some(res) = join_set.join_next().await {
|
||||
match res {
|
||||
Ok(()) => {}
|
||||
Err(join_error) if join_error.is_cancelled() => {
|
||||
unreachable!("we are not cancelling any of the futures");
|
||||
}
|
||||
Err(join_error) if join_error.is_panic() => {
|
||||
// cannot really do anything, as this panic is likely a bug
|
||||
panicked += 1;
|
||||
}
|
||||
Err(join_error) => {
|
||||
warn!("unknown kind of JoinError: {join_error}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if panicked > 0 {
|
||||
warn!(
|
||||
panicked,
|
||||
total, "observed panicks while shutting down tenants"
|
||||
);
|
||||
warn!(panicked, "observed panicks while shutting down tenants");
|
||||
}
|
||||
|
||||
// caller will log how long we took
|
||||
}
|
||||
|
||||
pub async fn create_tenant(
|
||||
@@ -429,10 +388,12 @@ pub enum DeleteTimelineError {
|
||||
pub async fn delete_timeline(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
_ctx: &RequestContext,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), DeleteTimelineError> {
|
||||
let tenant = get_tenant(tenant_id, true).await?;
|
||||
DeleteTimelineFlow::run(&tenant, timeline_id).await?;
|
||||
tenant
|
||||
.prepare_and_schedule_delete_timeline(timeline_id, ctx)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -452,15 +413,6 @@ pub async fn detach_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
detach_ignored: bool,
|
||||
) -> Result<(), TenantStateError> {
|
||||
detach_tenant0(conf, &TENANTS, tenant_id, detach_ignored).await
|
||||
}
|
||||
|
||||
async fn detach_tenant0(
|
||||
conf: &'static PageServerConf,
|
||||
tenants: &tokio::sync::RwLock<TenantsMap>,
|
||||
tenant_id: TenantId,
|
||||
detach_ignored: bool,
|
||||
) -> Result<(), TenantStateError> {
|
||||
let local_files_cleanup_operation = |tenant_id_to_clean| async move {
|
||||
let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
|
||||
@@ -473,8 +425,7 @@ async fn detach_tenant0(
|
||||
};
|
||||
|
||||
let removal_result =
|
||||
remove_tenant_from_memory(tenants, tenant_id, local_files_cleanup_operation(tenant_id))
|
||||
.await;
|
||||
remove_tenant_from_memory(tenant_id, local_files_cleanup_operation(tenant_id)).await;
|
||||
|
||||
// Ignored tenants are not present in memory and will bail the removal from memory operation.
|
||||
// Before returning the error, check for ignored tenant removal case — we only need to clean its local files then.
|
||||
@@ -521,15 +472,7 @@ pub async fn ignore_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<(), TenantStateError> {
|
||||
ignore_tenant0(conf, &TENANTS, tenant_id).await
|
||||
}
|
||||
|
||||
async fn ignore_tenant0(
|
||||
conf: &'static PageServerConf,
|
||||
tenants: &tokio::sync::RwLock<TenantsMap>,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<(), TenantStateError> {
|
||||
remove_tenant_from_memory(tenants, tenant_id, async {
|
||||
remove_tenant_from_memory(tenant_id, async {
|
||||
let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_id);
|
||||
fs::File::create(&ignore_mark_file)
|
||||
.await
|
||||
@@ -654,21 +597,18 @@ where
|
||||
/// If the cleanup fails, tenant will stay in memory in [`TenantState::Broken`] state, and another removal
|
||||
/// operation would be needed to remove it.
|
||||
async fn remove_tenant_from_memory<V, F>(
|
||||
tenants: &tokio::sync::RwLock<TenantsMap>,
|
||||
tenant_id: TenantId,
|
||||
tenant_cleanup: F,
|
||||
) -> Result<V, TenantStateError>
|
||||
where
|
||||
F: std::future::Future<Output = anyhow::Result<V>>,
|
||||
{
|
||||
use utils::completion;
|
||||
|
||||
// It's important to keep the tenant in memory after the final cleanup, to avoid cleanup races.
|
||||
// The exclusive lock here ensures we don't miss the tenant state updates before trying another removal.
|
||||
// tenant-wde cleanup operations may take some time (removing the entire tenant directory), we want to
|
||||
// avoid holding the lock for the entire process.
|
||||
let tenant = {
|
||||
tenants
|
||||
TENANTS
|
||||
.write()
|
||||
.await
|
||||
.get(&tenant_id)
|
||||
@@ -676,20 +616,14 @@ where
|
||||
.ok_or(TenantStateError::NotFound(tenant_id))?
|
||||
};
|
||||
|
||||
// allow pageserver shutdown to await for our completion
|
||||
let (_guard, progress) = completion::channel();
|
||||
|
||||
// whenever we remove a tenant from memory, we don't want to flush and wait for upload
|
||||
let freeze_and_flush = false;
|
||||
|
||||
// shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
|
||||
// that we can continue safely to cleanup.
|
||||
match tenant.shutdown(progress, freeze_and_flush).await {
|
||||
match tenant.shutdown(freeze_and_flush).await {
|
||||
Ok(()) => {}
|
||||
Err(_other) => {
|
||||
// if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
|
||||
// wait for it but return an error right away because these are distinct requests.
|
||||
return Err(TenantStateError::IsStopping(tenant_id));
|
||||
Err(super::ShutdownError::AlreadyStopping) => {
|
||||
return Err(TenantStateError::IsStopping(tenant_id))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -698,14 +632,14 @@ where
|
||||
.with_context(|| format!("Failed to run cleanup for tenant {tenant_id}"))
|
||||
{
|
||||
Ok(hook_value) => {
|
||||
let mut tenants_accessor = tenants.write().await;
|
||||
let mut tenants_accessor = TENANTS.write().await;
|
||||
if tenants_accessor.remove(&tenant_id).is_none() {
|
||||
warn!("Tenant {tenant_id} got removed from memory before operation finished");
|
||||
}
|
||||
Ok(hook_value)
|
||||
}
|
||||
Err(e) => {
|
||||
let tenants_accessor = tenants.read().await;
|
||||
let tenants_accessor = TENANTS.read().await;
|
||||
match tenants_accessor.get(&tenant_id) {
|
||||
Some(tenant) => {
|
||||
tenant.set_broken(e.to_string()).await;
|
||||
@@ -774,108 +708,51 @@ pub async fn immediate_gc(
|
||||
Ok(wait_task_done)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use tracing::{info_span, Instrument};
|
||||
pub async fn immediate_compact(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<tokio::sync::oneshot::Receiver<anyhow::Result<()>>, ApiError> {
|
||||
let guard = TENANTS.read().await;
|
||||
|
||||
use super::{super::harness::TenantHarness, TenantsMap};
|
||||
let tenant = guard
|
||||
.get(&tenant_id)
|
||||
.map(Arc::clone)
|
||||
.with_context(|| format!("tenant {tenant_id}"))
|
||||
.map_err(|e| ApiError::NotFound(e.into()))?;
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn shutdown_joins_remove_tenant_from_memory() {
|
||||
// the test is a bit ugly with the lockstep together with spawned tasks. the aim is to make
|
||||
// sure `shutdown_all_tenants0` per-tenant processing joins in any active
|
||||
// remove_tenant_from_memory calls, which is enforced by making the operation last until
|
||||
// we've ran `shutdown_all_tenants0` for a long time.
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(|e| ApiError::NotFound(e.into()))?;
|
||||
|
||||
let (t, _ctx) = TenantHarness::create("shutdown_joins_detach")
|
||||
.unwrap()
|
||||
.load()
|
||||
.await;
|
||||
// Run in task_mgr to avoid race with tenant_detach operation
|
||||
let ctx = ctx.detached_child(TaskKind::Compaction, DownloadBehavior::Download);
|
||||
let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
|
||||
task_mgr::spawn(
|
||||
&tokio::runtime::Handle::current(),
|
||||
TaskKind::Compaction,
|
||||
Some(tenant_id),
|
||||
Some(timeline_id),
|
||||
&format!(
|
||||
"timeline_compact_handler compaction run for tenant {tenant_id} timeline {timeline_id}"
|
||||
),
|
||||
false,
|
||||
async move {
|
||||
let result = timeline
|
||||
.compact(&ctx)
|
||||
.instrument(info_span!("manual_compact", %tenant_id, %timeline_id))
|
||||
.await;
|
||||
|
||||
// harness loads it to active, which is forced and nothing is running on the tenant
|
||||
match task_done.send(result) {
|
||||
Ok(_) => (),
|
||||
Err(result) => error!("failed to send compaction result: {result:?}"),
|
||||
}
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
|
||||
let id = t.tenant_id();
|
||||
// drop the guard until after we've spawned the task so that timeline shutdown will wait for the task
|
||||
drop(guard);
|
||||
|
||||
// tenant harness configures the logging and we cannot escape it
|
||||
let _e = info_span!("testing", tenant_id = %id).entered();
|
||||
|
||||
let tenants = HashMap::from([(id, t.clone())]);
|
||||
let tenants = Arc::new(tokio::sync::RwLock::new(TenantsMap::Open(tenants)));
|
||||
|
||||
let (until_cleanup_completed, can_complete_cleanup) = utils::completion::channel();
|
||||
let (until_cleanup_started, cleanup_started) = utils::completion::channel();
|
||||
|
||||
// start a "detaching operation", which will take a while, until can_complete_cleanup
|
||||
let cleanup_task = {
|
||||
let jh = tokio::spawn({
|
||||
let tenants = tenants.clone();
|
||||
async move {
|
||||
let cleanup = async move {
|
||||
drop(until_cleanup_started);
|
||||
can_complete_cleanup.wait().await;
|
||||
anyhow::Ok(())
|
||||
};
|
||||
super::remove_tenant_from_memory(&tenants, id, cleanup).await
|
||||
}
|
||||
.instrument(info_span!("foobar", tenant_id = %id))
|
||||
});
|
||||
|
||||
// now the long cleanup should be in place, with the stopping state
|
||||
cleanup_started.wait().await;
|
||||
jh
|
||||
};
|
||||
|
||||
let mut cleanup_progress = std::pin::pin!(t
|
||||
.shutdown(utils::completion::Barrier::default(), false)
|
||||
.await
|
||||
.unwrap_err()
|
||||
.wait());
|
||||
|
||||
let mut shutdown_task = {
|
||||
let (until_shutdown_started, shutdown_started) = utils::completion::channel();
|
||||
|
||||
let shutdown_task = tokio::spawn(async move {
|
||||
drop(until_shutdown_started);
|
||||
super::shutdown_all_tenants0(&tenants).await;
|
||||
});
|
||||
|
||||
shutdown_started.wait().await;
|
||||
shutdown_task
|
||||
};
|
||||
|
||||
// if the joining in is removed from shutdown_all_tenants0, the shutdown_task should always
|
||||
// get to complete within timeout and fail the test. it is expected to continue awaiting
|
||||
// until completion or SIGKILL during normal shutdown.
|
||||
//
|
||||
// the timeout is long to cover anything that shutdown_task could be doing, but it is
|
||||
// handled instantly because we use tokio's time pausing in this test. 100s is much more than
|
||||
// what we get from systemd on shutdown (10s).
|
||||
let long_time = std::time::Duration::from_secs(100);
|
||||
tokio::select! {
|
||||
_ = &mut shutdown_task => unreachable!("shutdown must continue, until_cleanup_completed is not dropped"),
|
||||
_ = &mut cleanup_progress => unreachable!("cleanup progress must continue, until_cleanup_completed is not dropped"),
|
||||
_ = tokio::time::sleep(long_time) => {},
|
||||
}
|
||||
|
||||
// allow the remove_tenant_from_memory and thus eventually the shutdown to continue
|
||||
drop(until_cleanup_completed);
|
||||
|
||||
let (je, ()) = tokio::join!(shutdown_task, cleanup_progress);
|
||||
je.expect("Tenant::shutdown shutdown not have panicked");
|
||||
cleanup_task
|
||||
.await
|
||||
.expect("no panicking")
|
||||
.expect("remove_tenant_from_memory failed");
|
||||
|
||||
futures::future::poll_immediate(
|
||||
t.shutdown(utils::completion::Barrier::default(), false)
|
||||
.await
|
||||
.unwrap_err()
|
||||
.wait(),
|
||||
)
|
||||
.await
|
||||
.expect("the stopping progress must still be complete");
|
||||
}
|
||||
Ok(wait_task_done)
|
||||
}
|
||||
|
||||
@@ -135,7 +135,7 @@
|
||||
//! - Initiate upload queue with that [`IndexPart`].
|
||||
//! - Reschedule all lost operations by comparing the local filesystem state
|
||||
//! and remote state as per [`IndexPart`]. This is done in
|
||||
//! [`Tenant::timeline_init_and_sync`] and [`Timeline::reconcile_with_remote`].
|
||||
//! [`Timeline::timeline_init_and_sync`] and [`Timeline::reconcile_with_remote`].
|
||||
//!
|
||||
//! Note that if we crash during file deletion between the index update
|
||||
//! that removes the file from the list of files, and deleting the remote file,
|
||||
@@ -163,8 +163,8 @@
|
||||
//! - download their remote [`IndexPart`]s
|
||||
//! - create `Timeline` struct and a `RemoteTimelineClient`
|
||||
//! - initialize the client's upload queue with its `IndexPart`
|
||||
//! - create [`RemoteLayer`](super::storage_layer::RemoteLayer) instances
|
||||
//! for layers that are referenced by `IndexPart` but not present locally
|
||||
//! - create [`RemoteLayer`] instances for layers that are referenced by `IndexPart`
|
||||
//! but not present locally
|
||||
//! - schedule uploads for layers that are only present locally.
|
||||
//! - if the remote `IndexPart`'s metadata was newer than the metadata in
|
||||
//! the local filesystem, write the remote metadata to the local filesystem
|
||||
@@ -198,8 +198,6 @@
|
||||
//! in remote storage.
|
||||
//! But note that we don't test any of this right now.
|
||||
//!
|
||||
//! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
|
||||
//! [`Timeline::reconcile_with_remote`]: super::Timeline::reconcile_with_remote
|
||||
|
||||
mod delete;
|
||||
mod download;
|
||||
@@ -514,7 +512,7 @@ impl RemoteTimelineClient {
|
||||
/// updated metadata.
|
||||
///
|
||||
/// The upload will be added to the queue immediately, but it
|
||||
/// won't be performed until all previously scheduled layer file
|
||||
/// won't be performed until all previosuly scheduled layer file
|
||||
/// upload operations have completed successfully. This is to
|
||||
/// ensure that when the index file claims that layers X, Y and Z
|
||||
/// exist in remote storage, they really do. To wait for the upload
|
||||
@@ -625,7 +623,7 @@ impl RemoteTimelineClient {
|
||||
/// Note: This schedules an index file upload before the deletions. The
|
||||
/// deletion won't actually be performed, until any previously scheduled
|
||||
/// upload operations, and the index file upload, have completed
|
||||
/// successfully.
|
||||
/// succesfully.
|
||||
pub fn schedule_layer_file_deletion(
|
||||
self: &Arc<Self>,
|
||||
names: &[LayerFileName],
|
||||
@@ -827,7 +825,7 @@ impl RemoteTimelineClient {
|
||||
)
|
||||
};
|
||||
|
||||
receiver.changed().await.context("upload queue shut down")?;
|
||||
receiver.changed().await?;
|
||||
|
||||
// Do not delete index part yet, it is needed for possible retry. If we remove it first
|
||||
// and retry will arrive to different pageserver there wont be any traces of it on remote storage
|
||||
@@ -842,37 +840,23 @@ impl RemoteTimelineClient {
|
||||
let remaining: Vec<RemotePath> = remaining
|
||||
.into_iter()
|
||||
.filter(|p| p.object_name() != Some(IndexPart::FILE_NAME))
|
||||
.inspect(|path| {
|
||||
if let Some(name) = path.object_name() {
|
||||
info!(%name, "deleting a file not referenced from index_part.json");
|
||||
} else {
|
||||
warn!(%path, "deleting a nameless or non-utf8 object not referenced from index_part.json");
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
if !remaining.is_empty() {
|
||||
warn!(
|
||||
"Found {} files not bound to index_file.json, proceeding with their deletion",
|
||||
remaining.len()
|
||||
);
|
||||
warn!("About to remove {} files", remaining.len());
|
||||
self.storage_impl.delete_objects(&remaining).await?;
|
||||
}
|
||||
|
||||
fail::fail_point!("timeline-delete-before-index-delete", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: timeline-delete-before-index-delete"
|
||||
))?
|
||||
});
|
||||
|
||||
let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));
|
||||
|
||||
debug!("deleting index part");
|
||||
self.storage_impl.delete(&index_file_path).await?;
|
||||
|
||||
fail::fail_point!("timeline-delete-after-index-delete", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: timeline-delete-after-index-delete"
|
||||
))?
|
||||
});
|
||||
|
||||
info!(prefix=%timeline_storage_path, referenced=deletions_queued, not_referenced=%remaining.len(), "done deleting in timeline prefix, including index_part.json");
|
||||
info!(deletions_queued, "done deleting, including index_part.json");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1117,7 +1101,7 @@ impl RemoteTimelineClient {
|
||||
debug!("remote task {} completed successfully", task.op);
|
||||
}
|
||||
|
||||
// The task has completed successfully. Remove it from the in-progress list.
|
||||
// The task has completed succesfully. Remove it from the in-progress list.
|
||||
{
|
||||
let mut upload_queue_guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = match upload_queue_guard.deref_mut() {
|
||||
|
||||
@@ -223,45 +223,6 @@ mod tests {
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn v2_indexpart_is_parsed_with_deleted_at() {
|
||||
let example = r#"{
|
||||
"version":2,
|
||||
"timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
|
||||
"missing_layers":["This shouldn't fail deserialization"],
|
||||
"layer_metadata":{
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
|
||||
},
|
||||
"disk_consistent_lsn":"0/16960E8",
|
||||
"metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
|
||||
"deleted_at": "2023-07-31T09:00:00.123"
|
||||
}"#;
|
||||
|
||||
let expected = IndexPart {
|
||||
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
|
||||
version: 2,
|
||||
timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||
file_size: 25600000,
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: 9007199254741001,
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
|
||||
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
|
||||
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
|
||||
};
|
||||
|
||||
let part = serde_json::from_str::<IndexPart>(example).unwrap();
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn empty_layers_are_parsed() {
|
||||
let empty_layers_json = r#"{
|
||||
|
||||
@@ -62,11 +62,12 @@ pub(super) async fn upload_timeline_layer<'a>(
|
||||
let source_file = match source_file_res {
|
||||
Ok(source_file) => source_file,
|
||||
Err(e) if e.kind() == ErrorKind::NotFound => {
|
||||
// If we encounter this arm, it wasn't intended, but it's also not
|
||||
// a big problem, if it's because the file was deleted before an
|
||||
// upload. However, a nonexistent file can also be indicative of
|
||||
// something worse, like when a file is scheduled for upload before
|
||||
// it has been written to disk yet.
|
||||
// In some situations we might run into the underlying file being deleted by
|
||||
// e.g. compaction before the uploader gets to it. In that instance, we don't
|
||||
// want to retry the error: a deleted file won't come back. In theory, the
|
||||
// file might not have been written in the first place, which also indicates
|
||||
// a bug. Still log the situation so that we can keep an eye on it.
|
||||
// See https://github.com/neondatabase/neon/issues/4526
|
||||
info!(path = %source_path.display(), "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more.");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
@@ -110,11 +110,11 @@ pub struct TimelineInputs {
|
||||
///
|
||||
/// Tenant size does not consider the latest state, but only the state until next_gc_cutoff, which
|
||||
/// is updated on-demand, during the start of this calculation and separate from the
|
||||
/// [`TimelineInputs::latest_gc_cutoff`].
|
||||
/// [`Timeline::latest_gc_cutoff`].
|
||||
///
|
||||
/// For timelines in general:
|
||||
///
|
||||
/// ```text
|
||||
/// ```ignore
|
||||
/// 0-----|---------|----|------------| · · · · · |·> lsn
|
||||
/// initdb_lsn branchpoints* next_gc_cutoff latest
|
||||
/// ```
|
||||
|
||||
@@ -11,7 +11,10 @@ pub(crate) static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy<MultiNameExtractor<
|
||||
#[cfg(debug_assertions)]
|
||||
#[track_caller]
|
||||
pub(crate) fn debug_assert_current_span_has_tenant_id() {
|
||||
if let Err(missing) = check_fields_present!([&*TENANT_ID_EXTRACTOR]) {
|
||||
panic!("missing extractors: {missing:?}")
|
||||
if let Err(missing) = check_fields_present([&*TENANT_ID_EXTRACTOR]) {
|
||||
panic!(
|
||||
"missing extractors: {:?}",
|
||||
missing.into_iter().map(|e| e.name()).collect::<Vec<_>>()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,7 +9,7 @@ mod remote_layer;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::repository::Key;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::Result;
|
||||
@@ -34,7 +34,7 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef};
|
||||
pub use delta_layer::{DeltaLayer, DeltaLayerWriter};
|
||||
pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
|
||||
pub use image_layer::{ImageLayer, ImageLayerWriter};
|
||||
pub use inmemory_layer::InMemoryLayer;
|
||||
@@ -162,9 +162,6 @@ impl LayerAccessStats {
|
||||
/// The caller is responsible for recording a residence event
|
||||
/// using [`record_residence_event`] before calling `latest_activity`.
|
||||
/// If they don't, [`latest_activity`] will return `None`.
|
||||
///
|
||||
/// [`record_residence_event`]: Self::record_residence_event
|
||||
/// [`latest_activity`]: Self::latest_activity
|
||||
pub(crate) fn empty_will_record_residence_event_later() -> Self {
|
||||
LayerAccessStats(Mutex::default())
|
||||
}
|
||||
@@ -172,9 +169,6 @@ impl LayerAccessStats {
|
||||
/// Create an empty stats object and record a [`LayerLoad`] event with the given residence status.
|
||||
///
|
||||
/// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
|
||||
///
|
||||
/// [`LayerLoad`]: LayerResidenceEventReason::LayerLoad
|
||||
/// [`record_residence_event`]: Self::record_residence_event
|
||||
pub(crate) fn for_loading_layer(
|
||||
layer_map_lock_held_witness: &LayerManager,
|
||||
status: LayerResidenceStatus,
|
||||
@@ -193,8 +187,6 @@ impl LayerAccessStats {
|
||||
/// The `new_status` is not recorded in `self`.
|
||||
///
|
||||
/// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
|
||||
///
|
||||
/// [`record_residence_event`]: Self::record_residence_event
|
||||
pub(crate) fn clone_for_residence_change(
|
||||
&self,
|
||||
layer_map_lock_held_witness: &LayerManager,
|
||||
@@ -302,13 +294,11 @@ impl LayerAccessStats {
|
||||
/// implementation error. This function logs a rate-limited warning in that case.
|
||||
///
|
||||
/// TODO: use type system to avoid the need for `fallback`.
|
||||
/// The approach in <https://github.com/neondatabase/neon/pull/3775>
|
||||
/// The approach in https://github.com/neondatabase/neon/pull/3775
|
||||
/// could be used to enforce that a residence event is recorded
|
||||
/// before a layer is added to the layer map. We could also have
|
||||
/// a layer wrapper type that holds the LayerAccessStats, and ensure
|
||||
/// that that type can only be produced by inserting into the layer map.
|
||||
///
|
||||
/// [`record_residence_event`]: Self::record_residence_event
|
||||
pub(crate) fn latest_activity(&self) -> Option<SystemTime> {
|
||||
let locked = self.0.lock().unwrap();
|
||||
let inner = &locked.for_eviction_policy;
|
||||
@@ -333,13 +323,12 @@ impl LayerAccessStats {
|
||||
}
|
||||
|
||||
/// Supertrait of the [`Layer`] trait that captures the bare minimum interface
|
||||
/// required by [`LayerMap`](super::layer_map::LayerMap).
|
||||
/// required by [`LayerMap`].
|
||||
///
|
||||
/// All layers should implement a minimal `std::fmt::Debug` without tenant or
|
||||
/// timeline names, because those are known in the context of which the layers
|
||||
/// are used in (timeline).
|
||||
#[async_trait::async_trait]
|
||||
pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
|
||||
pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync {
|
||||
/// Range of keys that this layer covers
|
||||
fn get_key_range(&self) -> Range<Key>;
|
||||
|
||||
@@ -369,7 +358,7 @@ pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
|
||||
/// is available. If this returns ValueReconstructResult::Continue, look up
|
||||
/// the predecessor layer and call again with the same 'reconstruct_data' to
|
||||
/// collect more data.
|
||||
async fn get_value_reconstruct_data(
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
@@ -378,9 +367,15 @@ pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
|
||||
) -> Result<ValueReconstructResult>;
|
||||
|
||||
/// Dump summary of the contents of the layer to stdout
|
||||
async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
|
||||
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
|
||||
}
|
||||
|
||||
/// Returned by [`Layer::iter`]
|
||||
pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i + Send>;
|
||||
|
||||
/// Returned by [`Layer::key_iter`]
|
||||
pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i + Send>;
|
||||
|
||||
/// Get a layer descriptor from a layer.
|
||||
pub trait AsLayerDesc {
|
||||
/// Get the layer descriptor.
|
||||
@@ -401,6 +396,16 @@ pub trait AsLayerDesc {
|
||||
/// An image layer is a snapshot of all the data in a key-range, at a single
|
||||
/// LSN.
|
||||
pub trait PersistentLayer: Layer + AsLayerDesc {
|
||||
/// Identify the tenant this layer belongs to
|
||||
fn get_tenant_id(&self) -> TenantId {
|
||||
self.layer_desc().tenant_id
|
||||
}
|
||||
|
||||
/// Identify the timeline this layer belongs to
|
||||
fn get_timeline_id(&self) -> TimelineId {
|
||||
self.layer_desc().timeline_id
|
||||
}
|
||||
|
||||
/// File name used for this layer, both in the pageserver's local filesystem
|
||||
/// state as well as in the remote storage.
|
||||
fn filename(&self) -> LayerFileName {
|
||||
@@ -411,6 +416,15 @@ pub trait PersistentLayer: Layer + AsLayerDesc {
|
||||
// `None` for `RemoteLayer`.
|
||||
fn local_path(&self) -> Option<PathBuf>;
|
||||
|
||||
/// Iterate through all keys and values stored in the layer
|
||||
fn iter(&self, ctx: &RequestContext) -> Result<LayerIter<'_>>;
|
||||
|
||||
/// Iterate through all keys stored in the layer. Returns key, lsn and value size
|
||||
/// It is used only for compaction and so is currently implemented only for DeltaLayer
|
||||
fn key_iter(&self, _ctx: &RequestContext) -> Result<LayerKeyIter<'_>> {
|
||||
panic!("Not implemented")
|
||||
}
|
||||
|
||||
/// Permanently remove this layer from disk.
|
||||
fn delete_resident_layer_file(&self) -> Result<()>;
|
||||
|
||||
@@ -418,14 +432,18 @@ pub trait PersistentLayer: Layer + AsLayerDesc {
|
||||
None
|
||||
}
|
||||
|
||||
fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
|
||||
None
|
||||
}
|
||||
|
||||
fn is_remote_layer(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
/// Returns None if the layer file size is not known.
|
||||
///
|
||||
/// Should not change over the lifetime of the layer object because
|
||||
/// current_physical_size is computed as the som of this value.
|
||||
fn file_size(&self) -> u64 {
|
||||
self.layer_desc().file_size
|
||||
}
|
||||
|
||||
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo;
|
||||
|
||||
fn access_stats(&self) -> &LayerAccessStats;
|
||||
|
||||
@@ -7,18 +7,14 @@
|
||||
//! must be page images or WAL records with the 'will_init' flag set, so that
|
||||
//! they can be replayed without referring to an older page version.
|
||||
//!
|
||||
//! The delta files are stored in `timelines/<timeline_id>` directory. Currently,
|
||||
//! The delta files are stored in timelines/<timeline_id> directory. Currently,
|
||||
//! there are no subdirectories, and each delta file is named like this:
|
||||
//!
|
||||
//! ```text
|
||||
//! <key start>-<key end>__<start LSN>-<end LSN>
|
||||
//! ```
|
||||
//!
|
||||
//! For example:
|
||||
//!
|
||||
//! ```text
|
||||
//! 000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051
|
||||
//! ```
|
||||
//!
|
||||
//! Every delta file consists of three parts: "summary", "index", and
|
||||
//! "values". The summary is a fixed size header at the beginning of the file,
|
||||
@@ -31,7 +27,7 @@ use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::page_cache::{PageReadGuard, PAGE_SZ};
|
||||
use crate::repository::{Key, Value, KEY_SIZE};
|
||||
use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
|
||||
use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
|
||||
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::storage_layer::{
|
||||
@@ -41,6 +37,7 @@ use crate::virtual_file::VirtualFile;
|
||||
use crate::{walrecord, TEMP_FILE_SUFFIX};
|
||||
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use once_cell::sync::OnceCell;
|
||||
use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -50,8 +47,6 @@ use std::io::{Seek, SeekFrom};
|
||||
use std::ops::Range;
|
||||
use std::os::unix::fs::FileExt;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::OnceCell;
|
||||
use tracing::*;
|
||||
|
||||
use utils::{
|
||||
@@ -61,8 +56,8 @@ use utils::{
|
||||
};
|
||||
|
||||
use super::{
|
||||
AsLayerDesc, DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, PathOrConf,
|
||||
PersistentLayerDesc,
|
||||
AsLayerDesc, DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerIter,
|
||||
LayerKeyIter, PathOrConf, PersistentLayerDesc,
|
||||
};
|
||||
|
||||
///
|
||||
@@ -90,30 +85,14 @@ pub struct Summary {
|
||||
|
||||
impl From<&DeltaLayer> for Summary {
|
||||
fn from(layer: &DeltaLayer) -> Self {
|
||||
Self::expected(
|
||||
layer.desc.tenant_id,
|
||||
layer.desc.timeline_id,
|
||||
layer.desc.key_range.clone(),
|
||||
layer.desc.lsn_range.clone(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl Summary {
|
||||
pub(super) fn expected(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
keys: Range<Key>,
|
||||
lsns: Range<Lsn>,
|
||||
) -> Self {
|
||||
Self {
|
||||
magic: DELTA_FILE_MAGIC,
|
||||
format_version: STORAGE_FORMAT_VERSION,
|
||||
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
key_range: keys,
|
||||
lsn_range: lsns,
|
||||
tenant_id: layer.desc.tenant_id,
|
||||
timeline_id: layer.desc.timeline_id,
|
||||
key_range: layer.desc.key_range.clone(),
|
||||
lsn_range: layer.desc.lsn_range.clone(),
|
||||
|
||||
index_start_blk: 0,
|
||||
index_root_blk: 0,
|
||||
@@ -124,10 +103,12 @@ impl Summary {
|
||||
// Flag indicating that this version initialize the page
|
||||
const WILL_INIT: u64 = 1;
|
||||
|
||||
///
|
||||
/// Struct representing reference to BLOB in layers. Reference contains BLOB
|
||||
/// offset, and for WAL records it also contains `will_init` flag. The flag
|
||||
/// helps to determine the range of records that needs to be applied, without
|
||||
/// reading/deserializing records themselves.
|
||||
///
|
||||
#[derive(Debug, Serialize, Deserialize, Copy, Clone)]
|
||||
pub struct BlobRef(pub u64);
|
||||
|
||||
@@ -152,8 +133,10 @@ impl BlobRef {
|
||||
pub const DELTA_KEY_SIZE: usize = KEY_SIZE + 8;
|
||||
struct DeltaKey([u8; DELTA_KEY_SIZE]);
|
||||
|
||||
///
|
||||
/// This is the key of the B-tree index stored in the delta layer. It consists
|
||||
/// of the serialized representation of a Key and LSN.
|
||||
///
|
||||
impl DeltaKey {
|
||||
fn from_slice(buf: &[u8]) -> Self {
|
||||
let mut bytes: [u8; DELTA_KEY_SIZE] = [0u8; DELTA_KEY_SIZE];
|
||||
@@ -201,7 +184,7 @@ pub struct DeltaLayer {
|
||||
|
||||
access_stats: LayerAccessStats,
|
||||
|
||||
inner: OnceCell<Arc<DeltaLayerInner>>,
|
||||
inner: OnceCell<DeltaLayerInner>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for DeltaLayer {
|
||||
@@ -226,12 +209,6 @@ pub struct DeltaLayerInner {
|
||||
file: FileBlockReader<VirtualFile>,
|
||||
}
|
||||
|
||||
impl AsRef<DeltaLayerInner> for DeltaLayerInner {
|
||||
fn as_ref(&self) -> &DeltaLayerInner {
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for DeltaLayerInner {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("DeltaLayerInner")
|
||||
@@ -241,10 +218,9 @@ impl std::fmt::Debug for DeltaLayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for DeltaLayer {
|
||||
/// debugging function to print out the contents of the layer
|
||||
async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
println!(
|
||||
"----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} size {} ----",
|
||||
self.desc.tenant_id,
|
||||
@@ -260,7 +236,7 @@ impl Layer for DeltaLayer {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let inner = self.load(LayerAccessKind::Dump, ctx).await?;
|
||||
let inner = self.load(LayerAccessKind::Dump, ctx)?;
|
||||
|
||||
println!(
|
||||
"index_start_blk: {}, root {}",
|
||||
@@ -274,12 +250,12 @@ impl Layer for DeltaLayer {
|
||||
file,
|
||||
);
|
||||
|
||||
tree_reader.dump().await?;
|
||||
tree_reader.dump()?;
|
||||
|
||||
let cursor = file.block_cursor();
|
||||
let mut cursor = file.block_cursor();
|
||||
|
||||
// A subroutine to dump a single blob
|
||||
let dump_blob = |blob_ref: BlobRef| -> anyhow::Result<String> {
|
||||
let mut dump_blob = |blob_ref: BlobRef| -> anyhow::Result<String> {
|
||||
let buf = cursor.read_blob(blob_ref.pos())?;
|
||||
let val = Value::des(&buf)?;
|
||||
let desc = match val {
|
||||
@@ -299,29 +275,27 @@ impl Layer for DeltaLayer {
|
||||
Ok(desc)
|
||||
};
|
||||
|
||||
tree_reader
|
||||
.visit(
|
||||
&[0u8; DELTA_KEY_SIZE],
|
||||
VisitDirection::Forwards,
|
||||
|delta_key, val| {
|
||||
let blob_ref = BlobRef(val);
|
||||
let key = DeltaKey::extract_key_from_buf(delta_key);
|
||||
let lsn = DeltaKey::extract_lsn_from_buf(delta_key);
|
||||
tree_reader.visit(
|
||||
&[0u8; DELTA_KEY_SIZE],
|
||||
VisitDirection::Forwards,
|
||||
|delta_key, val| {
|
||||
let blob_ref = BlobRef(val);
|
||||
let key = DeltaKey::extract_key_from_buf(delta_key);
|
||||
let lsn = DeltaKey::extract_lsn_from_buf(delta_key);
|
||||
|
||||
let desc = match dump_blob(blob_ref) {
|
||||
Ok(desc) => desc,
|
||||
Err(err) => format!("ERROR: {}", err),
|
||||
};
|
||||
println!(" key {} at {}: {}", key, lsn, desc);
|
||||
true
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
let desc = match dump_blob(blob_ref) {
|
||||
Ok(desc) => desc,
|
||||
Err(err) => format!("ERROR: {}", err),
|
||||
};
|
||||
println!(" key {} at {}: {}", key, lsn, desc);
|
||||
true
|
||||
},
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn get_value_reconstruct_data(
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
@@ -329,16 +303,82 @@ impl Layer for DeltaLayer {
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
ensure!(lsn_range.start >= self.desc.lsn_range.start);
|
||||
let mut need_image = true;
|
||||
|
||||
ensure!(self.desc.key_range.contains(&key));
|
||||
|
||||
let inner = self
|
||||
.load(LayerAccessKind::GetValueReconstructData, ctx)
|
||||
.await?;
|
||||
{
|
||||
// Open the file and lock the metadata in memory
|
||||
let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;
|
||||
|
||||
inner
|
||||
.get_value_reconstruct_data(key, lsn_range, reconstruct_state)
|
||||
.await
|
||||
// Scan the page versions backwards, starting from `lsn`.
|
||||
let file = &inner.file;
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
inner.index_start_blk,
|
||||
inner.index_root_blk,
|
||||
file,
|
||||
);
|
||||
let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1));
|
||||
|
||||
let mut offsets: Vec<(Lsn, u64)> = Vec::new();
|
||||
|
||||
tree_reader.visit(&search_key.0, VisitDirection::Backwards, |key, value| {
|
||||
let blob_ref = BlobRef(value);
|
||||
if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
|
||||
return false;
|
||||
}
|
||||
let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
|
||||
if entry_lsn < lsn_range.start {
|
||||
return false;
|
||||
}
|
||||
offsets.push((entry_lsn, blob_ref.pos()));
|
||||
|
||||
!blob_ref.will_init()
|
||||
})?;
|
||||
|
||||
// Ok, 'offsets' now contains the offsets of all the entries we need to read
|
||||
let mut cursor = file.block_cursor();
|
||||
let mut buf = Vec::new();
|
||||
for (entry_lsn, pos) in offsets {
|
||||
cursor.read_blob_into_buf(pos, &mut buf).with_context(|| {
|
||||
format!(
|
||||
"Failed to read blob from virtual file {}",
|
||||
file.file.path.display()
|
||||
)
|
||||
})?;
|
||||
let val = Value::des(&buf).with_context(|| {
|
||||
format!(
|
||||
"Failed to deserialize file blob from virtual file {}",
|
||||
file.file.path.display()
|
||||
)
|
||||
})?;
|
||||
match val {
|
||||
Value::Image(img) => {
|
||||
reconstruct_state.img = Some((entry_lsn, img));
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let will_init = rec.will_init();
|
||||
reconstruct_state.records.push((entry_lsn, rec));
|
||||
if will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// release metadata lock and close the file
|
||||
}
|
||||
|
||||
// If an older page image is needed to reconstruct the page, let the
|
||||
// caller know.
|
||||
if need_image {
|
||||
Ok(ValueReconstructResult::Continue)
|
||||
} else {
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
}
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
@@ -370,14 +410,27 @@ impl AsLayerDesc for DeltaLayer {
|
||||
}
|
||||
|
||||
impl PersistentLayer for DeltaLayer {
|
||||
fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
|
||||
Some(self)
|
||||
}
|
||||
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
Some(self.path())
|
||||
}
|
||||
|
||||
fn iter(&self, ctx: &RequestContext) -> Result<LayerIter<'_>> {
|
||||
let inner = self
|
||||
.load(LayerAccessKind::KeyIter, ctx)
|
||||
.context("load delta layer")?;
|
||||
Ok(match DeltaValueIter::new(inner) {
|
||||
Ok(iter) => Box::new(iter),
|
||||
Err(err) => Box::new(std::iter::once(Err(err))),
|
||||
})
|
||||
}
|
||||
|
||||
fn key_iter(&self, ctx: &RequestContext) -> Result<LayerKeyIter<'_>> {
|
||||
let inner = self.load(LayerAccessKind::KeyIter, ctx)?;
|
||||
Ok(Box::new(
|
||||
DeltaKeyIter::new(inner).context("Layer index is corrupted")?,
|
||||
))
|
||||
}
|
||||
|
||||
fn delete_resident_layer_file(&self) -> Result<()> {
|
||||
// delete underlying file
|
||||
fs::remove_file(self.path())?;
|
||||
@@ -447,44 +500,55 @@ impl DeltaLayer {
|
||||
/// Open the underlying file and read the metadata into memory, if it's
|
||||
/// not loaded already.
|
||||
///
|
||||
async fn load(
|
||||
&self,
|
||||
access_kind: LayerAccessKind,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<&Arc<DeltaLayerInner>> {
|
||||
fn load(&self, access_kind: LayerAccessKind, ctx: &RequestContext) -> Result<&DeltaLayerInner> {
|
||||
self.access_stats
|
||||
.record_access(access_kind, ctx.task_kind());
|
||||
// Quick exit if already loaded
|
||||
self.inner
|
||||
.get_or_try_init(|| self.load_inner())
|
||||
.await
|
||||
.with_context(|| format!("Failed to load delta layer {}", self.path().display()))
|
||||
}
|
||||
|
||||
async fn load_inner(&self) -> Result<Arc<DeltaLayerInner>> {
|
||||
fn load_inner(&self) -> Result<DeltaLayerInner> {
|
||||
let path = self.path();
|
||||
|
||||
let summary = match &self.path_or_conf {
|
||||
PathOrConf::Conf(_) => Some(Summary::from(self)),
|
||||
PathOrConf::Path(_) => None,
|
||||
};
|
||||
let file = VirtualFile::open(&path)
|
||||
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
|
||||
let file = FileBlockReader::new(file);
|
||||
|
||||
let loaded = DeltaLayerInner::load(&path, summary)?;
|
||||
let summary_blk = file.read_blk(0)?;
|
||||
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
|
||||
|
||||
if let PathOrConf::Path(ref path) = self.path_or_conf {
|
||||
// not production code
|
||||
match &self.path_or_conf {
|
||||
PathOrConf::Conf(_) => {
|
||||
let mut expected_summary = Summary::from(self);
|
||||
expected_summary.index_start_blk = actual_summary.index_start_blk;
|
||||
expected_summary.index_root_blk = actual_summary.index_root_blk;
|
||||
if actual_summary != expected_summary {
|
||||
bail!("in-file summary does not match expected summary. actual = {:?} expected = {:?}", actual_summary, expected_summary);
|
||||
}
|
||||
}
|
||||
PathOrConf::Path(path) => {
|
||||
let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
|
||||
let expected_filename = self.filename().file_name();
|
||||
|
||||
let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
|
||||
let expected_filename = self.filename().file_name();
|
||||
|
||||
if actual_filename != expected_filename {
|
||||
println!("warning: filename does not match what is expected from in-file summary");
|
||||
println!("actual: {:?}", actual_filename);
|
||||
println!("expected: {:?}", expected_filename);
|
||||
if actual_filename != expected_filename {
|
||||
println!(
|
||||
"warning: filename does not match what is expected from in-file summary"
|
||||
);
|
||||
println!("actual: {:?}", actual_filename);
|
||||
println!("expected: {:?}", expected_filename);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Arc::new(loaded))
|
||||
debug!("loaded from {}", &path.display());
|
||||
|
||||
Ok(DeltaLayerInner {
|
||||
file,
|
||||
index_start_blk: actual_summary.index_start_blk,
|
||||
index_root_blk: actual_summary.index_root_blk,
|
||||
})
|
||||
}
|
||||
|
||||
/// Create a DeltaLayer struct representing an existing file on disk.
|
||||
@@ -506,7 +570,7 @@ impl DeltaLayer {
|
||||
file_size,
|
||||
),
|
||||
access_stats,
|
||||
inner: OnceCell::new(),
|
||||
inner: once_cell::sync::OnceCell::new(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -533,7 +597,7 @@ impl DeltaLayer {
|
||||
metadata.len(),
|
||||
),
|
||||
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
|
||||
inner: OnceCell::new(),
|
||||
inner: once_cell::sync::OnceCell::new(),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -549,33 +613,6 @@ impl DeltaLayer {
|
||||
&self.layer_name(),
|
||||
)
|
||||
}
|
||||
|
||||
/// Obtains all keys and value references stored in the layer
|
||||
///
|
||||
/// The value can be obtained via the [`ValueRef::load`] function.
|
||||
pub async fn load_val_refs(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Vec<(Key, Lsn, ValueRef<Arc<DeltaLayerInner>>)>> {
|
||||
let inner = self
|
||||
.load(LayerAccessKind::Iter, ctx)
|
||||
.await
|
||||
.context("load delta layer")?;
|
||||
DeltaLayerInner::load_val_refs(inner)
|
||||
.await
|
||||
.context("Layer index is corrupted")
|
||||
}
|
||||
|
||||
/// Loads all keys stored in the layer. Returns key, lsn and value size.
|
||||
pub async fn load_keys(&self, ctx: &RequestContext) -> Result<Vec<(Key, Lsn, u64)>> {
|
||||
let inner = self
|
||||
.load(LayerAccessKind::KeyIter, ctx)
|
||||
.await
|
||||
.context("load delta layer keys")?;
|
||||
DeltaLayerInner::load_keys(inner)
|
||||
.await
|
||||
.context("Layer index is corrupted")
|
||||
}
|
||||
}
|
||||
|
||||
/// A builder object for constructing a new delta layer.
|
||||
@@ -724,7 +761,7 @@ impl DeltaLayerWriterInner {
|
||||
metadata.len(),
|
||||
),
|
||||
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
|
||||
inner: OnceCell::new(),
|
||||
inner: once_cell::sync::OnceCell::new(),
|
||||
};
|
||||
|
||||
// fsync the file
|
||||
@@ -763,7 +800,7 @@ impl DeltaLayerWriterInner {
|
||||
///
|
||||
/// # Note
|
||||
///
|
||||
/// As described in <https://github.com/neondatabase/neon/issues/2650>, it's
|
||||
/// As described in https://github.com/neondatabase/neon/issues/2650, it's
|
||||
/// possible for the writer to drop before `finish` is actually called. So this
|
||||
/// could lead to odd temporary files in the directory, exhausting file system.
|
||||
/// This structure wraps `DeltaLayerWriterInner` and also contains `Drop`
|
||||
@@ -846,201 +883,168 @@ impl Drop for DeltaLayerWriter {
|
||||
}
|
||||
}
|
||||
|
||||
impl DeltaLayerInner {
|
||||
pub(super) fn load(path: &std::path::Path, summary: Option<Summary>) -> anyhow::Result<Self> {
|
||||
let file = VirtualFile::open(path)
|
||||
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
|
||||
let file = FileBlockReader::new(file);
|
||||
|
||||
let summary_blk = file.read_blk(0)?;
|
||||
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
|
||||
|
||||
if let Some(mut expected_summary) = summary {
|
||||
// production code path
|
||||
expected_summary.index_start_blk = actual_summary.index_start_blk;
|
||||
expected_summary.index_root_blk = actual_summary.index_root_blk;
|
||||
if actual_summary != expected_summary {
|
||||
bail!(
|
||||
"in-file summary does not match expected summary. actual = {:?} expected = {:?}",
|
||||
actual_summary,
|
||||
expected_summary
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(DeltaLayerInner {
|
||||
file,
|
||||
index_start_blk: actual_summary.index_start_blk,
|
||||
index_root_blk: actual_summary.index_root_blk,
|
||||
})
|
||||
}
|
||||
|
||||
pub(super) async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
let mut need_image = true;
|
||||
// Scan the page versions backwards, starting from `lsn`.
|
||||
let file = &self.file;
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
self.index_start_blk,
|
||||
self.index_root_blk,
|
||||
file,
|
||||
);
|
||||
let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1));
|
||||
|
||||
let mut offsets: Vec<(Lsn, u64)> = Vec::new();
|
||||
|
||||
tree_reader
|
||||
.visit(&search_key.0, VisitDirection::Backwards, |key, value| {
|
||||
let blob_ref = BlobRef(value);
|
||||
if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
|
||||
return false;
|
||||
}
|
||||
let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
|
||||
if entry_lsn < lsn_range.start {
|
||||
return false;
|
||||
}
|
||||
offsets.push((entry_lsn, blob_ref.pos()));
|
||||
|
||||
!blob_ref.will_init()
|
||||
})
|
||||
.await?;
|
||||
|
||||
// Ok, 'offsets' now contains the offsets of all the entries we need to read
|
||||
let cursor = file.block_cursor();
|
||||
let mut buf = Vec::new();
|
||||
for (entry_lsn, pos) in offsets {
|
||||
cursor.read_blob_into_buf(pos, &mut buf).with_context(|| {
|
||||
format!(
|
||||
"Failed to read blob from virtual file {}",
|
||||
file.file.path.display()
|
||||
)
|
||||
})?;
|
||||
let val = Value::des(&buf).with_context(|| {
|
||||
format!(
|
||||
"Failed to deserialize file blob from virtual file {}",
|
||||
file.file.path.display()
|
||||
)
|
||||
})?;
|
||||
match val {
|
||||
Value::Image(img) => {
|
||||
reconstruct_state.img = Some((entry_lsn, img));
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let will_init = rec.will_init();
|
||||
reconstruct_state.records.push((entry_lsn, rec));
|
||||
if will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If an older page image is needed to reconstruct the page, let the
|
||||
// caller know.
|
||||
if need_image {
|
||||
Ok(ValueReconstructResult::Continue)
|
||||
} else {
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) async fn load_val_refs<T: AsRef<DeltaLayerInner> + Clone>(
|
||||
this: &T,
|
||||
) -> Result<Vec<(Key, Lsn, ValueRef<T>)>> {
|
||||
let dl = this.as_ref();
|
||||
let file = &dl.file;
|
||||
let tree_reader =
|
||||
DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(dl.index_start_blk, dl.index_root_blk, file);
|
||||
|
||||
let mut all_offsets = Vec::<(Key, Lsn, ValueRef<T>)>::new();
|
||||
tree_reader
|
||||
.visit(
|
||||
&[0u8; DELTA_KEY_SIZE],
|
||||
VisitDirection::Forwards,
|
||||
|key, value| {
|
||||
let delta_key = DeltaKey::from_slice(key);
|
||||
let val_ref = ValueRef {
|
||||
blob_ref: BlobRef(value),
|
||||
reader: BlockCursor::new(Adapter(this.clone())),
|
||||
};
|
||||
all_offsets.push((delta_key.key(), delta_key.lsn(), val_ref));
|
||||
true
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(all_offsets)
|
||||
}
|
||||
|
||||
pub(super) async fn load_keys(&self) -> Result<Vec<(Key, Lsn, u64)>> {
|
||||
let file = &self.file;
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
self.index_start_blk,
|
||||
self.index_root_blk,
|
||||
file,
|
||||
);
|
||||
|
||||
let mut all_keys: Vec<(Key, Lsn, u64)> = Vec::new();
|
||||
tree_reader
|
||||
.visit(
|
||||
&[0u8; DELTA_KEY_SIZE],
|
||||
VisitDirection::Forwards,
|
||||
|key, value| {
|
||||
let delta_key = DeltaKey::from_slice(key);
|
||||
let pos = BlobRef(value).pos();
|
||||
if let Some(last) = all_keys.last_mut() {
|
||||
if last.0 == delta_key.key() {
|
||||
return true;
|
||||
} else {
|
||||
// subtract offset of new key BLOB and first blob of this key
|
||||
// to get total size if values associated with this key
|
||||
let first_pos = last.2;
|
||||
last.2 = pos - first_pos;
|
||||
}
|
||||
}
|
||||
all_keys.push((delta_key.key(), delta_key.lsn(), pos));
|
||||
true
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
if let Some(last) = all_keys.last_mut() {
|
||||
// Last key occupies all space till end of layer
|
||||
last.2 = std::fs::metadata(&file.file.path)?.len() - last.2;
|
||||
}
|
||||
Ok(all_keys)
|
||||
}
|
||||
///
|
||||
/// Iterator over all key-value pairse stored in a delta layer
|
||||
///
|
||||
/// FIXME: This creates a Vector to hold the offsets of all key value pairs.
|
||||
/// That takes up quite a lot of memory. Should do this in a more streaming
|
||||
/// fashion.
|
||||
///
|
||||
struct DeltaValueIter<'a> {
|
||||
all_offsets: Vec<(DeltaKey, BlobRef)>,
|
||||
next_idx: usize,
|
||||
reader: BlockCursor<Adapter<'a>>,
|
||||
}
|
||||
|
||||
/// Reference to an on-disk value
|
||||
pub struct ValueRef<T: AsRef<DeltaLayerInner>> {
|
||||
blob_ref: BlobRef,
|
||||
reader: BlockCursor<Adapter<T>>,
|
||||
}
|
||||
struct Adapter<'a>(&'a DeltaLayerInner);
|
||||
|
||||
impl<T: AsRef<DeltaLayerInner>> ValueRef<T> {
|
||||
/// Loads the value from disk
|
||||
pub fn load(&self) -> Result<Value> {
|
||||
// theoretically we *could* record an access time for each, but it does not really matter
|
||||
let buf = self.reader.read_blob(self.blob_ref.pos())?;
|
||||
let val = Value::des(&buf)?;
|
||||
Ok(val)
|
||||
}
|
||||
}
|
||||
|
||||
struct Adapter<T: AsRef<DeltaLayerInner>>(T);
|
||||
|
||||
impl<T: AsRef<DeltaLayerInner>> BlockReader for Adapter<T> {
|
||||
impl<'a> BlockReader for Adapter<'a> {
|
||||
type BlockLease = PageReadGuard<'static>;
|
||||
|
||||
fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
|
||||
self.0.as_ref().file.read_blk(blknum)
|
||||
self.0.file.read_blk(blknum)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for DeltaValueIter<'a> {
|
||||
type Item = Result<(Key, Lsn, Value)>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.next_res().transpose()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> DeltaValueIter<'a> {
|
||||
fn new(inner: &'a DeltaLayerInner) -> Result<Self> {
|
||||
let file = &inner.file;
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
inner.index_start_blk,
|
||||
inner.index_root_blk,
|
||||
file,
|
||||
);
|
||||
|
||||
let mut all_offsets: Vec<(DeltaKey, BlobRef)> = Vec::new();
|
||||
tree_reader.visit(
|
||||
&[0u8; DELTA_KEY_SIZE],
|
||||
VisitDirection::Forwards,
|
||||
|key, value| {
|
||||
all_offsets.push((DeltaKey::from_slice(key), BlobRef(value)));
|
||||
true
|
||||
},
|
||||
)?;
|
||||
|
||||
let iter = DeltaValueIter {
|
||||
all_offsets,
|
||||
next_idx: 0,
|
||||
reader: BlockCursor::new(Adapter(inner)),
|
||||
};
|
||||
|
||||
Ok(iter)
|
||||
}
|
||||
|
||||
fn next_res(&mut self) -> Result<Option<(Key, Lsn, Value)>> {
|
||||
if self.next_idx < self.all_offsets.len() {
|
||||
let (delta_key, blob_ref) = &self.all_offsets[self.next_idx];
|
||||
|
||||
let key = delta_key.key();
|
||||
let lsn = delta_key.lsn();
|
||||
|
||||
let buf = self.reader.read_blob(blob_ref.pos())?;
|
||||
let val = Value::des(&buf)?;
|
||||
self.next_idx += 1;
|
||||
Ok(Some((key, lsn, val)))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
}
|
||||
///
|
||||
/// Iterator over all keys stored in a delta layer
|
||||
///
|
||||
/// FIXME: This creates a Vector to hold all keys.
|
||||
/// That takes up quite a lot of memory. Should do this in a more streaming
|
||||
/// fashion.
|
||||
///
|
||||
struct DeltaKeyIter {
|
||||
all_keys: Vec<(DeltaKey, u64)>,
|
||||
next_idx: usize,
|
||||
}
|
||||
|
||||
impl Iterator for DeltaKeyIter {
|
||||
type Item = (Key, Lsn, u64);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
if self.next_idx < self.all_keys.len() {
|
||||
let (delta_key, size) = &self.all_keys[self.next_idx];
|
||||
|
||||
let key = delta_key.key();
|
||||
let lsn = delta_key.lsn();
|
||||
|
||||
self.next_idx += 1;
|
||||
Some((key, lsn, *size))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> DeltaKeyIter {
|
||||
fn new(inner: &'a DeltaLayerInner) -> Result<Self> {
|
||||
let file = &inner.file;
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
inner.index_start_blk,
|
||||
inner.index_root_blk,
|
||||
file,
|
||||
);
|
||||
|
||||
let mut all_keys: Vec<(DeltaKey, u64)> = Vec::new();
|
||||
tree_reader.visit(
|
||||
&[0u8; DELTA_KEY_SIZE],
|
||||
VisitDirection::Forwards,
|
||||
|key, value| {
|
||||
let delta_key = DeltaKey::from_slice(key);
|
||||
let pos = BlobRef(value).pos();
|
||||
if let Some(last) = all_keys.last_mut() {
|
||||
if last.0.key() == delta_key.key() {
|
||||
return true;
|
||||
} else {
|
||||
// subtract offset of new key BLOB and first blob of this key
|
||||
// to get total size if values associated with this key
|
||||
let first_pos = last.1;
|
||||
last.1 = pos - first_pos;
|
||||
}
|
||||
}
|
||||
all_keys.push((delta_key, pos));
|
||||
true
|
||||
},
|
||||
)?;
|
||||
if let Some(last) = all_keys.last_mut() {
|
||||
// Last key occupies all space till end of layer
|
||||
last.1 = std::fs::metadata(&file.file.path)?.len() - last.1;
|
||||
}
|
||||
let iter = DeltaKeyIter {
|
||||
all_keys,
|
||||
next_idx: 0,
|
||||
};
|
||||
|
||||
Ok(iter)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::DeltaKeyIter;
|
||||
use super::DeltaLayer;
|
||||
use super::DeltaValueIter;
|
||||
|
||||
// We will soon need the iters to be send in the compaction code.
|
||||
// Cf https://github.com/neondatabase/neon/pull/4462#issuecomment-1587398883
|
||||
// Cf https://github.com/neondatabase/neon/issues/4471
|
||||
#[test]
|
||||
fn is_send() {
|
||||
fn assert_send<T: Send>() {}
|
||||
assert_send::<DeltaLayer>();
|
||||
assert_send::<DeltaValueIter>();
|
||||
assert_send::<DeltaKeyIter>();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -57,9 +57,8 @@ impl Ord for DeltaFileName {
|
||||
|
||||
/// Represents the filename of a DeltaLayer
|
||||
///
|
||||
/// ```text
|
||||
/// <key start>-<key end>__<LSN start>-<LSN end>
|
||||
/// ```
|
||||
///
|
||||
impl DeltaFileName {
|
||||
///
|
||||
/// Parse a string as a delta file name. Returns None if the filename does not
|
||||
@@ -163,9 +162,7 @@ impl ImageFileName {
|
||||
///
|
||||
/// Represents the filename of an ImageLayer
|
||||
///
|
||||
/// ```text
|
||||
/// <key start>-<key end>__<LSN>
|
||||
/// ```
|
||||
impl ImageFileName {
|
||||
///
|
||||
/// Parse a string as an image file name. Returns None if the filename does not
|
||||
|
||||
@@ -7,15 +7,11 @@
|
||||
//! timelines/<timeline_id> directory. Currently, there are no
|
||||
//! subdirectories, and each image layer file is named like this:
|
||||
//!
|
||||
//! ```text
|
||||
//! <key start>-<key end>__<LSN>
|
||||
//! ```
|
||||
//!
|
||||
//! For example:
|
||||
//!
|
||||
//! ```text
|
||||
//! 000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568
|
||||
//! ```
|
||||
//!
|
||||
//! Every image layer file consists of three parts: "summary",
|
||||
//! "index", and "values". The summary is a fixed size header at the
|
||||
@@ -27,7 +23,7 @@ use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use crate::repository::{Key, KEY_SIZE};
|
||||
use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
|
||||
use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
|
||||
use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::storage_layer::{
|
||||
@@ -47,7 +43,7 @@ use std::io::{Seek, SeekFrom};
|
||||
use std::ops::Range;
|
||||
use std::os::unix::prelude::FileExt;
|
||||
use std::path::{Path, PathBuf};
|
||||
use tokio::sync::OnceCell;
|
||||
use std::sync::{RwLock, RwLockReadGuard};
|
||||
use tracing::*;
|
||||
|
||||
use utils::{
|
||||
@@ -57,7 +53,9 @@ use utils::{
|
||||
};
|
||||
|
||||
use super::filename::ImageFileName;
|
||||
use super::{AsLayerDesc, Layer, LayerAccessStatsReset, PathOrConf, PersistentLayerDesc};
|
||||
use super::{
|
||||
AsLayerDesc, Layer, LayerAccessStatsReset, LayerIter, PathOrConf, PersistentLayerDesc,
|
||||
};
|
||||
|
||||
///
|
||||
/// Header stored in the beginning of the file
|
||||
@@ -66,7 +64,7 @@ use super::{AsLayerDesc, Layer, LayerAccessStatsReset, PathOrConf, PersistentLay
|
||||
/// the 'index' starts at the block indicated by 'index_start_blk'
|
||||
///
|
||||
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
|
||||
pub(super) struct Summary {
|
||||
struct Summary {
|
||||
/// Magic value to identify this as a neon image file. Always IMAGE_FILE_MAGIC.
|
||||
magic: u16,
|
||||
format_version: u16,
|
||||
@@ -85,29 +83,13 @@ pub(super) struct Summary {
|
||||
|
||||
impl From<&ImageLayer> for Summary {
|
||||
fn from(layer: &ImageLayer) -> Self {
|
||||
Self::expected(
|
||||
layer.desc.tenant_id,
|
||||
layer.desc.timeline_id,
|
||||
layer.desc.key_range.clone(),
|
||||
layer.lsn,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl Summary {
|
||||
pub(super) fn expected(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
key_range: Range<Key>,
|
||||
lsn: Lsn,
|
||||
) -> Self {
|
||||
Self {
|
||||
magic: IMAGE_FILE_MAGIC,
|
||||
format_version: STORAGE_FORMAT_VERSION,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
key_range,
|
||||
lsn,
|
||||
tenant_id: layer.desc.tenant_id,
|
||||
timeline_id: layer.desc.timeline_id,
|
||||
key_range: layer.desc.key_range.clone(),
|
||||
lsn: layer.lsn,
|
||||
|
||||
index_start_blk: 0,
|
||||
index_root_blk: 0,
|
||||
@@ -131,7 +113,7 @@ pub struct ImageLayer {
|
||||
|
||||
access_stats: LayerAccessStats,
|
||||
|
||||
inner: OnceCell<ImageLayerInner>,
|
||||
inner: RwLock<ImageLayerInner>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for ImageLayer {
|
||||
@@ -148,29 +130,30 @@ impl std::fmt::Debug for ImageLayer {
|
||||
}
|
||||
|
||||
pub struct ImageLayerInner {
|
||||
/// If false, the 'index' has not been loaded into memory yet.
|
||||
loaded: bool,
|
||||
|
||||
// values copied from summary
|
||||
index_start_blk: u32,
|
||||
index_root_blk: u32,
|
||||
|
||||
lsn: Lsn,
|
||||
|
||||
/// Reader object for reading blocks from the file.
|
||||
file: FileBlockReader<VirtualFile>,
|
||||
/// Reader object for reading blocks from the file. (None if not loaded yet)
|
||||
file: Option<FileBlockReader<VirtualFile>>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for ImageLayerInner {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("ImageLayerInner")
|
||||
.field("loaded", &self.loaded)
|
||||
.field("index_start_blk", &self.index_start_blk)
|
||||
.field("index_root_blk", &self.index_root_blk)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for ImageLayer {
|
||||
/// debugging function to print out the contents of the layer
|
||||
async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
println!(
|
||||
"----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
|
||||
self.desc.tenant_id,
|
||||
@@ -186,25 +169,23 @@ impl Layer for ImageLayer {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let inner = self.load(LayerAccessKind::Dump, ctx).await?;
|
||||
let file = &inner.file;
|
||||
let inner = self.load(LayerAccessKind::Dump, ctx)?;
|
||||
let file = inner.file.as_ref().unwrap();
|
||||
let tree_reader =
|
||||
DiskBtreeReader::<_, KEY_SIZE>::new(inner.index_start_blk, inner.index_root_blk, file);
|
||||
|
||||
tree_reader.dump().await?;
|
||||
tree_reader.dump()?;
|
||||
|
||||
tree_reader
|
||||
.visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| {
|
||||
println!("key: {} offset {}", hex::encode(key), value);
|
||||
true
|
||||
})
|
||||
.await?;
|
||||
tree_reader.visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| {
|
||||
println!("key: {} offset {}", hex::encode(key), value);
|
||||
true
|
||||
})?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Look up given page in the file
|
||||
async fn get_value_reconstruct_data(
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
@@ -215,14 +196,28 @@ impl Layer for ImageLayer {
|
||||
assert!(lsn_range.start >= self.lsn);
|
||||
assert!(lsn_range.end >= self.lsn);
|
||||
|
||||
let inner = self
|
||||
.load(LayerAccessKind::GetValueReconstructData, ctx)
|
||||
.await?;
|
||||
inner
|
||||
.get_value_reconstruct_data(key, reconstruct_state)
|
||||
.await
|
||||
// FIXME: makes no sense to dump paths
|
||||
.with_context(|| format!("read {}", self.path().display()))
|
||||
let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;
|
||||
|
||||
let file = inner.file.as_ref().unwrap();
|
||||
let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file);
|
||||
|
||||
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
|
||||
key.write_to_byte_slice(&mut keybuf);
|
||||
if let Some(offset) = tree_reader.get(&keybuf)? {
|
||||
let blob = file.block_cursor().read_blob(offset).with_context(|| {
|
||||
format!(
|
||||
"failed to read value from data file {} at offset {}",
|
||||
self.path().display(),
|
||||
offset
|
||||
)
|
||||
})?;
|
||||
let value = Bytes::from(blob);
|
||||
|
||||
reconstruct_state.img = Some((self.lsn, value));
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
} else {
|
||||
Ok(ValueReconstructResult::Missing)
|
||||
}
|
||||
}
|
||||
|
||||
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
|
||||
@@ -259,6 +254,10 @@ impl PersistentLayer for ImageLayer {
|
||||
Some(self.path())
|
||||
}
|
||||
|
||||
fn iter(&self, _ctx: &RequestContext) -> Result<LayerIter<'_>> {
|
||||
unimplemented!();
|
||||
}
|
||||
|
||||
fn delete_resident_layer_file(&self) -> Result<()> {
|
||||
// delete underlying file
|
||||
fs::remove_file(self.path())?;
|
||||
@@ -318,42 +317,83 @@ impl ImageLayer {
|
||||
/// Open the underlying file and read the metadata into memory, if it's
|
||||
/// not loaded already.
|
||||
///
|
||||
async fn load(
|
||||
fn load(
|
||||
&self,
|
||||
access_kind: LayerAccessKind,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<&ImageLayerInner> {
|
||||
) -> Result<RwLockReadGuard<ImageLayerInner>> {
|
||||
self.access_stats
|
||||
.record_access(access_kind, ctx.task_kind());
|
||||
self.inner
|
||||
.get_or_try_init(|| self.load_inner())
|
||||
.await
|
||||
.with_context(|| format!("Failed to load image layer {}", self.path().display()))
|
||||
loop {
|
||||
// Quick exit if already loaded
|
||||
let inner = self.inner.read().unwrap();
|
||||
if inner.loaded {
|
||||
return Ok(inner);
|
||||
}
|
||||
|
||||
// Need to open the file and load the metadata. Upgrade our lock to
|
||||
// a write lock. (Or rather, release and re-lock in write mode.)
|
||||
drop(inner);
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
if !inner.loaded {
|
||||
self.load_inner(&mut inner).with_context(|| {
|
||||
format!("Failed to load image layer {}", self.path().display())
|
||||
})?
|
||||
} else {
|
||||
// Another thread loaded it while we were not holding the lock.
|
||||
}
|
||||
|
||||
// We now have the file open and loaded. There's no function to do
|
||||
// that in the std library RwLock, so we have to release and re-lock
|
||||
// in read mode. (To be precise, the lock guard was moved in the
|
||||
// above call to `load_inner`, so it's already been released). And
|
||||
// while we do that, another thread could unload again, so we have
|
||||
// to re-check and retry if that happens.
|
||||
drop(inner);
|
||||
}
|
||||
}
|
||||
|
||||
async fn load_inner(&self) -> Result<ImageLayerInner> {
|
||||
fn load_inner(&self, inner: &mut ImageLayerInner) -> Result<()> {
|
||||
let path = self.path();
|
||||
|
||||
let expected_summary = match &self.path_or_conf {
|
||||
PathOrConf::Conf(_) => Some(Summary::from(self)),
|
||||
PathOrConf::Path(_) => None,
|
||||
};
|
||||
// Open the file if it's not open already.
|
||||
if inner.file.is_none() {
|
||||
let file = VirtualFile::open(&path)
|
||||
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
|
||||
inner.file = Some(FileBlockReader::new(file));
|
||||
}
|
||||
let file = inner.file.as_mut().unwrap();
|
||||
let summary_blk = file.read_blk(0)?;
|
||||
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
|
||||
|
||||
let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), expected_summary)?;
|
||||
match &self.path_or_conf {
|
||||
PathOrConf::Conf(_) => {
|
||||
let mut expected_summary = Summary::from(self);
|
||||
expected_summary.index_start_blk = actual_summary.index_start_blk;
|
||||
expected_summary.index_root_blk = actual_summary.index_root_blk;
|
||||
|
||||
if let PathOrConf::Path(ref path) = self.path_or_conf {
|
||||
// not production code
|
||||
let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
|
||||
let expected_filename = self.filename().file_name();
|
||||
if actual_summary != expected_summary {
|
||||
bail!("in-file summary does not match expected summary. actual = {:?} expected = {:?}", actual_summary, expected_summary);
|
||||
}
|
||||
}
|
||||
PathOrConf::Path(path) => {
|
||||
let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
|
||||
let expected_filename = self.filename().file_name();
|
||||
|
||||
if actual_filename != expected_filename {
|
||||
println!("warning: filename does not match what is expected from in-file summary");
|
||||
println!("actual: {:?}", actual_filename);
|
||||
println!("expected: {:?}", expected_filename);
|
||||
if actual_filename != expected_filename {
|
||||
println!(
|
||||
"warning: filename does not match what is expected from in-file summary"
|
||||
);
|
||||
println!("actual: {:?}", actual_filename);
|
||||
println!("expected: {:?}", expected_filename);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(loaded)
|
||||
inner.index_start_blk = actual_summary.index_start_blk;
|
||||
inner.index_root_blk = actual_summary.index_root_blk;
|
||||
inner.loaded = true;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Create an ImageLayer struct representing an existing file on disk
|
||||
@@ -377,7 +417,12 @@ impl ImageLayer {
|
||||
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
|
||||
lsn: filename.lsn,
|
||||
access_stats,
|
||||
inner: OnceCell::new(),
|
||||
inner: RwLock::new(ImageLayerInner {
|
||||
loaded: false,
|
||||
file: None,
|
||||
index_start_blk: 0,
|
||||
index_root_blk: 0,
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -404,7 +449,12 @@ impl ImageLayer {
|
||||
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
|
||||
lsn: summary.lsn,
|
||||
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
|
||||
inner: OnceCell::new(),
|
||||
inner: RwLock::new(ImageLayerInner {
|
||||
file: None,
|
||||
loaded: false,
|
||||
index_start_blk: 0,
|
||||
index_root_blk: 0,
|
||||
}),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -423,65 +473,6 @@ impl ImageLayer {
|
||||
}
|
||||
}
|
||||
|
||||
impl ImageLayerInner {
|
||||
pub(super) fn load(
|
||||
path: &std::path::Path,
|
||||
lsn: Lsn,
|
||||
summary: Option<Summary>,
|
||||
) -> anyhow::Result<Self> {
|
||||
let file = VirtualFile::open(path)
|
||||
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
|
||||
let file = FileBlockReader::new(file);
|
||||
let summary_blk = file.read_blk(0)?;
|
||||
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
|
||||
|
||||
if let Some(mut expected_summary) = summary {
|
||||
// production code path
|
||||
expected_summary.index_start_blk = actual_summary.index_start_blk;
|
||||
expected_summary.index_root_blk = actual_summary.index_root_blk;
|
||||
|
||||
if actual_summary != expected_summary {
|
||||
bail!(
|
||||
"in-file summary does not match expected summary. actual = {:?} expected = {:?}",
|
||||
actual_summary,
|
||||
expected_summary
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(ImageLayerInner {
|
||||
index_start_blk: actual_summary.index_start_blk,
|
||||
index_root_blk: actual_summary.index_root_blk,
|
||||
lsn,
|
||||
file,
|
||||
})
|
||||
}
|
||||
|
||||
pub(super) async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
let file = &self.file;
|
||||
let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file);
|
||||
|
||||
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
|
||||
key.write_to_byte_slice(&mut keybuf);
|
||||
if let Some(offset) = tree_reader.get(&keybuf).await? {
|
||||
let blob = file
|
||||
.block_cursor()
|
||||
.read_blob(offset)
|
||||
.with_context(|| format!("failed to read value from offset {}", offset))?;
|
||||
let value = Bytes::from(blob);
|
||||
|
||||
reconstruct_state.img = Some((self.lsn, value));
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
} else {
|
||||
Ok(ValueReconstructResult::Missing)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A builder object for constructing a new image layer.
|
||||
///
|
||||
/// Usage:
|
||||
@@ -624,7 +615,12 @@ impl ImageLayerWriterInner {
|
||||
desc,
|
||||
lsn: self.lsn,
|
||||
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
|
||||
inner: OnceCell::new(),
|
||||
inner: RwLock::new(ImageLayerInner {
|
||||
loaded: false,
|
||||
file: None,
|
||||
index_start_blk,
|
||||
index_root_blk,
|
||||
}),
|
||||
};
|
||||
|
||||
// fsync the file
|
||||
@@ -664,7 +660,7 @@ impl ImageLayerWriterInner {
|
||||
///
|
||||
/// # Note
|
||||
///
|
||||
/// As described in <https://github.com/neondatabase/neon/issues/2650>, it's
|
||||
/// As described in https://github.com/neondatabase/neon/issues/2650, it's
|
||||
/// possible for the writer to drop before `finish` is actually called. So this
|
||||
/// could lead to odd temporary files in the directory, exhausting file system.
|
||||
/// This structure wraps `ImageLayerWriterInner` and also contains `Drop`
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::blob_io::{BlobCursor, BlobWriter};
|
||||
use crate::tenant::block_io::BlockReader;
|
||||
use crate::tenant::ephemeral_file::EphemeralFile;
|
||||
use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
|
||||
@@ -110,7 +110,6 @@ impl InMemoryLayer {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for InMemoryLayer {
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
Key::MIN..Key::MAX
|
||||
@@ -133,7 +132,7 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
async fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
||||
fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
let end_str = inner
|
||||
@@ -151,7 +150,7 @@ impl Layer for InMemoryLayer {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let cursor = inner.file.block_cursor();
|
||||
let mut cursor = inner.file.block_cursor();
|
||||
let mut buf = Vec::new();
|
||||
for (key, vec_map) in inner.index.iter() {
|
||||
for (lsn, pos) in vec_map.as_slice() {
|
||||
@@ -184,7 +183,7 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
|
||||
/// Look up given value in the layer.
|
||||
async fn get_value_reconstruct_data(
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
@@ -196,7 +195,7 @@ impl Layer for InMemoryLayer {
|
||||
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
let reader = inner.file.block_cursor();
|
||||
let mut reader = inner.file.block_cursor();
|
||||
|
||||
// Scan the page versions backwards, starting from `lsn`.
|
||||
if let Some(vec_map) = inner.index.get(&key) {
|
||||
@@ -354,7 +353,7 @@ impl InMemoryLayer {
|
||||
|
||||
let mut buf = Vec::new();
|
||||
|
||||
let cursor = inner.file.block_cursor();
|
||||
let mut cursor = inner.file.block_cursor();
|
||||
|
||||
let mut keys: Vec<(&Key, &VecMap<Lsn, u64>)> = inner.index.iter().collect();
|
||||
keys.sort_by_key(|k| k.0);
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user