Compare commits

..

1 Commits

Author SHA1 Message Date
Konstantin Knizhnik
8e3f42e0ba Add multixact test reproducing the problem with duplicates caused by incorrect offset calculation 2023-07-21 22:40:47 +03:00
179 changed files with 4966 additions and 11361 deletions

View File

@@ -21,5 +21,4 @@
!workspace_hack/
!neon_local/
!scripts/ninstall.sh
!scripts/combine_control_files.py
!vm-cgconfig.conf

View File

@@ -1,20 +1,7 @@
name: 'Create Allure report'
description: 'Generate Allure report from uploaded by actions/allure-report-store tests results'
inputs:
store-test-results-into-db:
description: 'Whether to store test results into the database. TEST_RESULT_CONNSTR/TEST_RESULT_CONNSTR_NEW should be set'
type: boolean
required: false
default: false
outputs:
base-url:
description: 'Base URL for Allure report'
value: ${{ steps.generate-report.outputs.base-url }}
base-s3-url:
description: 'Base S3 URL for Allure report'
value: ${{ steps.generate-report.outputs.base-s3-url }}
report-url:
description: 'Allure report URL'
value: ${{ steps.generate-report.outputs.report-url }}
@@ -76,8 +63,8 @@ runs:
rm -f ${ALLURE_ZIP}
fi
env:
ALLURE_VERSION: 2.23.1
ALLURE_ZIP_SHA256: 11141bfe727504b3fd80c0f9801eb317407fd0ac983ebb57e671f14bac4bcd86
ALLURE_VERSION: 2.22.1
ALLURE_ZIP_SHA256: fdc7a62d94b14c5e0bf25198ae1feded6b005fdbed864b4d3cb4e5e901720b0b
# Potentially we could have several running builds for the same key (for example, for the main branch), so we use an improvised lock for this
- name: Acquire lock
@@ -115,11 +102,6 @@ runs:
REPORT_PREFIX=reports/${BRANCH_OR_PR}
RAW_PREFIX=reports-raw/${BRANCH_OR_PR}/${GITHUB_RUN_ID}
BASE_URL=https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}
BASE_S3_URL=s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}
REPORT_URL=${BASE_URL}/index.html
REPORT_JSON_URL=${BASE_URL}/data/suites.json
# Get previously uploaded data for this run
ZSTD_NBTHREADS=0
@@ -128,9 +110,10 @@ runs:
# There's no previously uploaded data for this $GITHUB_RUN_ID
exit 0
fi
for S3_FILEPATH in ${S3_FILEPATHS}; do
time aws s3 cp --only-show-errors "s3://${BUCKET}/${S3_FILEPATH}" "${WORKDIR}"
time aws s3 cp --recursive --only-show-errors "s3://${BUCKET}/${RAW_PREFIX}/" "${WORKDIR}/"
for archive in $(find ${WORKDIR} -name "*.tar.zst"); do
archive=${WORKDIR}/$(basename $S3_FILEPATH)
mkdir -p ${archive%.tar.zst}
time tar -xf ${archive} -C ${archive%.tar.zst}
rm -f ${archive}
@@ -147,10 +130,9 @@ runs:
# Upload the history and the final report (in this particular order, so as not to have duplicated history in 2 places)
time aws s3 mv --recursive --only-show-errors "${WORKDIR}/report/history" "s3://${BUCKET}/${REPORT_PREFIX}/latest/history"
time aws s3 mv --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"
# Use aws s3 cp (instead of aws s3 sync) to keep files from previous runs to make old URLs work,
# and to keep files on the host to upload them to the database
time aws s3 cp --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"
REPORT_URL=https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html
# Generate redirect
cat <<EOF > ${WORKDIR}/index.html
@@ -162,10 +144,8 @@ runs:
EOF
time aws s3 cp --only-show-errors ${WORKDIR}/index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html"
echo "base-url=${BASE_URL}" >> $GITHUB_OUTPUT
echo "base-s3-url=${BASE_S3_URL}" >> $GITHUB_OUTPUT
echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
echo "report-json-url=${REPORT_JSON_URL}" >> $GITHUB_OUTPUT
echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
echo "report-json-url=${REPORT_URL%/index.html}/data/suites.json" >> $GITHUB_OUTPUT
echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY}
@@ -179,41 +159,6 @@ runs:
aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
fi
- name: Store Allure test stat in the DB
if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}
shell: bash -euxo pipefail {0}
env:
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
REPORT_JSON_URL: ${{ steps.generate-report.outputs.report-json-url }}
run: |
export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR}
./scripts/pysync
poetry run python3 scripts/ingest_regress_test_result.py \
--revision ${COMMIT_SHA} \
--reference ${GITHUB_REF} \
--build-type unified \
--ingest ${WORKDIR}/report/data/suites.json
- name: Store Allure test stat in the DB (new)
if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}
shell: bash -euxo pipefail {0}
env:
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
BASE_S3_URL: ${{ steps.generate-report.outputs.base-s3-url }}
run: |
export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR_NEW}
./scripts/pysync
poetry run python3 scripts/ingest_regress_test_result-new-format.py \
--reference ${GITHUB_REF} \
--revision ${COMMIT_SHA} \
--run-id ${GITHUB_RUN_ID} \
--run-attempt ${GITHUB_RUN_ATTEMPT} \
--test-cases-dir ${WORKDIR}/report/data/test-cases
- name: Cleanup
if: always()
shell: bash -euxo pipefail {0}

View File

@@ -31,7 +31,7 @@ runs:
BUCKET=neon-github-public-dev
FILENAME=$(basename $ARCHIVE)
S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX%$GITHUB_RUN_ATTEMPT} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX%$GITHUB_RUN_ATTEMPT} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
if [ -z "${S3_KEY}" ]; then
if [ "${SKIP_IF_DOES_NOT_EXIST}" = "true" ]; then
echo 'SKIPPED=true' >> $GITHUB_OUTPUT

View File

@@ -209,4 +209,4 @@ runs:
uses: ./.github/actions/allure-report-store
with:
report-dir: /tmp/test_output/allure/results
unique-key: ${{ inputs.build_type }}-${{ inputs.pg_version }}
unique-key: ${{ inputs.build_type }}

View File

@@ -432,11 +432,6 @@ jobs:
if: ${{ !cancelled() }}
id: create-allure-report
uses: ./.github/actions/allure-report-generate
with:
store-test-results-into-db: true
env:
REGRESS_TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
- uses: actions/github-script@v6
if: ${{ !cancelled() }}
@@ -457,6 +452,25 @@ jobs:
report,
})
- name: Store Allure test stat in the DB
if: ${{ !cancelled() && steps.create-allure-report.outputs.report-json-url }}
env:
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
REPORT_JSON_URL: ${{ steps.create-allure-report.outputs.report-json-url }}
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
run: |
./scripts/pysync
curl --fail --output suites.json "${REPORT_JSON_URL}"
export BUILD_TYPE=unified
export DATABASE_URL="$TEST_RESULT_CONNSTR"
poetry run python3 scripts/ingest_regress_test_result.py \
--revision ${COMMIT_SHA} \
--reference ${GITHUB_REF} \
--build-type ${BUILD_TYPE} \
--ingest suites.json
coverage-report:
runs-on: [ self-hosted, gen3, small ]
container:
@@ -780,7 +794,7 @@ jobs:
run:
shell: sh -eu {0}
env:
VM_BUILDER_VERSION: v0.16.2
VM_BUILDER_VERSION: v0.13.1
steps:
- name: Checkout
@@ -801,12 +815,7 @@ jobs:
- name: Build vm image
run: |
./vm-builder \
-enable-file-cache \
-enable-monitor \
-enable-informant \
-src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
-dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
./vm-builder -enable-file-cache -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
- name: Pushing vm-compute-node image
run: |
@@ -946,15 +955,22 @@ jobs:
version: [ v14, v15 ]
env:
EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
# During the transition period we extract public extensions from the compute-node image and custom extensions from the extensions image.
# Later, all the extensions will be moved to the extensions image.
EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:latest
COMPUTE_NODE_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:latest
AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }}
S3_BUCKETS: ${{ github.ref_name == 'release' && vars.S3_EXTENSIONS_BUCKETS_PROD || vars.S3_EXTENSIONS_BUCKETS_DEV }}
S3_BUCKETS: |
${{ github.ref_name == 'release' &&
'neon-prod-extensions-ap-southeast-1 neon-prod-extensions-eu-central-1 neon-prod-extensions-us-east-1 neon-prod-extensions-us-east-2 neon-prod-extensions-us-west-2' ||
'neon-dev-extensions-eu-central-1 neon-dev-extensions-eu-west-1 neon-dev-extensions-us-east-2' }}
steps:
- name: Pull postgres-extensions image
run: |
docker pull ${EXTENSIONS_IMAGE}
docker pull ${COMPUTE_NODE_IMAGE}
- name: Create postgres-extensions container
id: create-container
@@ -962,23 +978,46 @@ jobs:
EID=$(docker create ${EXTENSIONS_IMAGE} true)
echo "EID=${EID}" >> $GITHUB_OUTPUT
CID=$(docker create ${COMPUTE_NODE_IMAGE} true)
echo "CID=${CID}" >> $GITHUB_OUTPUT
- name: Extract postgres-extensions from container
run: |
rm -rf ./extensions-to-upload # Just in case
mkdir -p extensions-to-upload
rm -rf ./extensions-to-upload ./custom-extensions # Just in case
docker cp ${{ steps.create-container.outputs.EID }}:/extensions/ ./extensions-to-upload/
docker cp ${{ steps.create-container.outputs.EID }}:/ext_index.json ./extensions-to-upload/
# In compute image we have a bit different directory layout
mkdir -p extensions-to-upload/share
docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/share/extension ./extensions-to-upload/share/extension
docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/lib ./extensions-to-upload/lib
# Delete Neon extensions (they are always present on the compute-node image)
rm -rf ./extensions-to-upload/share/extension/neon*
rm -rf ./extensions-to-upload/lib/neon*
# Delete leftovers from the extension build step
rm -rf ./extensions-to-upload/lib/pgxs
rm -rf ./extensions-to-upload/lib/pkgconfig
docker cp ${{ steps.create-container.outputs.EID }}:/extensions ./custom-extensions
for EXT_NAME in $(ls ./custom-extensions); do
mkdir -p ./extensions-to-upload/${EXT_NAME}/share
mv ./custom-extensions/${EXT_NAME}/share/extension ./extensions-to-upload/${EXT_NAME}/share/extension
mv ./custom-extensions/${EXT_NAME}/lib ./extensions-to-upload/${EXT_NAME}/lib
done
- name: Upload postgres-extensions to S3
# TODO: Reenable step after switching to the new extensions format (tar-gzipped + index.json)
if: false
run: |
for BUCKET in $(echo ${S3_BUCKETS:-[]} | jq --raw-output '.[]'); do
for BUCKET in $(echo ${S3_BUCKETS}); do
aws s3 cp --recursive --only-show-errors ./extensions-to-upload s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }}
done
- name: Cleanup
if: ${{ always() && steps.create-container.outputs.EID }}
if: ${{ always() && (steps.create-container.outputs.CID || steps.create-container.outputs.EID) }}
run: |
docker rm ${{ steps.create-container.outputs.CID }} || true
docker rm ${{ steps.create-container.outputs.EID }} || true
deploy:
@@ -1058,7 +1097,7 @@ jobs:
OLD_PREFIX=artifacts/${GITHUB_RUN_ID}
FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst
S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
if [ -z "${S3_KEY}" ]; then
echo >&2 "Neither s3://${BUCKET}/${OLD_PREFIX}/${FILENAME} nor its version from previous attempts exist"
exit 1

118
Cargo.lock generated
View File

@@ -639,12 +639,6 @@ dependencies = [
"vsimd",
]
[[package]]
name = "base64ct"
version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"
[[package]]
name = "bincode"
version = "1.3.3"
@@ -746,9 +740,6 @@ name = "cc"
version = "1.0.79"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
dependencies = [
"jobserver",
]
[[package]]
name = "cexpr"
@@ -892,8 +883,6 @@ version = "0.1.0"
dependencies = [
"anyhow",
"chrono",
"regex",
"remote_storage",
"serde",
"serde_json",
"serde_with",
@@ -918,14 +907,12 @@ dependencies = [
"opentelemetry",
"postgres",
"regex",
"remote_storage",
"reqwest",
"serde",
"serde_json",
"tar",
"tokio",
"tokio-postgres",
"toml_edit",
"tracing",
"tracing-opentelemetry",
"tracing-subscriber",
@@ -933,7 +920,6 @@ dependencies = [
"url",
"utils",
"workspace_hack",
"zstd",
]
[[package]]
@@ -994,7 +980,6 @@ dependencies = [
"tar",
"thiserror",
"toml",
"tracing",
"url",
"utils",
"workspace_hack",
@@ -1018,9 +1003,9 @@ checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa"
[[package]]
name = "cpufeatures"
version = "0.2.9"
version = "0.2.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a17b76ff3a4162b0b27f354a0c87015ddad39d35f9c0c36607a3bdd175dde1f1"
checksum = "3e4c1eaa2012c47becbbad2ab175484c2a84d1185b566fb2cc5b8707343dfe58"
dependencies = [
"libc",
]
@@ -1200,15 +1185,15 @@ dependencies = [
[[package]]
name = "dashmap"
version = "5.5.0"
version = "5.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6943ae99c34386c84a470c499d3414f66502a41340aa895406e0d2e4a207b91d"
checksum = "907076dfda823b0b36d2a1bb5f90c96660a5bbcd7729e10727f07858f22c4edc"
dependencies = [
"cfg-if",
"hashbrown 0.14.0",
"hashbrown 0.12.3",
"lock_api",
"once_cell",
"parking_lot_core 0.9.8",
"parking_lot_core 0.9.7",
]
[[package]]
@@ -1657,12 +1642,6 @@ dependencies = [
"ahash",
]
[[package]]
name = "hashbrown"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a"
[[package]]
name = "hashlink"
version = "0.8.2"
@@ -1993,15 +1972,6 @@ version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6"
[[package]]
name = "jobserver"
version = "0.1.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2"
dependencies = [
"libc",
]
[[package]]
name = "js-sys"
version = "0.3.63"
@@ -2087,9 +2057,9 @@ checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"
[[package]]
name = "lock_api"
version = "0.4.10"
version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16"
checksum = "435011366fe56583b16cf956f9df0095b405b82d76425bc8981c0e22e60ec4df"
dependencies = [
"autocfg",
"scopeguard",
@@ -2353,9 +2323,9 @@ dependencies = [
[[package]]
name = "once_cell"
version = "1.18.0"
version = "1.17.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d"
checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
[[package]]
name = "oorandom"
@@ -2536,7 +2506,6 @@ dependencies = [
"pageserver",
"postgres_ffi",
"svg_fmt",
"tokio",
"utils",
"workspace_hack",
]
@@ -2575,7 +2544,6 @@ dependencies = [
"metrics",
"nix",
"num-traits",
"num_cpus",
"once_cell",
"pageserver_api",
"pin-project-lite",
@@ -2654,7 +2622,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f"
dependencies = [
"lock_api",
"parking_lot_core 0.9.8",
"parking_lot_core 0.9.7",
]
[[package]]
@@ -2673,26 +2641,15 @@ dependencies = [
[[package]]
name = "parking_lot_core"
version = "0.9.8"
version = "0.9.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447"
checksum = "9069cbb9f99e3a5083476ccb29ceb1de18b9118cafa53e90c9551235de2b9521"
dependencies = [
"cfg-if",
"libc",
"redox_syscall 0.3.5",
"redox_syscall 0.2.16",
"smallvec",
"windows-targets 0.48.0",
]
[[package]]
name = "password-hash"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "346f04948ba92c43e8469c1ee6736c7563d71012b17d40745260fe106aac2166"
dependencies = [
"base64ct",
"rand_core",
"subtle",
"windows-sys 0.45.0",
]
[[package]]
@@ -2703,8 +2660,6 @@ checksum = "f0ca0b5a68607598bf3bad68f32227a8164f6254833f84eafaac409cd6746c31"
dependencies = [
"digest",
"hmac",
"password-hash",
"sha2",
]
[[package]]
@@ -2825,7 +2780,7 @@ dependencies = [
[[package]]
name = "postgres"
version = "0.19.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
dependencies = [
"bytes",
"fallible-iterator",
@@ -2838,7 +2793,7 @@ dependencies = [
[[package]]
name = "postgres-native-tls"
version = "0.5.0"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
dependencies = [
"native-tls",
"tokio",
@@ -2849,7 +2804,7 @@ dependencies = [
[[package]]
name = "postgres-protocol"
version = "0.6.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
dependencies = [
"base64 0.20.0",
"byteorder",
@@ -2867,7 +2822,7 @@ dependencies = [
[[package]]
name = "postgres-types"
version = "0.2.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
dependencies = [
"bytes",
"fallible-iterator",
@@ -3083,7 +3038,6 @@ dependencies = [
"chrono",
"clap",
"consumption_metrics",
"dashmap",
"futures",
"git-version",
"hashbrown 0.13.2",
@@ -3281,7 +3235,6 @@ dependencies = [
"metrics",
"once_cell",
"pin-project-lite",
"scopeguard",
"serde",
"serde_json",
"tempfile",
@@ -4359,7 +4312,7 @@ dependencies = [
[[package]]
name = "tokio-postgres"
version = "0.7.7"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
dependencies = [
"async-trait",
"byteorder",
@@ -5341,7 +5294,6 @@ version = "0.1.0"
dependencies = [
"anyhow",
"bytes",
"cc",
"chrono",
"clap",
"clap_builder",
@@ -5442,33 +5394,3 @@ name = "zeroize"
version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9"
[[package]]
name = "zstd"
version = "0.12.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c"
dependencies = [
"zstd-safe",
]
[[package]]
name = "zstd-safe"
version = "6.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581"
dependencies = [
"libc",
"zstd-sys",
]
[[package]]
name = "zstd-sys"
version = "2.0.8+zstd.1.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c"
dependencies = [
"cc",
"libc",
"pkg-config",
]

View File

@@ -54,7 +54,6 @@ comfy-table = "6.1"
const_format = "0.2"
crc32c = "0.6"
crossbeam-utils = "0.8.5"
dashmap = "5.5.0"
either = "1.8"
enum-map = "2.4.2"
enumset = "1.0.12"
@@ -89,7 +88,7 @@ opentelemetry = "0.19.0"
opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions = "0.11.0"
parking_lot = "0.12"
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
pbkdf2 = "0.12.1"
pin-project-lite = "0.2"
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
prost = "0.11"
@@ -145,11 +144,11 @@ env_logger = "0.10"
log = "0.4"
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
## Other git libraries
heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
@@ -184,7 +183,7 @@ tonic-build = "0.9"
# This is only needed for proxy's tests.
# TODO: we should probably fork `tokio-postgres-rustls` instead.
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
################# Binary contents sections

View File

@@ -51,7 +51,6 @@ RUN set -e \
--bin safekeeper \
--bin storage_broker \
--bin proxy \
--bin neon_local \
--locked --release \
&& cachepot -s
@@ -77,7 +76,6 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl
COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin
COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/

View File

@@ -13,7 +13,7 @@ FROM debian:bullseye-slim AS build-deps
RUN apt update && \
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \
libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd
libicu-dev libxslt1-dev liblz4-dev libzstd-dev
#########################################################################################
#
@@ -77,7 +77,6 @@ ENV PATH "/usr/local/pgsql/bin:$PATH"
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postgis.tar.gz && \
echo "9a2a219da005a1730a39d1959a1c7cec619b1efb009b65be80ffc25bad299068 postgis.tar.gz" | sha256sum --check && \
mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
./autogen.sh && \
./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -90,28 +89,17 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postg
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control && \
mkdir -p /extensions/postgis && \
cp /usr/local/pgsql/share/extension/postgis.control /extensions/postgis && \
cp /usr/local/pgsql/share/extension/postgis_raster.control /extensions/postgis && \
cp /usr/local/pgsql/share/extension/postgis_sfcgal.control /extensions/postgis && \
cp /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control /extensions/postgis && \
cp /usr/local/pgsql/share/extension/postgis_topology.control /extensions/postgis && \
cp /usr/local/pgsql/share/extension/address_standardizer.control /extensions/postgis && \
cp /usr/local/pgsql/share/extension/address_standardizer_data_us.control /extensions/postgis
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control
RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \
echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \
mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \
mkdir build && cd build && \
mkdir build && \
cd build && \
cmake -DCMAKE_BUILD_TYPE=Release .. && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
cp /usr/local/pgsql/share/extension/pgrouting.control /extensions/postgis && \
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/postgis.tar.zst -T -
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control
#########################################################################################
#
@@ -431,16 +419,12 @@ RUN apt-get update && \
wget https://github.com/ketteq-neon/postgres-exts/archive/e0bd1a9d9313d7120c1b9c7bb15c48c0dede4c4e.tar.gz -O kq_imcx.tar.gz && \
echo "dc93a97ff32d152d32737ba7e196d9687041cda15e58ab31344c2f2de8855336 kq_imcx.tar.gz" | sha256sum --check && \
mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
mkdir build && cd build && \
mkdir build && \
cd build && \
cmake -DCMAKE_BUILD_TYPE=Release .. && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
mkdir -p /extensions/kq_imcx && cp /usr/local/pgsql/share/extension/kq_imcx.control /extensions/kq_imcx && \
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/kq_imcx.tar.zst -T -
echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control
#########################################################################################
#
@@ -551,8 +535,10 @@ FROM build-deps AS pg-embedding-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.5.tar.gz -O pg_embedding.tar.gz && \
echo "0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 pg_embedding.tar.gz" | sha256sum --check && \
# eeb3ba7c3a60c95b2604dd543c64b2f1bb4a3703 made on 15/07/2023
# There is no release tag yet
RUN wget https://github.com/neondatabase/pg_embedding/archive/eeb3ba7c3a60c95b2604dd543c64b2f1bb4a3703.tar.gz -O pg_embedding.tar.gz && \
echo "030846df723652f99a8689ce63b66fa0c23477a7fd723533ab8a6b28ab70730f pg_embedding.tar.gz" | sha256sum --check && \
mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -567,17 +553,16 @@ RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.5.ta
FROM build-deps AS pg-anon-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# Kaniko doesn't allow us to use `${from#/usr/local/pgsql/}`, so we use `${from:17}` instead (17 is the length of the `/usr/local/pgsql/` prefix)
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgresql_anonymizer-1.1.0.tar.gz -O pg_anon.tar.gz && \
echo "08b09d2ff9b962f96c60db7e6f8e79cf7253eb8772516998fc35ece08633d3ad pg_anon.tar.gz" | sha256sum --check && \
mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
find /usr/local/pgsql -type f | sort > /before.txt && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
mkdir -p /extensions/anon && cp /usr/local/pgsql/share/extension/anon.control /extensions/anon && \
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/anon.tar.zst -T -
find /usr/local/pgsql -type f | sort > /after.txt && \
/bin/bash -c 'for from in $(comm -13 /before.txt /after.txt); do to=/extensions/anon/${from:17} && mkdir -p $(dirname ${to}) && cp -a ${from} ${to}; done'
#########################################################################################
#
@@ -769,23 +754,16 @@ RUN rm /usr/local/pgsql/lib/lib*.a
# Extension only
#
#########################################################################################
FROM python:3.9-slim-bullseye AS generate-ext-index
ARG PG_VERSION
ARG BUILD_TAG
RUN apt update && apt install -y zstd
# copy the control files here
COPY --from=kq-imcx-pg-build /extensions/ /extensions/
COPY --from=pg-anon-pg-build /extensions/ /extensions/
COPY --from=postgis-build /extensions/ /extensions/
COPY scripts/combine_control_files.py ./combine_control_files.py
RUN python3 ./combine_control_files.py ${PG_VERSION} ${BUILD_TAG} --public_extensions="anon,postgis"
FROM scratch AS postgres-extensions
# After the transition this layer will include all extensions.
# As for now, it's only a couple for testing purposes
COPY --from=generate-ext-index /extensions/*.tar.zst /extensions/
COPY --from=generate-ext-index /ext_index.json /ext_index.json
# As for now, it's only for new custom ones
#
# # Default extensions
# COPY --from=postgres-cleanup-layer /usr/local/pgsql/share/extension /usr/local/pgsql/share/extension
# COPY --from=postgres-cleanup-layer /usr/local/pgsql/lib /usr/local/pgsql/lib
# Custom extensions
COPY --from=pg-anon-pg-build /extensions/anon/lib/ /extensions/anon/lib
COPY --from=pg-anon-pg-build /extensions/anon/share/extension /extensions/anon/share/extension
#########################################################################################
#
@@ -816,7 +794,6 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
# libxml2, libxslt1.1 for xml2
# libzstd1 for zstd
# libboost*, libfreetype6, and zlib1g for rdkit
# ca-certificates for communicating with s3 by compute_ctl
RUN apt update && \
apt install --no-install-recommends -y \
gdb \
@@ -840,8 +817,7 @@ RUN apt update && \
libcurl4-openssl-dev \
locales \
procps \
zlib1g \
ca-certificates && \
zlib1g && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8

View File

@@ -108,8 +108,6 @@ postgres-%: postgres-configure-% \
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache install
+@echo "Compiling pageinspect $*"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install
+@echo "Compiling amcheck $*"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install
.PHONY: postgres-clean-%
postgres-clean-%:

View File

@@ -29,13 +29,13 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati
```bash
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \
libcurl4-openssl-dev openssl python-poetry
libcurl4-openssl-dev
```
* On Fedora, these packages are needed:
```bash
dnf install flex bison readline-devel zlib-devel openssl-devel \
libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
protobuf-devel libcurl-devel openssl poetry
protobuf-devel libcurl-devel
```
* On Arch based systems, these packages are needed:
```bash
@@ -235,13 +235,6 @@ CARGO_BUILD_FLAGS="--features=testing" make
./scripts/pytest
```
By default, this runs both debug and release modes, and all supported postgres versions. When
testing locally, it is convenient to run just one set of permutations, like this:
```sh
DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest
```
## Documentation
[docs](/docs) Contains a top-level overview of all available markdown documentation.

View File

@@ -32,6 +32,3 @@ url.workspace = true
compute_api.workspace = true
utils.workspace = true
workspace_hack.workspace = true
toml_edit.workspace = true
remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
zstd = "0.12.4"

View File

@@ -5,8 +5,6 @@
//! - `compute_ctl` accepts cluster (compute node) specification as a JSON file.
//! - Every start is a fresh start, so the data directory is removed and
//! initialized again on each run.
//! - If remote_extension_config is provided, it will be used to fetch extensions list
//! and download `shared_preload_libraries` from the remote storage.
//! - Next it will put configuration files into the `PGDATA` directory.
//! - Sync safekeepers and get commit LSN.
//! - Get `basebackup` from pageserver using the returned on the previous step LSN.
@@ -29,8 +27,7 @@
//! compute_ctl -D /var/db/postgres/compute \
//! -C 'postgresql://cloud_admin@localhost/postgres' \
//! -S /var/db/postgres/specs/current.json \
//! -b /usr/local/bin/postgres \
//! -r {"bucket": "neon-dev-extensions-eu-central-1", "region": "eu-central-1"}
//! -b /usr/local/bin/postgres
//! ```
//!
use std::collections::HashMap;
@@ -38,7 +35,7 @@ use std::fs::File;
use std::panic;
use std::path::Path;
use std::process::exit;
use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock};
use std::sync::{mpsc, Arc, Condvar, Mutex};
use std::{thread, time::Duration};
use anyhow::{Context, Result};
@@ -51,33 +48,22 @@ use compute_api::responses::ComputeStatus;
use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
use compute_tools::configurator::launch_configurator;
use compute_tools::extension_server::{get_pg_version, init_remote_storage};
use compute_tools::http::api::launch_http_server;
use compute_tools::logger::*;
use compute_tools::monitor::launch_monitor;
use compute_tools::params::*;
use compute_tools::spec::*;
// this is an arbitrary build tag. Fine as a default / for testing purposes
// in-case of not-set environment var
const BUILD_TAG_DEFAULT: &str = "5670669815";
const BUILD_TAG_DEFAULT: &str = "local";
fn main() -> Result<()> {
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
let build_tag = option_env!("BUILD_TAG")
.unwrap_or(BUILD_TAG_DEFAULT)
.to_string();
let build_tag = option_env!("BUILD_TAG").unwrap_or(BUILD_TAG_DEFAULT);
info!("build_tag: {build_tag}");
let matches = cli().get_matches();
let pgbin_default = String::from("postgres");
let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);
let remote_ext_config = matches.get_one::<String>("remote-ext-config");
let ext_remote_storage = remote_ext_config.map(|x| {
init_remote_storage(x).expect("cannot initialize remote extension storage from config")
});
let http_port = *matches
.get_one::<u16>("http-port")
@@ -142,12 +128,14 @@ fn main() -> Result<()> {
let compute_id = matches.get_one::<String>("compute-id");
let control_plane_uri = matches.get_one::<String>("control-plane-uri");
// Try to use just 'postgres' if no path is provided
let pgbin = matches.get_one::<String>("pgbin").unwrap();
let spec;
let mut live_config_allowed = false;
match spec_json {
// First, try to get cluster spec from the cli argument
Some(json) => {
info!("got spec from cli argument {}", json);
spec = Some(serde_json::from_str(json)?);
}
None => {
@@ -180,10 +168,8 @@ fn main() -> Result<()> {
let mut new_state = ComputeState::new();
let spec_set;
if let Some(spec) = spec {
let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
info!("new pspec.spec: {:?}", pspec.spec);
new_state.pspec = Some(pspec);
spec_set = true;
} else {
@@ -193,35 +179,20 @@ fn main() -> Result<()> {
connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
pgdata: pgdata.to_string(),
pgbin: pgbin.to_string(),
pgversion: get_pg_version(pgbin),
live_config_allowed,
state: Mutex::new(new_state),
state_changed: Condvar::new(),
ext_remote_storage,
ext_download_progress: RwLock::new(HashMap::new()),
build_tag,
};
let compute = Arc::new(compute_node);
// If this is a pooled VM, prewarm before starting HTTP server and becoming
// available for binding. Prewarming helps postgres start quicker later,
// because QEMU will already have its memory allocated from the host, and
// the necessary binaries will already be cached.
if !spec_set {
compute.prewarm_postgres()?;
}
// Launch http service first, so we were able to serve control-plane
// requests, while configuration is still in progress.
let _http_handle =
launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");
let extension_server_port: u16 = http_port;
if !spec_set {
// No spec provided, hang waiting for it.
info!("no compute spec provided, waiting");
let mut state = compute.state.lock().unwrap();
while state.status != ComputeStatus::ConfigurationPending {
state = compute.state_changed.wait(state).unwrap();
@@ -258,7 +229,7 @@ fn main() -> Result<()> {
// Start Postgres
let mut delay_exit = false;
let mut exit_code = None;
let pg = match compute.start_compute(extension_server_port) {
let pg = match compute.start_compute() {
Ok(pg) => Some(pg),
Err(err) => {
error!("could not start the compute node: {:?}", err);
@@ -387,12 +358,6 @@ fn cli() -> clap::Command {
.long("control-plane-uri")
.value_name("CONTROL_PLANE_API_BASE_URI"),
)
.arg(
Arg::new("remote-ext-config")
.short('r')
.long("remote-ext-config")
.value_name("REMOTE_EXT_CONFIG"),
)
}
#[test]

View File

@@ -1,22 +1,16 @@
use std::collections::HashMap;
use std::fs;
use std::io::BufRead;
use std::os::unix::fs::PermissionsExt;
use std::path::Path;
use std::process::{Command, Stdio};
use std::str::FromStr;
use std::sync::{Condvar, Mutex, RwLock};
use std::time::Instant;
use std::sync::{Condvar, Mutex};
use anyhow::{Context, Result};
use chrono::{DateTime, Utc};
use futures::future::join_all;
use futures::stream::FuturesUnordered;
use futures::StreamExt;
use postgres::{Client, NoTls};
use tokio;
use tokio_postgres;
use tracing::{error, info, instrument, warn};
use tracing::{info, instrument, warn};
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
@@ -24,12 +18,9 @@ use compute_api::responses::{ComputeMetrics, ComputeStatus};
use compute_api::spec::{ComputeMode, ComputeSpec};
use utils::measured_stream::MeasuredReader;
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
use crate::config;
use crate::pg_helpers::*;
use crate::spec::*;
use crate::sync_sk::{check_if_synced, ping_safekeeper};
use crate::{config, extension_server};
/// Compute node info shared across several `compute_ctl` threads.
pub struct ComputeNode {
@@ -37,7 +28,6 @@ pub struct ComputeNode {
pub connstr: url::Url,
pub pgdata: String,
pub pgbin: String,
pub pgversion: String,
/// We should only allow live re- / configuration of the compute node if
/// it uses 'pull model', i.e. it can go to control-plane and fetch
/// the latest configuration. Otherwise, there could be a case:
@@ -57,19 +47,6 @@ pub struct ComputeNode {
pub state: Mutex<ComputeState>,
/// `Condvar` to allow notifying waiters about state changes.
pub state_changed: Condvar,
/// the S3 bucket that we search for extensions in
pub ext_remote_storage: Option<GenericRemoteStorage>,
// key: ext_archive_name, value: started download time, download_completed?
pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
pub build_tag: String,
}
// store some metrics about download size that might impact startup time
// Produced by `prepare_preload_libraries`; `start_compute` copies the three
// counters into the compute state's metrics.
#[derive(Clone, Debug)]
pub struct RemoteExtensionMetrics {
// number of download calls that returned `Ok` (this includes calls that
// were skipped as "already downloaded" and reported a size of 0)
num_ext_downloaded: u64,
// largest size reported by a single download call
largest_ext_size: u64,
// sum of the sizes reported by all download calls
total_ext_download_size: u64,
}
#[derive(Clone, Debug)]
@@ -109,7 +86,6 @@ pub struct ParsedSpec {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub pageserver_connstr: String,
pub safekeeper_connstrings: Vec<String>,
pub storage_auth_token: Option<String>,
}
@@ -127,21 +103,6 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
.clone()
.or_else(|| spec.cluster.settings.find("neon.pageserver_connstring"))
.ok_or("pageserver connstr should be provided")?;
let safekeeper_connstrings = if spec.safekeeper_connstrings.is_empty() {
if matches!(spec.mode, ComputeMode::Primary) {
spec.cluster
.settings
.find("neon.safekeepers")
.ok_or("safekeeper connstrings should be provided")?
.split(',')
.map(|str| str.to_string())
.collect()
} else {
vec![]
}
} else {
spec.safekeeper_connstrings.clone()
};
let storage_auth_token = spec.storage_auth_token.clone();
let tenant_id: TenantId = if let Some(tenant_id) = spec.tenant_id {
tenant_id
@@ -167,7 +128,6 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
Ok(ParsedSpec {
spec,
pageserver_connstr,
safekeeper_connstrings,
storage_auth_token,
tenant_id,
timeline_id,
@@ -280,7 +240,7 @@ impl ComputeNode {
#[instrument(skip_all, fields(%lsn))]
fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
let spec = compute_state.pspec.as_ref().expect("spec must be set");
let start_time = Instant::now();
let start_time = Utc::now();
let mut config = postgres::Config::from_str(&spec.pageserver_connstr)?;
@@ -293,10 +253,7 @@ impl ComputeNode {
info!("Storage auth token not set");
}
// Connect to pageserver
let mut client = config.connect(NoTls)?;
let pageserver_connect_micros = start_time.elapsed().as_micros() as u64;
let basebackup_cmd = match lsn {
// HACK We don't use compression on first start (Lsn(0)) because there's no API for it
Lsn(0) => format!("basebackup {} {}", spec.tenant_id, spec.timeline_id),
@@ -342,107 +299,14 @@ impl ComputeNode {
};
// Report metrics
let mut state = self.state.lock().unwrap();
state.metrics.pageserver_connect_micros = pageserver_connect_micros;
state.metrics.basebackup_bytes = measured_reader.get_byte_count() as u64;
state.metrics.basebackup_ms = start_time.elapsed().as_millis() as u64;
Ok(())
}
// Ping every safekeeper from the parsed spec concurrently and decide from a
// quorum of replies whether they are already synced.
//
// Returns whatever `check_if_synced` computes once a quorum of successful
// replies is collected, and `Ok(None)` when fewer than a quorum replied
// successfully. Must be called from within a tokio runtime because it uses
// `tokio::spawn`.
//
// Panics if the spec is not set, or if a safekeeper connection string cannot
// be parsed into a `tokio_postgres::Config`.
pub async fn check_safekeepers_synced_async(
&self,
compute_state: &ComputeState,
) -> Result<Option<Lsn>> {
// Construct a connection config for each safekeeper
let pspec: ParsedSpec = compute_state
.pspec
.as_ref()
.expect("spec must be set")
.clone();
let sk_connstrs: Vec<String> = pspec.safekeeper_connstrings.clone();
let sk_configs = sk_connstrs.into_iter().map(|connstr| {
// Format connstr
// The raw connection string doubles as the id passed to `ping_safekeeper`.
let id = connstr.clone();
let connstr = format!("postgresql://no_user@{}", connstr);
let options = format!(
"-c timeline_id={} tenant_id={}",
pspec.timeline_id, pspec.tenant_id
);
// Construct client
let mut config = tokio_postgres::Config::from_str(&connstr).unwrap();
config.options(&options);
// The storage auth token, when present, is sent as the password.
if let Some(storage_auth_token) = pspec.storage_auth_token.clone() {
config.password(storage_auth_token);
}
(id, config)
});
// Create task set to query all safekeepers
let mut tasks = FuturesUnordered::new();
// Strict majority of the configured safekeepers.
let quorum = sk_configs.len() / 2 + 1;
for (id, config) in sk_configs {
// Each ping gets at most 100ms before it counts as a timeout error.
let timeout = tokio::time::Duration::from_millis(100);
let task = tokio::time::timeout(timeout, ping_safekeeper(id, config));
tasks.push(tokio::spawn(task));
}
// Get a quorum of responses or errors
let mut responses = Vec::new();
let mut join_errors = Vec::new();
let mut task_errors = Vec::new();
let mut timeout_errors = Vec::new();
while let Some(response) = tasks.next().await {
// Nesting, outermost first: join result of the spawned task, result of
// the timeout wrapper, result of `ping_safekeeper` itself.
match response {
Ok(Ok(Ok(r))) => responses.push(r),
Ok(Ok(Err(e))) => task_errors.push(e),
Ok(Err(e)) => timeout_errors.push(e),
Err(e) => join_errors.push(e),
};
// Stop early once either outcome has reached quorum size.
if responses.len() >= quorum {
break;
}
if join_errors.len() + task_errors.len() + timeout_errors.len() >= quorum {
break;
}
}
// In case of error, log and fail the check, but don't crash.
// We're playing it safe because these errors could be transient
// and we don't yet retry. Also being careful here allows us to
// be backwards compatible with safekeepers that don't have the
// TIMELINE_STATUS API yet.
if responses.len() < quorum {
error!(
"failed sync safekeepers check {:?} {:?} {:?}",
join_errors, task_errors, timeout_errors
);
return Ok(None);
}
Ok(check_if_synced(responses))
}
// Fast path for sync_safekeepers. If they're already synced we get the lsn
// in one roundtrip. If not, we should do a full sync_safekeepers.
pub fn check_safekeepers_synced(&self, compute_state: &ComputeState) -> Result<Option<Lsn>> {
let start_time = Utc::now();
// Run actual work with new tokio runtime
let rt = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.expect("failed to create rt");
let result = rt.block_on(self.check_safekeepers_synced_async(compute_state));
// Record runtime
self.state.lock().unwrap().metrics.sync_sk_check_ms = Utc::now()
self.state.lock().unwrap().metrics.basebackup_bytes =
measured_reader.get_byte_count() as u64;
self.state.lock().unwrap().metrics.basebackup_ms = Utc::now()
.signed_duration_since(start_time)
.to_std()
.unwrap()
.as_millis() as u64;
result
Ok(())
}
// Run `postgres` in a special mode with `--sync-safekeepers` argument
@@ -493,36 +357,24 @@ impl ComputeNode {
/// Do all the preparations like PGDATA directory creation, configuration,
/// safekeepers sync, basebackup, etc.
#[instrument(skip_all)]
pub fn prepare_pgdata(
&self,
compute_state: &ComputeState,
extension_server_port: u16,
) -> Result<()> {
pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> {
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
let spec = &pspec.spec;
let pgdata_path = Path::new(&self.pgdata);
// Remove/create an empty pgdata directory and put configuration there.
self.create_pgdata()?;
config::write_postgres_conf(
&pgdata_path.join("postgresql.conf"),
&pspec.spec,
Some(extension_server_port),
)?;
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &pspec.spec)?;
// Syncing safekeepers is only safe with primary nodes: if a primary
// is already connected it will be kicked out, so a secondary (standby)
// cannot sync safekeepers.
let lsn = match spec.mode {
ComputeMode::Primary => {
info!("checking if safekeepers are synced");
let lsn = if let Ok(Some(lsn)) = self.check_safekeepers_synced(compute_state) {
lsn
} else {
info!("starting safekeepers syncing");
self.sync_safekeepers(pspec.storage_auth_token.clone())
.with_context(|| "failed to sync safekeepers")?
};
info!("starting safekeepers syncing");
let lsn = self
.sync_safekeepers(pspec.storage_auth_token.clone())
.with_context(|| "failed to sync safekeepers")?;
info!("safekeepers synced at LSN {}", lsn);
lsn
}
@@ -560,50 +412,6 @@ impl ComputeNode {
Ok(())
}
/// Start and stop a postgres process to warm up the VM for startup.
///
/// Works in a scratch data directory named `<pgdata>.warmup`: runs `initdb`
/// there, writes a minimal `postgresql.conf`, starts postgres, waits until
/// `wait_for_postgres` reports success, then kills the process and removes
/// the scratch directory.
///
/// Returns an error if creating the directory, writing the config, waiting
/// for postgres, or killing/waiting on the child fails. Panics if `initdb` or
/// `postgres` cannot be spawned, or if `pgbin` has no parent directory.
pub fn prewarm_postgres(&self) -> Result<()> {
info!("prewarming");
// Create pgdata
let pgdata = &format!("{}.warmup", self.pgdata);
create_pgdata(pgdata)?;
// Run initdb to completion
info!("running initdb");
// `initdb` is expected to live in the same directory as the postgres binary.
let initdb_bin = Path::new(&self.pgbin).parent().unwrap().join("initdb");
// NOTE(review): only a failure to spawn is detected here; the exit status
// and output of initdb are discarded.
Command::new(initdb_bin)
.args(["-D", pgdata])
.output()
.expect("cannot start initdb process");
// Write conf
use std::io::Write;
let conf_path = Path::new(pgdata).join("postgresql.conf");
// `File::create` truncates, so this replaces the file initdb generated.
let mut file = std::fs::File::create(conf_path)?;
writeln!(file, "shared_buffers=65536")?;
writeln!(file, "port=51055")?; // Nobody should be connecting
writeln!(file, "shared_preload_libraries = 'neon'")?;
// Start postgres
info!("starting postgres");
let mut pg = Command::new(&self.pgbin)
.args(["-D", pgdata])
.spawn()
.expect("cannot start postgres process");
// Stop it when it's ready
info!("waiting for postgres");
// NOTE(review): if this wait fails, the function returns early without
// killing the child or removing the scratch directory.
wait_for_postgres(&mut pg, Path::new(pgdata))?;
pg.kill()?;
info!("sent kill signal");
pg.wait()?;
info!("done prewarming");
// clean up
// Best effort: a failure to remove the directory is deliberately ignored.
let _ok = fs::remove_dir_all(pgdata);
Ok(())
}
/// Start Postgres as a child process and manage DBs/roles.
/// After that this will hang waiting on the postmaster process to exit.
#[instrument(skip_all)]
@@ -698,7 +506,7 @@ impl ComputeNode {
// Write new config
let pgdata_path = Path::new(&self.pgdata);
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec)?;
let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
self.pg_reload_conf(&mut client)?;
@@ -728,7 +536,7 @@ impl ComputeNode {
}
#[instrument(skip_all)]
pub fn start_compute(&self, extension_server_port: u16) -> Result<std::process::Child> {
pub fn start_compute(&self) -> Result<std::process::Child> {
let compute_state = self.state.lock().unwrap().clone();
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
info!(
@@ -739,38 +547,7 @@ impl ComputeNode {
pspec.timeline_id,
);
info!(
"start_compute spec.remote_extensions {:?}",
pspec.spec.remote_extensions
);
// This part is sync, because we need to download
// remote shared_preload_libraries before postgres start (if any)
if let Some(remote_extensions) = &pspec.spec.remote_extensions {
// First, create control files for all available extensions
extension_server::create_control_files(remote_extensions, &self.pgbin);
let library_load_start_time = Utc::now();
let remote_ext_metrics = self.prepare_preload_libraries(&pspec.spec)?;
let library_load_time = Utc::now()
.signed_duration_since(library_load_start_time)
.to_std()
.unwrap()
.as_millis() as u64;
let mut state = self.state.lock().unwrap();
state.metrics.load_ext_ms = library_load_time;
state.metrics.num_ext_downloaded = remote_ext_metrics.num_ext_downloaded;
state.metrics.largest_ext_size = remote_ext_metrics.largest_ext_size;
state.metrics.total_ext_download_size = remote_ext_metrics.total_ext_download_size;
info!(
"Loading shared_preload_libraries took {:?}ms",
library_load_time
);
info!("{:?}", remote_ext_metrics);
}
self.prepare_pgdata(&compute_state, extension_server_port)?;
self.prepare_pgdata(&compute_state)?;
let start_time = Utc::now();
let pg = self.start_postgres(pspec.storage_auth_token.clone())?;
@@ -918,172 +695,4 @@ LIMIT 100",
"{{\"pg_stat_statements\": []}}".to_string()
}
}
// download an archive, unzip and place files in correct locations
//
// Wraps `extension_server::download_extension` with bookkeeping in
// `ext_download_progress`, keyed by the archive's object name, so that an
// archive already downloaded (or currently being downloaded) is not fetched
// again.
//
// Returns the size reported by `extension_server::download_extension`, or 0
// when the download was skipped or completed by another caller. Fails with
// `DownloadError::BadInput` when no remote storage is configured and with
// `DownloadError::Other` when the download itself fails. Panics if the path
// has no object name or a lock is poisoned.
pub async fn download_extension(
&self,
real_ext_name: String,
ext_path: RemotePath,
) -> Result<u64, DownloadError> {
let remote_storage = self
.ext_remote_storage
.as_ref()
.ok_or(DownloadError::BadInput(anyhow::anyhow!(
"Remote extensions storage is not configured",
)))?;
let ext_archive_name = ext_path.object_name().expect("bad path");
let mut first_try = false;
// NOTE(review): the membership check takes the read lock and the insert
// takes the write lock separately, so two concurrent callers can both
// observe "not present" and both treat themselves as the first try.
if !self
.ext_download_progress
.read()
.expect("lock err")
.contains_key(ext_archive_name)
{
self.ext_download_progress
.write()
.expect("lock err")
.insert(ext_archive_name.to_string(), (Utc::now(), false));
first_try = true;
}
let (download_start, download_completed) =
self.ext_download_progress.read().expect("lock err")[ext_archive_name];
// Milliseconds since the recorded start of the download.
let start_time_delta = Utc::now()
.signed_duration_since(download_start)
.to_std()
.unwrap()
.as_millis() as u64;
// how long to wait for extension download if it was started by another process
const HANG_TIMEOUT: u64 = 3000; // milliseconds
if download_completed {
info!("extension already downloaded, skipping re-download");
return Ok(0);
} else if start_time_delta < HANG_TIMEOUT && !first_try {
info!("download {ext_archive_name} already started by another process, hanging untill completion or timeout");
// Poll the progress map every 500ms until the entry is marked completed.
let mut interval = tokio::time::interval(tokio::time::Duration::from_millis(500));
loop {
info!("waiting for download");
interval.tick().await;
let (_, download_completed_now) =
self.ext_download_progress.read().expect("lock")[ext_archive_name];
if download_completed_now {
info!("download finished by whoever else downloaded it");
return Ok(0);
}
}
// NOTE: the above loop will get terminated
// based on the timeout of the download function
}
// if extension hasn't been downloaded before or the previous
// attempt to download was at least HANG_TIMEOUT ms ago
// then we try to download it here
info!("downloading new extension {ext_archive_name}");
let download_size = extension_server::download_extension(
&real_ext_name,
&ext_path,
remote_storage,
&self.pgbin,
)
.await
.map_err(DownloadError::Other);
// NOTE(review): the entry is marked completed whether the download
// succeeded or failed, so later calls skip an archive whose download failed.
self.ext_download_progress
.write()
.expect("bad lock")
.insert(ext_archive_name.to_string(), (download_start, true));
download_size
}
// Download the archives for every library listed in
// `shared_preload_libraries` that also appears in the remote library index.
//
// Library names are gathered from `spec.cluster.settings` and from the raw
// `postgresql_conf` text in the spec; `neon` is always excluded. The downloads
// run concurrently and individual failures are logged rather than returned.
//
// Returns zeroed metrics when no remote storage is configured. Returns an
// error when the spec has no `remote_extensions` or `get_ext` fails for a
// library. Because of `#[tokio::main]` the function is synchronous for callers
// and runs its body on a runtime it starts itself.
#[tokio::main]
pub async fn prepare_preload_libraries(
&self,
spec: &ComputeSpec,
) -> Result<RemoteExtensionMetrics> {
if self.ext_remote_storage.is_none() {
return Ok(RemoteExtensionMetrics {
num_ext_downloaded: 0,
largest_ext_size: 0,
total_ext_download_size: 0,
});
}
let remote_extensions = spec
.remote_extensions
.as_ref()
.ok_or(anyhow::anyhow!("Remote extensions are not configured",))?;
info!("parse shared_preload_libraries from spec.cluster.settings");
let mut libs_vec = Vec::new();
if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
// Split on commas, single quotes and spaces; drop empty pieces and `neon`.
libs_vec = libs
.split(&[',', '\'', ' '])
.filter(|s| *s != "neon" && !s.is_empty())
.map(str::to_string)
.collect();
}
info!("parse shared_preload_libraries from provided postgresql.conf");
// that is used in neon_local and python tests
if let Some(conf) = &spec.cluster.postgresql_conf {
let conf_lines = conf.split('\n').collect::<Vec<&str>>();
let mut shared_preload_libraries_line = "";
// The last line starting with the setting name wins.
for line in conf_lines {
if line.starts_with("shared_preload_libraries") {
shared_preload_libraries_line = line;
}
}
let mut preload_libs_vec = Vec::new();
// NOTE(review): only the exact `='` form (no spaces around `=`) is
// recognised here — confirm that every producer of this conf writes it so.
if let Some(libs) = shared_preload_libraries_line.split("='").nth(1) {
preload_libs_vec = libs
.split(&[',', '\'', ' '])
.filter(|s| *s != "neon" && !s.is_empty())
.map(str::to_string)
.collect();
}
libs_vec.extend(preload_libs_vec);
}
// Don't try to download libraries that are not in the index.
// Assume that they are already present locally.
libs_vec.retain(|lib| remote_extensions.library_index.contains_key(lib));
info!("Downloading to shared preload libraries: {:?}", &libs_vec);
let mut download_tasks = Vec::new();
for library in &libs_vec {
let (ext_name, ext_path) = remote_extensions.get_ext(library, true)?;
download_tasks.push(self.download_extension(ext_name, ext_path));
}
// Drive all downloads concurrently and wait for every one to finish.
let results = join_all(download_tasks).await;
let mut remote_ext_metrics = RemoteExtensionMetrics {
num_ext_downloaded: 0,
largest_ext_size: 0,
total_ext_download_size: 0,
};
for result in results {
let download_size = match result {
Ok(res) => {
remote_ext_metrics.num_ext_downloaded += 1;
res
}
Err(err) => {
// if we failed to download an extension, we don't want to fail the whole
// process, but we do want to log the error
error!("Failed to download extension: {}", err);
0
}
};
remote_ext_metrics.largest_ext_size =
std::cmp::max(remote_ext_metrics.largest_ext_size, download_size);
remote_ext_metrics.total_ext_download_size += download_size;
}
Ok(remote_ext_metrics)
}
}

View File

@@ -33,11 +33,7 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
}
/// Create or completely rewrite configuration file specified by `path`
pub fn write_postgres_conf(
path: &Path,
spec: &ComputeSpec,
extension_server_port: Option<u16>,
) -> Result<()> {
pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
// File::create() destroys the file content if it exists.
let mut file = File::create(path)?;
@@ -91,9 +87,5 @@ pub fn write_postgres_conf(
writeln!(file, "# Managed by compute_ctl: end")?;
}
if let Some(port) = extension_server_port {
writeln!(file, "neon.extension_server_port={}", port)?;
}
Ok(())
}

View File

@@ -1,221 +0,0 @@
// Download extension files from the extension store
// and put them in the right place in the postgres directory (share / lib)
/*
The layout of the S3 bucket is as follows:
5615610098 // this is an extension build number
├── v14
│   ├── extensions
│   │   ├── anon.tar.zst
│   │   └── embedding.tar.zst
│   └── ext_index.json
└── v15
├── extensions
│   ├── anon.tar.zst
│   └── embedding.tar.zst
└── ext_index.json
5615261079
├── v14
│   ├── extensions
│   │   └── anon.tar.zst
│   └── ext_index.json
└── v15
├── extensions
│   └── anon.tar.zst
└── ext_index.json
5623261088
├── v14
│   ├── extensions
│   │   └── embedding.tar.zst
│   └── ext_index.json
└── v15
├── extensions
│   └── embedding.tar.zst
└── ext_index.json
Note that build number cannot be part of prefix because we might need extensions
from other build numbers.
ext_index.json stores the control files and location of extension archives
It also stores a list of public extensions and a library_index
We don't need to duplicate extension.tar.zst files.
We only need to upload a new one if it is updated.
(Although currently we just upload every time anyways, hopefully will change
this sometime)
*access* is controlled by spec
More specifically, here is an example ext_index.json
{
"public_extensions": [
"anon",
"pg_buffercache"
],
"library_index": {
"anon": "anon",
"pg_buffercache": "pg_buffercache"
},
"extension_data": {
"pg_buffercache": {
"control_data": {
"pg_buffercache.control": "# pg_buffercache extension \ncomment = 'examine the shared buffer cache' \ndefault_version = '1.3' \nmodule_pathname = '$libdir/pg_buffercache' \nrelocatable = true \ntrusted=true"
},
"archive_path": "5670669815/v14/extensions/pg_buffercache.tar.zst"
},
"anon": {
"control_data": {
"anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
},
"archive_path": "5670669815/v14/extensions/anon.tar.zst"
}
}
}
*/
use anyhow::Context;
use anyhow::{self, Result};
use compute_api::spec::RemoteExtSpec;
use remote_storage::*;
use serde_json;
use std::io::Read;
use std::num::{NonZeroU32, NonZeroUsize};
use std::path::Path;
use std::str;
use tar::Archive;
use tokio::io::AsyncReadExt;
use tracing::info;
use tracing::log::warn;
use zstd::stream::read::Decoder;
/// Run `pg_config <argument>` and return its standard output, trimmed.
///
/// `argument` is a flag like `--version` or `--sharedir`. The `pg_config`
/// executable is taken from the same directory as the binary named by `pgbin`.
/// When `pgbin` is a bare file name such as `postgres`, `pg_config` is likewise
/// a bare name and is resolved through `PATH`.
///
/// Panics if `pg_config` cannot be executed or its output is not valid UTF-8.
fn get_pg_config(argument: &str, pgbin: &str) -> String {
    // Replace only the last path component. The former string-based
    // construction (`strip_suffix("postgres")` + "/pg_config") yielded
    // `/pg_config` for a bare `postgres` and `<dir>//pg_config` for full paths,
    // and panicked for any binary name not ending in `postgres`.
    let pgconfig = Path::new(pgbin).with_file_name("pg_config");
    let config_output = std::process::Command::new(pgconfig)
        .arg(argument)
        .output()
        .expect("pg_config error");
    std::str::from_utf8(&config_output.stdout)
        .expect("pg_config error")
        .trim()
        .to_string()
}
/// Map the installed PostgreSQL major version to `"v14"` or `"v15"`.
///
/// `pg_config --version` returns a (platform specific) human readable string
/// such as "PostgreSQL 15.4". Only the major version number is inspected.
///
/// Panics if the major version is neither 14 nor 15, or if no version number
/// can be found in the output.
pub fn get_pg_version(pgbin: &str) -> String {
    let human_version = get_pg_config("--version", pgbin);
    // Take the leading digits of the first token that starts with a digit.
    // A plain substring search is wrong here: "PostgreSQL 14.15" contains
    // "15" but is a v14 server.
    let major = human_version
        .split_whitespace()
        .find(|token| token.starts_with(|c: char| c.is_ascii_digit()))
        .and_then(|token| token.split(|c: char| !c.is_ascii_digit()).next())
        .unwrap_or("");
    match major {
        "15" => "v15".to_string(),
        "14" => "v14".to_string(),
        _ => panic!("Unsuported postgres version {human_version}"),
    }
}
// download the archive for a given extension,
// unzip it, and place files in the appropriate locations (share/lib)
//
// `ext_name` is used for logging only; `ext_path` is the object to fetch from
// `remote_storage`. `pgbin` must end in `/bin/postgres`: the archive is
// unpacked into `<prefix>/download_extensions`, where `<prefix>` is `pgbin`
// without that suffix, and the final locations are asked from `pg_config`.
//
// Returns the number of bytes downloaded (the size of the compressed archive).
// Errors from the download, decompression, unpacking, or from listing the
// unpacked `share/extension` and `lib` directories are returned to the caller;
// failures to move an individual file are only logged.
pub async fn download_extension(
ext_name: &str,
ext_path: &RemotePath,
remote_storage: &GenericRemoteStorage,
pgbin: &str,
) -> Result<u64> {
info!("Download extension {:?} from {:?}", ext_name, ext_path);
let mut download = remote_storage.download(ext_path).await?;
// The whole archive is buffered in memory before it is decompressed.
let mut download_buffer = Vec::new();
download
.download_stream
.read_to_end(&mut download_buffer)
.await?;
let download_size = download_buffer.len() as u64;
// it's unclear whether it is more performant to decompress into memory or not
// TODO: decompressing into memory can be avoided
let mut decoder = Decoder::new(download_buffer.as_slice())?;
let mut decompress_buffer = Vec::new();
decoder.read_to_end(&mut decompress_buffer)?;
let mut archive = Archive::new(decompress_buffer.as_slice());
let unzip_dest = pgbin
.strip_suffix("/bin/postgres")
.expect("bad pgbin")
.to_string()
+ "/download_extensions";
archive.unpack(&unzip_dest)?;
info!("Download + unzip {:?} completed successfully", &ext_path);
// Each pair is (directory inside the unpacked archive, destination directory).
let sharedir_paths = (
unzip_dest.to_string() + "/share/extension",
Path::new(&get_pg_config("--sharedir", pgbin)).join("extension"),
);
let libdir_paths = (
unzip_dest.to_string() + "/lib",
Path::new(&get_pg_config("--pkglibdir", pgbin)).to_path_buf(),
);
// move contents of the libdir / sharedir in unzipped archive to the correct local paths
for paths in [sharedir_paths, libdir_paths] {
let (zip_dir, real_dir) = paths;
info!("mv {zip_dir:?}/* {real_dir:?}");
// NOTE(review): `read_dir` fails, and the whole call with it, when the
// archive did not contain this directory.
for file in std::fs::read_dir(zip_dir)? {
let old_file = file?.path();
let new_file =
Path::new(&real_dir).join(old_file.file_name().context("error parsing file")?);
info!("moving {old_file:?} to {new_file:?}");
// extension download failed: Directory not empty (os error 39)
match std::fs::rename(old_file, new_file) {
Ok(()) => info!("move succeeded"),
Err(e) => {
warn!("move failed, probably because the extension already exists: {e}")
}
}
}
}
info!("done moving extension {ext_name}");
Ok(download_size)
}
// Write the extension control files listed in the spec into the local
// `<sharedir>/extension` directory. A control file that already exists
// locally is left untouched and a warning is logged instead.
// Panics if writing a control file fails.
pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
    let extension_dir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");

    // Flatten the per-extension maps into a single stream of (name, content) pairs.
    let control_files = remote_extensions
        .extension_data
        .values()
        .flat_map(|ext_data| &ext_data.control_data);

    for (control_name, control_content) in control_files {
        let control_path = extension_dir.join(control_name);
        if control_path.exists() {
            // The local copy takes precedence over the one from the spec.
            warn!("control file {:?} exists both locally and remotely. ignoring the remote version.", control_path);
            continue;
        }
        info!("writing file {:?}{:?}", control_path, control_content);
        std::fs::write(control_path, control_content).unwrap();
    }
}
// Build the remote storage client used for extensions.
//
// `remote_ext_config` is a JSON object with the string fields `bucket` and
// `region` and the optional string fields `endpoint` and `prefix`.
// Returns an error if the JSON cannot be parsed or the storage cannot be
// created from the resulting config.
pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRemoteStorage> {
    #[derive(Debug, serde::Deserialize)]
    struct RemoteExtJson {
        bucket: String,
        region: String,
        endpoint: Option<String>,
        prefix: Option<String>,
    }

    let RemoteExtJson {
        bucket,
        region,
        endpoint,
        prefix,
    } = serde_json::from_str::<RemoteExtJson>(remote_ext_config)?;

    let s3_config = S3Config {
        bucket_name: bucket,
        bucket_region: region,
        prefix_in_bucket: prefix,
        endpoint,
        concurrency_limit: NonZeroUsize::new(100).expect("100 != 0"),
        max_keys_per_list_response: None,
    };

    GenericRemoteStorage::from_config(&RemoteStorageConfig {
        max_concurrent_syncs: NonZeroUsize::new(100).expect("100 != 0"),
        max_sync_errors: NonZeroU32::new(100).expect("100 != 0"),
        storage: RemoteStorageKind::AwsS3(s3_config),
    })
}

View File

@@ -13,7 +13,7 @@ use hyper::{Body, Method, Request, Response, Server, StatusCode};
use num_cpus;
use serde_json;
use tokio::task;
use tracing::{error, info, warn};
use tracing::{error, info};
use tracing_utils::http::OtelName;
fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
@@ -121,78 +121,6 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
}
}
// download extension files from S3 on demand
(&Method::POST, route) if route.starts_with("/extension_server/") => {
info!("serving {:?} POST request", route);
info!("req.uri {:?}", req.uri());
// don't even try to download extensions
// if no remote storage is configured
if compute.ext_remote_storage.is_none() {
info!("no extensions remote storage configured");
let mut resp = Response::new(Body::from("no remote storage configured"));
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
return resp;
}
let mut is_library = false;
if let Some(params) = req.uri().query() {
info!("serving {:?} POST request with params: {}", route, params);
if params == "is_library=true" {
is_library = true;
} else {
let mut resp = Response::new(Body::from("Wrong request parameters"));
*resp.status_mut() = StatusCode::BAD_REQUEST;
return resp;
}
}
let filename = route.split('/').last().unwrap().to_string();
info!("serving /extension_server POST request, filename: {filename:?} is_library: {is_library}");
// get ext_name and path from spec
// don't lock compute_state for too long
let ext = {
let compute_state = compute.state.lock().unwrap();
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
let spec = &pspec.spec;
// debug only
info!("spec: {:?}", spec);
let remote_extensions = match spec.remote_extensions.as_ref() {
Some(r) => r,
None => {
info!("no remote extensions spec was provided");
let mut resp = Response::new(Body::from("no remote storage configured"));
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
return resp;
}
};
remote_extensions.get_ext(&filename, is_library)
};
match ext {
Ok((ext_name, ext_path)) => {
match compute.download_extension(ext_name, ext_path).await {
Ok(_) => Response::new(Body::from("OK")),
Err(e) => {
error!("extension download failed: {}", e);
let mut resp = Response::new(Body::from(e.to_string()));
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
resp
}
}
}
Err(e) => {
warn!("extension download failed to find extension: {}", e);
let mut resp = Response::new(Body::from("failed to find file"));
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
resp
}
}
}
// Return the `404 Not Found` for any other routes.
_ => {
let mut not_found = Response::new(Body::from("404 Not Found"));

View File

@@ -139,34 +139,6 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
# The handler only matches routes that start with `/extension_server/` and
# takes the requested file name from the last path segment.
/extension_server/{filename}:
  post:
    tags:
      - Extension
    summary: Download extension from S3 to local folder.
    description: >-
      Looks up the extension that provides the requested file in the compute
      spec and downloads it from the configured remote storage.
    operationId: downloadExtension
    parameters:
      - name: filename
        in: path
        required: true
        description: Name of the requested extension file or library.
        schema:
          type: string
      - name: is_library
        in: query
        required: false
        description: >-
          Pass `true` when the requested file is a library. Any other query
          string is rejected with a 400 response.
        schema:
          type: string
          enum:
            - "true"
    responses:
      "200":
        description: Extension downloaded
        content:
          text/plain:
            schema:
              type: string
              description: Error text or 'OK' if download succeeded.
              example: "OK"
      "400":
        description: Request is invalid.
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/GenericError"
      "500":
        description: Extension download request failed.
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/GenericError"
components:
securitySchemes:

View File

@@ -9,9 +9,7 @@ pub mod http;
#[macro_use]
pub mod logger;
pub mod compute;
pub mod extension_server;
pub mod monitor;
pub mod params;
pub mod pg_helpers;
pub mod spec;
pub mod sync_sk;

View File

@@ -124,7 +124,7 @@ pub fn get_spec_from_control_plane(
pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> {
// File `postgresql.conf` is no longer included into `basebackup`, so just
// always write all config into it creating new file.
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec, None)?;
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?;
update_pg_hba(pgdata_path)?;
@@ -270,7 +270,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
}
RoleAction::Create => {
let mut query: String = format!(
"CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
"CREATE ROLE {} CREATEROLE CREATEDB IN ROLE neon_superuser",
name.pg_quote()
);
info!("role create query: '{}'", &query);

View File

@@ -1,98 +0,0 @@
// Utils for running sync_safekeepers
use anyhow::Result;
use tracing::info;
use utils::lsn::Lsn;
/// Result of asking a safekeeper for the status of our timeline.
#[derive(Copy, Clone, Debug)]
pub enum TimelineStatusResponse {
    /// The first message of the safekeeper's reply was not a data row;
    /// `ping_safekeeper` treats this as the timeline not existing there.
    NotFound,
    /// The safekeeper replied with a row carrying the timeline's LSNs.
    Ok(TimelineStatusOkResponse),
}
/// LSNs reported by a safekeeper for a timeline, parsed from the
/// `flush_lsn` and `commit_lsn` columns of its `TIMELINE_STATUS` reply.
#[derive(Copy, Clone, Debug)]
pub struct TimelineStatusOkResponse {
    /// Value of the `flush_lsn` column.
    flush_lsn: Lsn,
    /// Value of the `commit_lsn` column.
    commit_lsn: Lsn,
}
/// Get a safekeeper's metadata for our timeline. The id is only used for logging
pub async fn ping_safekeeper(
id: String,
config: tokio_postgres::Config,
) -> Result<TimelineStatusResponse> {
// TODO add retries
// Connect
info!("connecting to {}", id);
let (client, conn) = config.connect(tokio_postgres::NoTls).await?;
tokio::spawn(async move {
if let Err(e) = conn.await {
eprintln!("connection error: {}", e);
}
});
// Query
info!("querying {}", id);
let result = client.simple_query("TIMELINE_STATUS").await?;
// Parse result
info!("done with {}", id);
if let postgres::SimpleQueryMessage::Row(row) = &result[0] {
use std::str::FromStr;
let response = TimelineStatusResponse::Ok(TimelineStatusOkResponse {
flush_lsn: Lsn::from_str(row.get("flush_lsn").unwrap())?,
commit_lsn: Lsn::from_str(row.get("commit_lsn").unwrap())?,
});
Ok(response)
} else {
// Timeline doesn't exist
Ok(TimelineStatusResponse::NotFound)
}
}
/// Given a quorum of responses, check if safekeepers are synced at some Lsn
///
/// Returns `Some(lsn)` only when every response is `Ok` and all reported
/// commit and flush LSNs are equal to that single `lsn`. Returns `None`
/// otherwise, including when `responses` is empty.
pub fn check_if_synced(responses: Vec<TimelineStatusResponse>) -> Option<Lsn> {
    // An empty list trivially passes the "all responses are ok" check below
    // and would then have no min/max to compare, so reject it up front.
    if responses.is_empty() {
        info!("not synced. No responses to compare");
        return None;
    }

    // Check if all responses are ok
    let ok_responses: Vec<TimelineStatusOkResponse> = responses
        .iter()
        .filter_map(|r| match r {
            TimelineStatusResponse::Ok(ok_response) => Some(*ok_response),
            _ => None,
        })
        .collect();
    if ok_responses.len() < responses.len() {
        info!(
            "not synced. Only {} out of {} know about this timeline",
            ok_responses.len(),
            responses.len()
        );
        return None;
    }

    // Get the min and the max of everything
    let commit: Vec<Lsn> = ok_responses.iter().map(|r| r.commit_lsn).collect();
    let flush: Vec<Lsn> = ok_responses.iter().map(|r| r.flush_lsn).collect();
    let commit_max = commit.iter().max()?;
    let commit_min = commit.iter().min()?;
    let flush_max = flush.iter().max()?;
    let flush_min = flush.iter().min()?;

    // Check that all values are equal
    if commit_min != commit_max {
        info!("not synced. {:?} {:?}", commit_min, commit_max);
        return None;
    }
    if flush_min != flush_max {
        info!("not synced. {:?} {:?}", flush_min, flush_max);
        return None;
    }

    // Check that commit == flush
    if commit_max != flush_max {
        info!("not synced. {:?} {:?}", commit_max, flush_max);
        return None;
    }

    Some(*commit_max)
}

View File

@@ -32,4 +32,3 @@ utils.workspace = true
compute_api.workspace = true
workspace_hack.workspace = true
tracing.workspace = true

View File

@@ -658,8 +658,6 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
.get_one::<String>("endpoint_id")
.ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;
let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");
// If --safekeepers argument is given, use only the listed safekeeper nodes.
let safekeepers =
if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
@@ -701,7 +699,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
_ => {}
}
println!("Starting existing endpoint {endpoint_id}...");
endpoint.start(&auth_token, safekeepers, remote_ext_config)?;
endpoint.start(&auth_token, safekeepers)?;
} else {
let branch_name = sub_args
.get_one::<String>("branch-name")
@@ -745,7 +743,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
pg_version,
mode,
)?;
ep.start(&auth_token, safekeepers, remote_ext_config)?;
ep.start(&auth_token, safekeepers)?;
}
}
"stop" => {
@@ -825,16 +823,6 @@ fn get_safekeeper(env: &local_env::LocalEnv, id: NodeId) -> Result<SafekeeperNod
}
}
// Collect the values of every `safekeeper-extra-opt` occurrence, in the order
// given, to be appended to the safekeeper command invocation. Yields an empty
// list when the option was not passed.
fn safekeeper_extra_opts(init_match: &ArgMatches) -> Vec<String> {
    match init_match.get_many::<String>("safekeeper-extra-opt") {
        Some(values) => values.cloned().collect(),
        None => Vec::new(),
    }
}
fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let (sub_name, sub_args) = match sub_match.subcommand() {
Some(safekeeper_command_data) => safekeeper_command_data,
@@ -851,9 +839,7 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
match sub_name {
"start" => {
let extra_opts = safekeeper_extra_opts(sub_args);
if let Err(e) = safekeeper.start(extra_opts) {
if let Err(e) = safekeeper.start() {
eprintln!("safekeeper start failed: {}", e);
exit(1);
}
@@ -878,8 +864,7 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
exit(1);
}
let extra_opts = safekeeper_extra_opts(sub_args);
if let Err(e) = safekeeper.start(extra_opts) {
if let Err(e) = safekeeper.start() {
eprintln!("safekeeper start failed: {}", e);
exit(1);
}
@@ -906,7 +891,7 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow
for node in env.safekeepers.iter() {
let safekeeper = SafekeeperNode::from_env(env, node);
if let Err(e) = safekeeper.start(vec![]) {
if let Err(e) = safekeeper.start() {
eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
try_stop_all(env, false);
exit(1);
@@ -969,14 +954,6 @@ fn cli() -> Command {
let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false);
let safekeeper_extra_opt_arg = Arg::new("safekeeper-extra-opt")
.short('e')
.long("safekeeper-extra-opt")
.num_args(1)
.action(ArgAction::Append)
.help("Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo")
.required(false);
let tenant_id_arg = Arg::new("tenant-id")
.long("tenant-id")
.help("Tenant id. Represented as a hexadecimal string 32 symbols length")
@@ -1026,12 +1003,6 @@ fn cli() -> Command {
.help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more")
.required(false);
let remote_ext_config_args = Arg::new("remote-ext-config")
.long("remote-ext-config")
.num_args(1)
.help("Configure the S3 bucket that we search for extensions in.")
.required(false);
let lsn_arg = Arg::new("lsn")
.long("lsn")
.help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.")
@@ -1145,7 +1116,6 @@ fn cli() -> Command {
.subcommand(Command::new("start")
.about("Start local safekeeper")
.arg(safekeeper_id_arg.clone())
.arg(safekeeper_extra_opt_arg.clone())
)
.subcommand(Command::new("stop")
.about("Stop local safekeeper")
@@ -1156,7 +1126,6 @@ fn cli() -> Command {
.about("Restart local safekeeper")
.arg(safekeeper_id_arg)
.arg(stop_mode_arg.clone())
.arg(safekeeper_extra_opt_arg)
)
)
.subcommand(
@@ -1192,7 +1161,6 @@ fn cli() -> Command {
.arg(pg_version_arg)
.arg(hot_standby_arg)
.arg(safekeepers_arg)
.arg(remote_ext_config_args)
)
.subcommand(
Command::new("stop")

View File

@@ -313,7 +313,7 @@ impl Endpoint {
// TODO: use future host field from safekeeper spec
// Pass the list of safekeepers to the replica so that it can connect to any of them,
// whichever is available.
// whichever is availiable.
let sk_ports = self
.env
.safekeepers
@@ -420,12 +420,7 @@ impl Endpoint {
Ok(())
}
pub fn start(
&self,
auth_token: &Option<String>,
safekeepers: Vec<NodeId>,
remote_ext_config: Option<&String>,
) -> Result<()> {
pub fn start(&self, auth_token: &Option<String>, safekeepers: Vec<NodeId>) -> Result<()> {
if self.status() == "running" {
anyhow::bail!("The endpoint is already running");
}
@@ -493,7 +488,6 @@ impl Endpoint {
pageserver_connstring: Some(pageserver_connstring),
safekeeper_connstrings,
storage_auth_token: auth_token.clone(),
remote_extensions: None,
};
let spec_path = self.endpoint_path().join("spec.json");
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -525,11 +519,6 @@ impl Endpoint {
.stdin(std::process::Stdio::null())
.stderr(logfile.try_clone()?)
.stdout(logfile);
if let Some(remote_ext_config) = remote_ext_config {
cmd.args(["--remote-ext-config", remote_ext_config]);
}
let child = cmd.spawn()?;
// Write down the pid so we can wait for it when we want to stop
@@ -575,7 +564,9 @@ impl Endpoint {
}
Err(e) => {
if attempt == MAX_ATTEMPTS {
return Err(e).context("timed out waiting to connect to compute_ctl HTTP");
return Err(e).context(
"timed out waiting to connect to compute_ctl HTTP; last error: {e}",
);
}
}
}

View File

@@ -101,7 +101,7 @@ impl SafekeeperNode {
self.datadir_path().join("safekeeper.pid")
}
pub fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<Child> {
pub fn start(&self) -> anyhow::Result<Child> {
print!(
"Starting safekeeper at '{}' in '{}'",
self.pg_connection_config.raw_address(),
@@ -161,28 +161,17 @@ impl SafekeeperNode {
let key_path = self.env.base_data_dir.join("auth_public_key.pem");
if self.conf.auth_enabled {
let key_path_string = key_path
.to_str()
.with_context(|| {
format!("Key path {key_path:?} cannot be represented as a unicode string")
})?
.to_owned();
args.extend([
"--pg-auth-public-key-path".to_owned(),
key_path_string.clone(),
]);
args.extend([
"--pg-tenant-only-auth-public-key-path".to_owned(),
key_path_string.clone(),
]);
args.extend([
"--http-auth-public-key-path".to_owned(),
key_path_string.clone(),
"--auth-validation-public-key-path".to_owned(),
key_path
.to_str()
.with_context(|| {
format!("Key path {key_path:?} cannot be represented as a unicode string")
})?
.to_owned(),
]);
}
args.extend(extra_opts);
background_process::start_process(
&format!("safekeeper-{id}"),
&datadir,

View File

@@ -1,236 +0,0 @@
# Supporting custom user Extensions (Dynamic Extension Loading)
Created 2023-05-03
## Motivation
There are many extensions in the PostgreSQL ecosystem, and not all extensions
are of a quality that we can confidently support them. Additionally, our
current extension inclusion mechanism has several problems because we build all
extensions into the primary Compute image: We build the extensions every time
we build the compute image regardless of whether we actually need to rebuild
the image, and the inclusion of these extensions in the image adds a hard
dependency on all supported extensions - thus increasing the image size, and
with it the time it takes to download that image - increasing first start
latency.
This RFC proposes a dynamic loading mechanism that solves most of these
problems.
## Summary
`compute_ctl` is made responsible for loading extensions on-demand into
the container's file system for dynamically loaded extensions, and will also
make sure that the extensions in `shared_preload_libraries` are downloaded
before the compute node starts.
## Components
compute_ctl, PostgreSQL, neon (extension), Compute Host Node, Extension Store
## Requirements
Compute nodes with no extra extensions should not be negatively impacted by
the existence of support for many extensions.
Installing an extension into PostgreSQL should be easy.
Non-preloaded extensions shouldn't impact startup latency.
Uninstalled extensions shouldn't impact query latency.
A small latency penalty for dynamically loaded extensions is acceptable in
the first seconds of compute startup, but not in steady-state operations.
## Proposed implementation
### On-demand, JIT-loading of extensions
Before postgres starts we download
- control files for all extensions available to that compute node;
- all `shared_preload_libraries`;
After postgres is running, `compute_ctl` listens for requests to load files.
When PostgreSQL requests a file, `compute_ctl` downloads it.
PostgreSQL requests files in the following cases:
- When loading a preload library set in `local_preload_libraries`
- When explicitly loading a library with `LOAD`
- When creating an extension with `CREATE EXTENSION` (download SQL scripts, (optional) extension data files and (optional) library files)
#### Summary
Pros:
- Startup is only as slow as it takes to load all (shared_)preload_libraries
- Supports BYO Extension
Cons:
- O(sizeof(extensions)) IO requirement for loading all extensions.
### Alternative solutions
1. Allow users to add their extensions to the base image
Pros:
- Easy to deploy
Cons:
- Doesn't scale - first start size is dependent on image size;
- All extensions are shared across all users: It doesn't allow users to
bring their own restrictive-licensed extensions
2. Bring Your Own compute image
Pros:
- Still easy to deploy
- User can bring own patched version of PostgreSQL
Cons:
- First start latency is O(sizeof(extensions image))
- Warm instance pool for skipping pod schedule latency is not feasible with
O(n) custom images
- Support channels are difficult to manage
3. Download all user extensions in bulk on compute start
Pros:
- Easy to deploy
- No startup latency issues for "clean" users.
- Warm instance pool for skipping pod schedule latency is possible
Cons:
- Downloading all extensions in advance takes a lot of time, thus startup
latency issues
4. Store user's extensions in persistent storage
Pros:
- Easy to deploy
- No startup latency issues
- Warm instance pool for skipping pod schedule latency is possible
Cons:
- EC2 instances have only limited number of attachments shared between EBS
volumes, direct-attached NVMe drives, and ENIs.
- Compute instance migration isn't trivially solved for EBS mounts (e.g.
the device is unavailable whilst moving the mount between instances).
- EBS can only mount on one instance at a time (except the expensive IO2
device type).
5. Store user's extensions in network drive
Pros:
- Easy to deploy
- Few startup latency issues
- Warm instance pool for skipping pod schedule latency is possible
Cons:
- We'd need networked drives, and a lot of them, which would store many
duplicate extensions.
- **UNCHECKED:** Compute instance migration may not work nicely with
networked IOs
### Idea extensions
The extension store does not have to be S3 directly, but could be a Node-local
caching service on top of S3. This would reduce the load on the network for
popular extensions.
## Extension Storage implementation
The layout of the S3 bucket is as follows:
```
5615610098 // this is an extension build number
├── v14
│   ├── extensions
│   │   ├── anon.tar.zst
│   │   └── embedding.tar.zst
│   └── ext_index.json
└── v15
├── extensions
│   ├── anon.tar.zst
│   └── embedding.tar.zst
└── ext_index.json
5615261079
├── v14
│   ├── extensions
│   │   └── anon.tar.zst
│   └── ext_index.json
└── v15
├── extensions
│   └── anon.tar.zst
└── ext_index.json
5623261088
├── v14
│   ├── extensions
│   │   └── embedding.tar.zst
│   └── ext_index.json
└── v15
├── extensions
│   └── embedding.tar.zst
└── ext_index.json
```
Note that build number cannot be part of prefix because we might need extensions
from other build numbers.
`ext_index.json` stores the control files and location of extension archives.
It also stores a list of public extensions and a `library_index`.
We don't need to duplicate `extension.tar.zst` files.
We only need to upload a new one if it is updated.
(Although currently we just upload every time anyways, hopefully will change
this sometime)
*access* is controlled by spec
More specifically, here is an example ext_index.json
```
{
"public_extensions": [
"anon",
"pg_buffercache"
],
"library_index": {
"anon": "anon",
"pg_buffercache": "pg_buffercache"
// for more complex extensions like postgis
// we might have something like:
// address_standardizer: postgis
// postgis_tiger: postgis
},
"extension_data": {
"pg_buffercache": {
"control_data": {
"pg_buffercache.control": "# pg_buffercache extension \ncomment = 'examine the shared buffer cache' \ndefault_version = '1.3' \nmodule_pathname = '$libdir/pg_buffercache' \nrelocatable = true \ntrusted=true"
},
"archive_path": "5670669815/v14/extensions/pg_buffercache.tar.zst"
},
"anon": {
"control_data": {
"anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
},
"archive_path": "5670669815/v14/extensions/anon.tar.zst"
}
}
}
```
### How to add new extension to the Extension Storage?
Simply upload build artifacts to the S3 bucket.
Implement a CI step for that. Splitting it from compute-node-image build.
### How do we deal with extension versions and updates?
Currently, we rebuild extensions on every compute-node-image build and store them in the <build-version> prefix.
This is needed to ensure that `/share` and `/lib` files are in sync.
For extension updates, we rely on the PostgreSQL extension versioning mechanism (sql update scripts) and extension authors to not break backwards compatibility within one major version of PostgreSQL.
### Alternatives
For extensions written in trusted languages we can also adopt
`dbdev` PostgreSQL Package Manager based on `pg_tle` by Supabase.
This will increase the number of supported extensions and decrease the amount of work required to support them.

View File

@@ -1,316 +0,0 @@
This is a copy from the [original Notion page](https://www.notion.so/neondatabase/Proposal-Pageserver-MVCC-S3-Storage-8a424c0c7ec5459e89d3e3f00e87657c?pvs=4), taken on 2023-08-16.
This is for archival mostly.
The RFC that we're likely to go with is https://github.com/neondatabase/neon/pull/4919.
---
# Proposal: Pageserver MVCC S3 Storage
tl;dr: this proposal enables Control Plane to attach a tenant to a new pageserver without being 100% certain that it has been detached from the old pageserver. This enables us to automate failover if a pageserver dies (no human in the loop).
# Problem Statement
The current Neon architecture requires the Control Plane to guarantee that a tenant is only attached to one pageserver at a time. If a tenant is attached to multiple pageservers simultaneously, the pageservers will overwrite each others changes in S3 for that tenant, resulting in data loss for that tenant.
The above imposes limitations on tenant relocation and future designs for high availability. For instance, Control Plane cannot relocate a tenant to another pageserver before it is 100% certain that the tenant is detached from the source pageserver. If the source pageserver is unresponsive, the tenant detach procedure cannot proceed, and Control Plane has no choice but to wait for either the source to become responsive again, or rely on a node failure detection mechanism to detect that the source pageserver is dead, and give permission to skip the detachment step. Either way, the tenant is unavailable for an extended period, and we have no means to improve it in the current architecture.
Note that there is no 100% correct node failure detection mechanism, and even techniques to accelerate failure detection, such as *shoot-the-other-node-in-the-head*, have their limits. So, we currently rely on humans as node failure detectors: they get alerted via PagerDuty, assess the situation under high stress, and make the decision. If they make the wrong call, or the apparently dead pageserver somehow resurrects later, we'll have data loss.
Also, by relying on humans, we're [incurring needless unscalable toil](https://sre.google/sre-book/eliminating-toil/): as Neon grows, pageserver failures will become more and more frequent because our fleet grows. Each instance will need quick response time to minimize downtime for the affected tenants, which implies higher toil, higher resulting attrition, and/or higher personnel cost.
Lastly, there are foreseeable needs by operation and product such as zero-downtime relocation and automatic failover/HA. For such features, the ability to have a tenant purposefully or accidentally attached to more than one pageserver will greatly reduce risk of data loss, and improve availability.
# High-Level Idea
The core idea is to evolve the per-Tenant S3 state to an MVCC-like scheme, allowing multiple pageservers to operate on the same tenant S3 state without interference. To make changes to S3, pageservers acquire long-running transactions from Control Plane. After opening a transaction, Pageservers make PUTs directly against S3, but the keys include the transaction ID, so overwrites never happen. Periodically, pageservers talk back to Control Plane to commit their transaction. This is where Control Plane enforces strict linearizability, favoring availability over work-conservation: commit is only granted if no transaction started after the one that's requesting commit. Garbage collection is done through deadlists, and it's simplified tremendously by the above commit grant/reject policy.
Minimal changes are required for safekeepers to allow WAL for a single timeline be consumed by more than one pageserver without premature truncation.
**Above scheme makes it safe to attach tenants without a 100% correct node failure detection mechanism. Further, it makes it safe to interleave tenant-attachment to pageservers, unlocking new capabilities for (internal) product features:**
- **Fast, Zero-Toil Failover on Network Partitions or Instance Failure**: if a pageserver is not reachable (network partition, hardware failure, overload) we want to spread its attached tenants to new pageservers to restore availability, within the range of *seconds*. We cannot afford gracious timeouts to maximize the probability that the unreachable pageserver has ceased writing to S3. This proposal enables us to attach the tenants to the replacement pageservers, and redirect their computes, without having to wait for confirmation that the unreachable pageserver has ceased writing to S3.
- **Zero-Downtime Relocation:** we want to be able to relocate tenants to different pageservers with minimal impact on availability or latency. This proposal enables us to attach the relocating Tenant to the destination Pageserver before detaching it from the source Pageserver. This can help minimize downtime because we can wait for the destination to catch up on WAL processing before redirecting Computes.
# Design
The core idea is to evolve the per-Tenant S3 state to a per-tenant MVCC-like scheme.
To make S3 changes for a given tenant, Pageserver requests a transaction ID from control plane for that tenant. Without a transaction ID, Pageserver does not write to S3.
Once Pageserver has received a transaction ID, it is allowed to produce new objects and overwrite objects created in this transaction. Pageserver is not allowed to delete any objects; instead, it marks the object as deleted by appending the key to the transaction's deadlist for later deletion. Commits of transactions are serialized through Control Plane: when Pageserver wants to commit a transaction, it sends an RPC to Control Plane. Control Plane responds with a commit grant or commit reject message. Commit grant means that the transaction's changes are now visible to subsequent transactions. Commit reject means that the transaction's changes are not and never will be visible to another Pageserver instance, and the rejected Pageserver is to cease further activity on that tenant.
## Commit grant/reject policy
For the purposes of Pageserver, we want **linearizability** of a tenant's S3 state. Since our transactions are scoped per tenant, it is sufficient for linearizability to grant commit if and only if no other transaction has been started since the commit-requesting transaction started.
For example, consider the case of a single tenant, attached to Pageserver A. Pageserver A has an open transaction but becomes unresponsive. Control Plane decides to relocate the tenant to another Pageserver B. It need *not* wait for A to be 100%-certainly down before B can start uploading to S3 for that tenant. Instead, B can start a new transaction right away, make progress, and get commit grants. What about A? The transaction is RejectPending in Control Plane until A eventually becomes responsive again, tries to commit, gets a rejection, acknowledges it, and thus its transaction becomes RejectAcknowledged. If A is definitively dead, the operator can also force-transition from state RejectPending to RejectAcknowledged. But critically, Control Plane doesn't have to wait for A's transaction to become RejectAcknowledged before attaching the tenant to B.
```mermaid
sequenceDiagram
participant CP
participant A
participant S3
participant B
CP -->> A: attach tenant
activate A
A -->> CP: start txn
CP -->> A: txn=23, last_committed_txn=22
Note over CP,A: network partition
CP --x A: heartbeat
CP --x A: heartbeat
Note over CP: relocate tenant to avoid downtime
CP -->> B: attach tenant
activate B
B -->> CP: start txn
Note over CP: mark A's txn 23 as RejectPending
CP -->> B: txn=24, last-committed txn is 22
B -->> S3: PUT X.layer.24<br>PUT index_part.json.24 referencing X.layer.24
B -->> CP: request commit
CP -->> B: granted
B -->> CP: start txn
CP -->> B: txn=25, last_committed_txn=22
A -->> S3: PUT Y.layer.23 <br> PUT index_part.json.23 referencing Y.layer.23
A --x CP: request commit
A --x CP: request commit
Note over CP,A: partition is over
A -->> CP: request commit
Note over CP: most recently started txn is 25, not 23, reject
CP -->> A: reject
A -->> CP: acknowledge reject
Note over CP: mark A's txn 23 as RejectAcknowledged
deactivate A
B -->> S3: PUT 000-FFF_X-Y.layer.25<br>...
deactivate B
```
If a Pageserver gets a rejection to a commit request, it acknowledges rejection and cedes further S3 uploads for the tenant, until it receives a `/detach` request for the tenant (control plane has most likely attached the tenant to another pageserver in the meantime).
In practice, Control Plane will probably extend the commit grant/reject schema above, taking into account the pageserver to which it last attached the tenant. In the above example, Control Plane could remember that the pageserver that is supposed to host the tenant is pageserver B, and reject start-txn and commit requests from pageserver A. It would also use such requests from A as a signal that A is reachable again, and retry the `/detach` .
<aside>
💡 A commit failure causes the tenant to become effectively `Broken`. Pageserver should persist this locally so it doesn't bother ControlPlane for a new txn when Pageserver is restarted.
</aside>
## Visibility
We mentioned earlier that once a transaction commits, its changes are visible to subsequent transactions. But how does a given transaction know where to look for the data? There is no longer a single `index_part.json` per timeline, or a single `timelines/:timeline_id` prefix to look for; they're all multi-versioned, suffixed by the txn number.
The solution is: at transaction start, Pageserver receives the last-committed transaction ID from Control Plane (`last_committed_txn` in the diagram). `last_committed_txn` is the upper bound for what is visible for the current transaction. Control Plane keeps track of each open transaction's `last_committed_txn` for purposes of garbage collection (see later paragraph).
Equipped with last_committed_txn, Pageserver then discovers
- the current index part of a timeline at `tenants/:tenant_id/timelines/:timeline_id/index_part.json.$last_committed_txn`. The `index_part.json.$last_committed_txn` has the exact same contents as the current architecture's index_part.json, i.e. full list of layers.
- the list of existent timelines as part of the `attach` RPC from CP;
There is no other S3 state per tenant, so, that's all the visibility required.
An alternative to receiving the list of existent timelines from CP is to introduce a proper **SetOfTimelines** object in S3, and multi-version it just like above. For example, we could have a `tenants/:tenant_id/timelines.json.$txn` file that references `index_part.json.$last_committed_txn`. It can be added later if more separation between CP and PS is desired.
So, the only MVCCed object types in this proposal are LayerFile and IndexPart (=individual timeline), but not the SetOfTimelines in a given tenant. Is this a problem? For example, the Pageserver's garbage collection code needs to know the full set of timelines of a tenant. Otherwise it'll make incorrect decisions. What if Pageserver A knows about timelines {R,S}, but another Pageserver B created an additional branch T, so, its set of timelines is {R,S,T}. Both pageservers will run GC code, and so, PS A may decide to delete a layer that's still needed for branch T. Not a problem with this proposal, because the effect of GC (i.e., layer deletion) is properly MVCCed.
## Longevity Of Transactions & Availability
Pageserver depends on Control Plane to start a new transaction. If ControlPlane is down, no new transactions can be started.
Pageservers commit transactions based on a maximum amount of uncommitted changes that have accumulated in S3. A lower maximum increases dependence and load on ControlPlane which decreases availability. A higher maximum risks losing more work in the event of failover; the work will have to be re-done in a new transaction on the new node.
Pageservers persist the open txn id in local storage, so that they can resume the transaction after restart, without dependence on Control Plane.
## **Operations**
**PUTs:**
- **layer files**
- current architecture: layer files are supposed to be write-once, but actually, there are edge-cases where we PUT the same layer file name twice; namely if we PUT the file to S3 but crash before uploading the index part that references it; then detach + attach, and re-run compaction, which is non-deterministic.
- this proposal: with transactions, we can now upload layers and index_part.json concurrently, just need to make sure layer file upload is done before we request txn commit.
- **index part** upload: `index_part.json.$txn` may be created and subsequently overwritten multiple times in a transaction; it is an availability/work-loss trade-off how often to request a commit from CP.
**DELETEs**: for deletion, we maintain a deadlist per transaction. It is located at `tenants/:tenant_id/deadlist/deadlist.json.$txn`. It is PUT once before the pageserver requests commit, and not changed after sending the request to commit. An object created in the current txn need not (but can) be on the deadlist — it can be DELETEd immediately because it's not visible to other transactions. An example use case would be an L0 layer that gets compacted within one transaction; or, if we ever start MVCCing the set of timelines of a tenant, a short-lived branch that is created & destroyed within one transaction.
<aside>
**Deadlist Invariant:** if an object is on a deadlist of transaction T, it is not referenced from anywhere else in the full state visible to T or any later started transaction > T.
</aside>
### Rationale For Deadlist.json
Given that this proposal only MVCCs layers and indexparts, one may ask why the deadlist isn't part of indexpart. The reason is to not lose generality: the deadlist is just a list of keys; it is not necessary to understand the data format of the versioned object to process the deadlist. This is important for garbage collection / vacuuming, which we'll come to in the next section.
## Garbage Collection / Vacuuming
After a transaction has reached reject-acknowledged state, Control Plane initiates a garbage collection procedure for the aborted transaction.
Control Plane is in the unique position of knowing all transaction states. Here is a sketch of the exact transaction states and what Control Plane keeps track of.
```
struct Tenant {
...
txns: HashMap<TxnId, Transaction>,
// the most recently started txn's id; only most recently started can win
next_winner_txn: Option<TxnId>,
}
struct Transaction {
id: TxnId, // immutable
last_committed_txn: TxnId, // immutable; the most recent txn in state `Committed`
// when self was started
pageserver_id: PageserverId,
state: enum {
Open,
Committed,
RejectPending,
RejectAcknowledged, // invariant: we know all S3 activity has ceased
GarbageCollected,
}
}
```
Object creations & deletions by a rejected transaction have never been visible to other transactions. That is true for both RejectPending and RejectAcknowledged states. The difference is that, in RejectPending, the pageserver may still be uploading to S3, whereas in RejectAcknowledged, Control Plane can be certain that all S3 activity in the name of that transaction has ceased. So, once a transaction reaches the RejectAcknowledged state, it is safe to DELETE all objects created by that transaction, and discard the transaction's deadlists.
A transaction T in state Committed has subsequent transactions that may or may not reference the objects it created. None of the subsequent transactions can reference the objects on T's deadlist, though, as per the Deadlist Invariant (see previous section).
So, for garbage collection, we need to assess transactions in state Committed and RejectAcknowledged:
- Committed: delete objects on the deadlist.
- We don't need a LIST request here, the deadlist is sufficient. So, it's really cheap.
- This is **not true MVCC garbage collection**; by deleting the objects on Committed transaction T's deadlist, we might delete data referenced by other transactions that were concurrent with T, i.e., they started while T was still open. However, the fact that T is committed means that the other transactions are RejectPending or RejectAcknowledged, so, they don't matter. Pageservers executing these doomed RejectPending transactions must handle 404 for GETs gracefully, e.g., by trying to commit txn so they observe the rejection they're destined to get anyways. 404s for RejectAcknowledged is handled below.
- RejectAcknowledged: delete all objects created in that txn, and discard deadlists.
- 404s / object-already-deleted type messages must be expected because of Committed garbage collection (see above)
- How to get this list of objects created in a txn? Open but solvable design question; Ideas:
- **Brute force**: within tenant prefix, search for all keys ending in `.$txn` and delete them.
- **WAL for PUTs**: before a txn PUTs an object, it logs to S3, or some other equivalently durable storage, that it's going to do it. If we log to S3, this means we have to do an additional WAL PUT per “real” PUT.
- **LIST with reorged S3 layout (preferred one right now):** lay out the S3 key space such that `$txn` comes first, i.e., `tenants/:tenant_id/$txn/timelines/:timeline_id/*.json.$txn`. That way, when we need to GC a RejectAcknowledged txn, we just LIST the entire `tenants/:tenant_id/$txn` prefix and delete it. The cost of GC for RejectAcknowledged transactions is thus proportional to the number of objects created in that transaction.
## Branches
This proposal only MVCCs layer files and index_part.json, but leaves the tenant object not-MVCCed. We argued earlier that it's fine to ignore this for now, because
1. Control Plane can act as source-of-truth for the set of timelines, and
2. The only operation that makes decision based on “set of timelines” is GC, which in turn only does layer deletions, and layer deletions ***are*** properly MVCCed.
Now that we've introduced garbage collection, let's elaborate a little more on (2). Recall our example from earlier: Pageserver A knows about timelines {R,S}, but another Pageserver B created an additional branch T, so, its set of timelines is {R,S,T}. Both pageservers will run GC code, and so, PS A may decide to delete a layer that's still needed for branch T.
How does the MVCCing of layer files protect us here? If A decides to delete that layer, it's just on A's transaction's deadlist, but still present in S3 and usable by B. If A commits first, B won't be able to commit and the layers in timeline T will be vacuumed. If B commits first, A's deadlist is discarded and the layer continues to exist.
## Safekeeper Changes
We need to teach the safekeepers that there can be multiple pageservers requesting WAL for the same timeline, in order to prevent premature WAL truncation.
In the current architecture, the Safekeeper service assumes only one Pageserver and is allowed to prune WAL older than that Pageserver's `remote_consistent_lsn`. Safekeeper currently learns the `remote_consistent_lsn` through the walreceiver protocol.
So, if we have a tenant attached to two pageservers at the same time, they will both try to stream WAL and the Safekeeper will get confused about which connection's `remote_consistent_lsn` to use as a basis for WAL pruning.
What do we need to change to make it work? We need to make sure that the Safekeepers only prune WAL up to the `remote_consistent_lsn` of the last-committed transaction.
The straight-forward way to get it is to re-design WAL pruning as follows:
1. Pageserver reports remote_consistent_lsn as part of transaction commit to Control Plane.
2. Control Plane makes sure transaction state update is persisted.
3. Control Plane (asynchronous to transaction commit) reconciles with Safekeepers to ensure WAL pruning happens.
The above requires non-trivial changes, but, in the light of other planned projects such as restore-tenant-from-safekeeper-wal-backups, I think Control Plane will need to get involved in WAL pruning anyways.
# How This Proposal Unlocks Future Features
Let us revisit the example from the introduction where we were thinking about handling network partitions. Network partitions need to be solved first, because they're unavoidable in distributed systems. We did that. Now let's see how we can solve actual product problems:
## **Fast, Zero-Toil Failover on Network Partitions or Instance Failure**
The “Problem Statement” section outlined the current architecture's problems with regards to network partitions or instance failure: it requires a 100% correct node-dead detector to make decisions, which doesn't exist in reality. We rely instead on human toil: an oncall engineer has to inspect the situation and make a decision, which may be incorrect and in any case take time in the order of minutes, which means equivalent downtime for users.
With this proposal, automatic failover for pageservers is trivial:
If a pageserver is unresponsive from Control Plane's / Compute's perspective, Control Plane does the following:
- attach all tenants of the unresponsive pageserver to new pageservers
- switch over these tenants' computes immediately;
At this point, availability is restored and user pain relieved.
What's left is to somehow close the doomed transaction of the unresponsive pageserver, so that it becomes RejectAcknowledged, and GC can make progress. Since S3 is cheap, we can afford to wait a really long time here, especially if we put a soft bound on the amount of data a transaction may produce before it must commit. Procedure:
1. Ensure the unresponsive pageserver is taken out of rotation for new attachments. That probably should happen as part of the routine above.
2. Make a human operator investigate and decide what to do (next morning, NO ONCALL ALERT):
1. Inspect the instance, investigate logs, understand root cause.
2. Try to re-establish connectivity between pageserver and Control Plane so that pageserver can retry commits, get rejected, ack rejection ⇒ enable GC.
3. Use below procedure to decommission pageserver.
### Decommissioning A Pageserver (Dead or Alive-but-Unresponsive)
The solution, enabled by this proposal:
1. Ensure that pageserver's S3 credentials are revoked so that it cannot make new uploads, which wouldn't be tracked anywhere.
2. Let enough time pass for the S3 credential revocation to propagate. Amazon doesn't give a guarantee here. As stated earlier, we can easily afford to wait here.
3. Mark all Open and RejectPending transactions of that pageserver as RejectAcknowledged.
Revocation of the S3 credentials is required so that, once we transition all the transactions of that pageserver to RejectAcknowledged, one garbage-collection pass is guaranteed to delete all objects that will ever exist for that pageserver. That way, we need not check *GarbageCollected* transactions ever again.
## Workflow: Zero-Downtime Relocation
With zero-downtime relocation, the goal is to have the target pageserver warmed up, i.e., at the same `last_record_lsn` as the source pageserver, before switching over Computes from source to target pageserver.
With this proposal, it works like so:
1. Grant source pageserver its last open transaction. This one is doomed to be rejected later, unless the relocation fails.
2. Grant target pageserver its first open transaction.
3. Have target pageserver catch up on WAL, streaming from last-committed-txn's remote_consistent_lsn onwards.
4. Once target pageserver reports `last_record_lsn` close enough to source pageserver, target pageserver requests commit.
5. Drain compute traffic from source to target pageserver. (Source can still answer requests until it tries to commit and gets reject, so, this will be quite smooth).
Note that as soon as we complete step (4), the source pageserver's transaction is doomed to be rejected later. Conversely, if the target can't catch up fast enough, the source will make a transaction commit earlier. This will generally happen if there is a lot of write traffic coming in. The design space to make things smooth here is large, but well explored in other areas of computing, e.g., VM live migration. We have all the important policy levers at hand, e.g.,
- delaying source commits if we see target making progress
- slowing down source consumption (need some signalling mechanism for it)
- slowing down compute wal generation
-
It doesn't really matter, what's important is that two pageservers can overlap.
# Additional Trade-Offs / Remarks Brought Up During Peer Review
This proposal was read by and discussed with @Stas and @Dmitry Rodionov prior to publishing it with the broader team. (This does not mean they endorse this proposal!).
Issues that we discussed:
1. **Frequency of transactions:** If even idle tenants commit every 10min or so, that's quite a lot of load on Control Plane. Can we minimize it by Equating Transaction Commit Period to Attachment Period? I.e. start txn on attach, commit on detach?
1. Would be nice, but, if a tenant is attached for 1 month, then PS dies, we lose 1 month of work.
2. ⇒ my solution to this problem: Adjusted this proposal to make transaction commit frequency proportional to amount of uncommitted data.
1. It's ok to spend resources on active users, they pay us money to do it!
2. The amount of work per transaction is minimal.
1. In current Control Plane, it's a small database transaction that is super unlikely to conflict with other transactions.
2. I have very little concerns about scalability of the commit workload on CP side because it's trivially horizontally scalable by sharding by tenant.
3. There's no super stringent availability requirement on control plane; if a txn can't commit because it can't reach the CP, PS can continue & retry in the background, speculating that it's CP downtime and not PS-partitioned-off scenario.
4. Without stringent availability requirement, there's flexibility for future changes to CP-side-implementation.
2. **Does this proposal address mirroring / no-performance-degradation failover?**
1. No it doesn't. It only provides the building block for attaching a tenant to a new pageserver without having to worry that the tenant is detached on the old pageserver.
2. A simple scheme to build no-performance-degradation failover on top of this proposal is to have an asynchronous read-only replica of a tenant on another pageserver in the same region.
3. Another more ambitious scheme to get no-performance-degradation would be [One-Pager: Layer File Spreading (Christian)](https://www.notion.so/One-Pager-Layer-File-Spreading-Christian-eb6b64182a214e11b3fceceee688d843?pvs=21); this proposal would be used in layer file spreading for risk-free automation of TenantLeader failover, which hasn't been addressed there.
4. In any way, failover would restart from an older S3 state, and need to re-ingest WAL before being able to serve recently written pages.
1. Is that a show-stopper? I think not.
2. Is it suboptimal? Absolutely: if a pageserver instance fails, all its tenants will be distributed among the remaining pageservers (OK), and all these tenants will ask the safekeepers for WAL at the same time (BAD). So, pageserver instance failure will cause a load spike in safekeepers.
1. Personally I think that's an OK trade-off to make.
2. There are countless options to avoid / mitigate the load spike. E.g., pro-actively streaming WAL to the standby read-only replica.
3. **Does this proposal allow multiple writers for a tenant?**
1. In abstract terms, this proposal provides a linearized history for a given S3 prefix.
2. In concrete terms, this proposal provides a linearized history per tenant.
3. There can be multiple writers at a given time, but only one of them will win to become part of the linearized history.
4. **Alternative ideas mentioned during meetings that should be turned into a written proposal like this one:**
1. @Dmitry Rodionov : having linearized storage of index_part.json in some database that allows serializable transactions / atomic compare-and-swap PUT
2. @Dmitry Rodionov :
3. @Stas : something like this scheme, but somehow find a way to equate attachment duration with transaction duration, without losing work if pageserver dies months after attachment.

View File

@@ -10,9 +10,6 @@ chrono.workspace = true
serde.workspace = true
serde_with.workspace = true
serde_json.workspace = true
regex.workspace = true
utils = { path = "../utils" }
remote_storage = { version = "0.1", path = "../remote_storage/" }
workspace_hack.workspace = true

View File

@@ -68,45 +68,13 @@ where
/// Response of the /metrics.json API
///
/// Every field is a plain `u64`; the unit of each value is given by the
/// field-name suffix (`_ms`, `_micros`, `_bytes`) or by its comment.
#[derive(Clone, Debug, Default, Serialize)]
pub struct ComputeMetrics {
/// Time spent waiting in pool
pub wait_for_spec_ms: u64,
/// Time spent checking if safekeepers are synced
pub sync_sk_check_ms: u64,
/// Time spent syncing safekeepers (walproposer.c).
/// In most cases this should be zero.
pub sync_safekeepers_ms: u64,
/// Time it took to establish a pg connection to the pageserver.
/// This is two roundtrips, so it's a good proxy for compute-pageserver
/// latency. The latency is usually 0.2ms, but it's not safe to assume
/// that.
pub pageserver_connect_micros: u64,
/// Time to get basebackup from pageserver and write it to disk.
pub basebackup_ms: u64,
/// Compressed size of basebackup received.
pub basebackup_bytes: u64,
/// Time spent starting postgres. This includes initialization of shared
/// buffers, preloading extensions, and other pg operations.
pub start_postgres_ms: u64,
/// Time spent applying pg catalog updates that were made in the console
/// UI. This should be 0 when startup time matters, since cplane tries
/// to do these updates eagerly, and passes the skip_pg_catalog_updates
/// when it's safe to skip this step.
pub config_ms: u64,
/// Total time, from when we receive the spec to when we're ready to take
/// pg connections.
pub total_startup_ms: u64,
/// NOTE(review): presumably the time spent loading extensions, in
/// milliseconds -- confirm against the code that sets it.
pub load_ext_ms: u64,
/// NOTE(review): presumably the number of extensions downloaded -- confirm
/// against the code that sets it.
pub num_ext_downloaded: u64,
pub largest_ext_size: u64, // these are measured in bytes
pub total_ext_download_size: u64,
}
/// Response of the `/computes/{compute_id}/spec` control-plane API.

View File

@@ -3,16 +3,11 @@
//! The spec.json file is used to pass information to 'compute_ctl'. It contains
//! all the information needed to start up the right version of PostgreSQL,
//! and connect it to the storage nodes.
use std::collections::HashMap;
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
use regex::Regex;
use remote_storage::RemotePath;
/// String type alias representing Postgres identifier and
/// intended to be used for DB / role names.
pub type PgIdent = String;
@@ -65,56 +60,6 @@ pub struct ComputeSpec {
/// If set, 'storage_auth_token' is used as the password to authenticate to
/// the pageserver and safekeepers.
pub storage_auth_token: Option<String>,
// information about available remote extensions
pub remote_extensions: Option<RemoteExtSpec>,
}
/// Description of the remote extensions available to a compute, carried in
/// `ComputeSpec::remote_extensions`.
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
pub struct RemoteExtSpec {
// NOTE(review): presumably the names of extensions available to everyone;
// not read by any code in this file section -- confirm against callers.
pub public_extensions: Option<Vec<String>>,
// NOTE(review): presumably the names of extensions specific to this
// tenant/compute; not read by any code in this file section -- confirm.
pub custom_extensions: Option<Vec<String>>,
// Maps a library name (without the `.so...` suffix) to the name of the
// extension that provides it; `get_ext` uses the value as a key into
// `extension_data`.
pub library_index: HashMap<String, String>,
// Per-extension data, keyed by the extension's real name.
pub extension_data: HashMap<String, ExtensionData>,
}
/// Data describing a single remote extension archive.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ExtensionData {
// String-to-string map; in the sample spec the keys are control file names
// (e.g. `anon.control`) and the values are the contents of those files.
pub control_data: HashMap<String, String>,
// Path of the extension archive; `RemoteExtSpec::get_ext` parses it with
// `RemotePath::from_string`.
pub archive_path: String,
}
impl RemoteExtSpec {
    /// Resolves an extension or library name to the extension's real name and
    /// the remote path of its archive.
    ///
    /// * `ext_name` - name of the extension, or of a shared library when
    ///   `is_library` is true (e.g. `postgis-3`, `postgis-3.so`).
    /// * `is_library` - when true, `ext_name` is first stripped of everything
    ///   from the first `.so` onwards and then translated to the owning
    ///   extension's name through `library_index`.
    ///
    /// Returns `(real_extension_name, archive_path)`.
    ///
    /// # Errors
    ///
    /// Fails when the library is missing from `library_index`, when the
    /// extension is missing from `extension_data`, or when the stored
    /// `archive_path` is rejected by `RemotePath::from_string`.
    pub fn get_ext(
        &self,
        ext_name: &str,
        is_library: bool,
    ) -> anyhow::Result<(String, RemotePath)> {
        let mut real_ext_name = ext_name;
        if is_library {
            // sometimes library names might have a suffix like
            // library.so or library.so.3. We strip this off
            // because library_index is based on the name without the file extension
            let strip_lib_suffix = Regex::new(r"\.so.*").unwrap();
            let lib_raw_name = strip_lib_suffix.replace(real_ext_name, "").to_string();

            // `ok_or_else` builds (and formats) the error only when the lookup
            // actually fails, instead of on every call.
            real_ext_name = self
                .library_index
                .get(&lib_raw_name)
                .ok_or_else(|| anyhow::anyhow!("library {} is not found", lib_raw_name))?;
        }

        let ext_data = self
            .extension_data
            .get(real_ext_name)
            .ok_or_else(|| anyhow::anyhow!("real_ext_name {} is not found", real_ext_name))?;

        Ok((
            real_ext_name.to_string(),
            RemotePath::from_string(&ext_data.archive_path)?,
        ))
    }
}
#[serde_as]

View File

@@ -205,43 +205,5 @@
"name": "zenith new",
"new_name": "zenith \"new\""
}
],
"remote_extensions": {
"library_index": {
"anon": "anon",
"postgis-3": "postgis",
"libpgrouting-3.4": "postgis",
"postgis_raster-3": "postgis",
"postgis_sfcgal-3": "postgis",
"postgis_topology-3": "postgis",
"address_standardizer-3": "postgis"
},
"extension_data": {
"anon": {
"archive_path": "5834329303/v15/extensions/anon.tar.zst",
"control_data": {
"anon.control": "# PostgreSQL Anonymizer (anon) extension\ncomment = ''Data anonymization tools''\ndefault_version = ''1.1.0''\ndirectory=''extension/anon''\nrelocatable = false\nrequires = ''pgcrypto''\nsuperuser = false\nmodule_pathname = ''$libdir/anon''\ntrusted = true\n"
}
},
"postgis": {
"archive_path": "5834329303/v15/extensions/postgis.tar.zst",
"control_data": {
"postgis.control": "# postgis extension\ncomment = ''PostGIS geometry and geography spatial types and functions''\ndefault_version = ''3.3.2''\nmodule_pathname = ''$libdir/postgis-3''\nrelocatable = false\ntrusted = true\n",
"pgrouting.control": "# pgRouting Extension\ncomment = ''pgRouting Extension''\ndefault_version = ''3.4.2''\nmodule_pathname = ''$libdir/libpgrouting-3.4''\nrelocatable = true\nrequires = ''plpgsql''\nrequires = ''postgis''\ntrusted = true\n",
"postgis_raster.control": "# postgis_raster extension\ncomment = ''PostGIS raster types and functions''\ndefault_version = ''3.3.2''\nmodule_pathname = ''$libdir/postgis_raster-3''\nrelocatable = false\nrequires = postgis\ntrusted = true\n",
"postgis_sfcgal.control": "# postgis topology extension\ncomment = ''PostGIS SFCGAL functions''\ndefault_version = ''3.3.2''\nrelocatable = true\nrequires = postgis\ntrusted = true\n",
"postgis_topology.control": "# postgis topology extension\ncomment = ''PostGIS topology spatial types and functions''\ndefault_version = ''3.3.2''\nrelocatable = false\nschema = topology\nrequires = postgis\ntrusted = true\n",
"address_standardizer.control": "# address_standardizer extension\ncomment = ''Used to parse an address into constituent elements. Generally used to support geocoding address normalization step.''\ndefault_version = ''3.3.2''\nrelocatable = true\ntrusted = true\n",
"postgis_tiger_geocoder.control": "# postgis tiger geocoder extension\ncomment = ''PostGIS tiger geocoder and reverse geocoder''\ndefault_version = ''3.3.2''\nrelocatable = false\nschema = tiger\nrequires = ''postgis,fuzzystrmatch''\nsuperuser= false\ntrusted = true\n",
"address_standardizer_data_us.control": "# address standardizer us dataset\ncomment = ''Address Standardizer US dataset example''\ndefault_version = ''3.3.2''\nrelocatable = true\ntrusted = true\n"
}
}
},
"custom_extensions": [
"anon"
],
"public_extensions": [
"postgis"
]
}
]
}

View File

@@ -5,7 +5,7 @@ use chrono::{DateTime, Utc};
use rand::Rng;
use serde::Serialize;
#[derive(Serialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
#[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
#[serde(tag = "type")]
pub enum EventType {
#[serde(rename = "absolute")]
@@ -17,32 +17,6 @@ pub enum EventType {
},
}
impl EventType {
pub fn absolute_time(&self) -> Option<&DateTime<Utc>> {
use EventType::*;
match self {
Absolute { time } => Some(time),
_ => None,
}
}
pub fn incremental_timerange(&self) -> Option<std::ops::Range<&DateTime<Utc>>> {
// these can most likely be thought of as Range or RangeFull
use EventType::*;
match self {
Incremental {
start_time,
stop_time,
} => Some(start_time..stop_time),
_ => None,
}
}
pub fn is_incremental(&self) -> bool {
matches!(self, EventType::Incremental { .. })
}
}
#[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
pub struct Event<Extra> {
#[serde(flatten)]
@@ -57,7 +31,7 @@ pub struct Event<Extra> {
pub extra: Extra,
}
pub fn idempotency_key(node_id: &str) -> String {
pub fn idempotency_key(node_id: String) -> String {
format!(
"{}-{}-{:04}",
Utc::now(),
@@ -71,6 +45,6 @@ pub const CHUNK_SIZE: usize = 1000;
// Just a wrapper around a slice of events
// to serialize it as `{"events" : [ ] }
#[derive(serde::Serialize)]
pub struct EventChunk<'a, T: Clone> {
pub events: std::borrow::Cow<'a, [T]>,
pub struct EventChunk<'a, T> {
pub events: &'a [T],
}

View File

@@ -179,7 +179,7 @@ pub struct FeExecuteMessage {
#[derive(Debug)]
pub struct FeCloseMessage;
/// An error occurred while parsing or serializing raw stream into Postgres
/// An error occurred while parsing or serializing raw stream into Postgres
/// messages.
#[derive(thiserror::Error, Debug)]
pub enum ProtocolError {

View File

@@ -20,7 +20,6 @@ tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
tokio-util.workspace = true
toml_edit.workspace = true
tracing.workspace = true
scopeguard.workspace = true
metrics.workspace = true
utils.workspace = true
pin-project-lite.workspace = true

View File

@@ -65,10 +65,6 @@ impl RemotePath {
Ok(Self(relative_path.to_path_buf()))
}
/// Builds a `RemotePath` from a string slice by wrapping it in a `Path` and
/// delegating to `Self::new`; any error comes from `Self::new`.
pub fn from_string(relative_path: &str) -> anyhow::Result<Self> {
Self::new(Path::new(relative_path))
}
/// Returns `base_path` joined with this remote path as an owned `PathBuf`.
pub fn with_base(&self, base_path: &Path) -> PathBuf {
base_path.join(&self.0)
}
@@ -194,20 +190,6 @@ pub enum GenericRemoteStorage {
}
impl GenericRemoteStorage {
/// A function for listing all the files in a "directory"
/// Example:
/// list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
///
/// Dispatches to the `list_files` implementation of whichever storage
/// backend this enum value wraps.
pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
match self {
Self::LocalFs(s) => s.list_files(folder).await,
Self::AwsS3(s) => s.list_files(folder).await,
Self::Unreliable(s) => s.list_files(folder).await,
}
}
// lists common *prefixes*, if any of files
// Example:
// list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"]
pub async fn list_prefixes(
&self,
prefix: Option<&RemotePath>,
@@ -219,6 +201,14 @@ impl GenericRemoteStorage {
}
}
/// Lists the files under `folder` by dispatching to the `list_files`
/// implementation of whichever storage backend this enum value wraps.
// NOTE(review): the meaning of `folder == None` is decided by the backend
// implementations, which are not visible here -- confirm before relying on it.
pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
match self {
Self::LocalFs(s) => s.list_files(folder).await,
Self::AwsS3(s) => s.list_files(folder).await,
Self::Unreliable(s) => s.list_files(folder).await,
}
}
pub async fn upload(
&self,
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,

View File

@@ -10,7 +10,6 @@ use anyhow::Context;
use aws_config::{
environment::credentials::EnvironmentVariableCredentialsProvider,
imds::credentials::ImdsCredentialsProvider, meta::credentials::CredentialsProviderChain,
provider_config::ProviderConfig, web_identity_token::WebIdentityTokenCredentialsProvider,
};
use aws_credential_types::cache::CredentialsCache;
use aws_sdk_s3::{
@@ -23,7 +22,6 @@ use aws_sdk_s3::{
};
use aws_smithy_http::body::SdkBody;
use hyper::Body;
use scopeguard::ScopeGuard;
use tokio::{
io::{self, AsyncRead},
sync::Semaphore,
@@ -38,9 +36,82 @@ use crate::{
const MAX_DELETE_OBJECTS_REQUEST_SIZE: usize = 1000;
pub(super) mod metrics;
pub(super) mod metrics {
use metrics::{register_int_counter_vec, IntCounterVec};
use once_cell::sync::Lazy;
use self::metrics::{AttemptOutcome, RequestKind};
/// Counter of issued S3 requests, registered lazily on first use under the
/// name `remote_storage_s3_requests_count` and labelled by `request_type`.
static S3_REQUESTS_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"remote_storage_s3_requests_count",
"Number of s3 requests of particular type",
&["request_type"],
)
.expect("failed to define a metric")
});
/// Counter of failed S3 requests, registered lazily on first use under the
/// name `remote_storage_s3_failures_count` and labelled by `request_type`.
static S3_REQUESTS_FAIL_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"remote_storage_s3_failures_count",
"Number of failed s3 requests of particular type",
&["request_type"],
)
.expect("failed to define a metric")
});
/// Counts one `get_object` request.
pub fn inc_get_object() {
S3_REQUESTS_COUNT.with_label_values(&["get_object"]).inc();
}
/// Counts one failed `get_object` request.
pub fn inc_get_object_fail() {
S3_REQUESTS_FAIL_COUNT
.with_label_values(&["get_object"])
.inc();
}
/// Counts one `put_object` request.
pub fn inc_put_object() {
S3_REQUESTS_COUNT.with_label_values(&["put_object"]).inc();
}
/// Counts one failed `put_object` request.
pub fn inc_put_object_fail() {
S3_REQUESTS_FAIL_COUNT
.with_label_values(&["put_object"])
.inc();
}
/// Counts one `delete_object` request.
pub fn inc_delete_object() {
S3_REQUESTS_COUNT
.with_label_values(&["delete_object"])
.inc();
}
/// Adds `count` to the request counter. Note that this uses the same
/// `delete_object` label as `inc_delete_object`, so single and batched
/// deletions are not distinguishable in the metric.
pub fn inc_delete_objects(count: u64) {
S3_REQUESTS_COUNT
.with_label_values(&["delete_object"])
.inc_by(count);
}
/// Counts one failed `delete_object` request.
pub fn inc_delete_object_fail() {
S3_REQUESTS_FAIL_COUNT
.with_label_values(&["delete_object"])
.inc();
}
/// Adds `count` to the failure counter, again under the shared
/// `delete_object` label.
pub fn inc_delete_objects_fail(count: u64) {
S3_REQUESTS_FAIL_COUNT
.with_label_values(&["delete_object"])
.inc_by(count);
}
/// Counts one `list_objects` request.
pub fn inc_list_objects() {
S3_REQUESTS_COUNT.with_label_values(&["list_objects"]).inc();
}
/// Counts one failed `list_objects` request.
pub fn inc_list_objects_fail() {
S3_REQUESTS_FAIL_COUNT
.with_label_values(&["list_objects"])
.inc();
}
}
/// AWS S3 storage.
pub struct S3Bucket {
@@ -68,29 +139,18 @@ impl S3Bucket {
aws_config.bucket_name
);
let region = Some(Region::new(aws_config.bucket_region.clone()));
let credentials_provider = {
// uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
CredentialsProviderChain::first_try(
"env",
EnvironmentVariableCredentialsProvider::new(),
)
// uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
// needed to access remote extensions bucket
.or_else("token", {
let provider_conf = ProviderConfig::without_region().with_region(region.clone());
WebIdentityTokenCredentialsProvider::builder()
.configure(&provider_conf)
.build()
})
// uses imds v2
.or_else("imds", ImdsCredentialsProvider::builder().build())
};
let mut config_builder = Config::builder()
.region(region)
.region(Region::new(aws_config.bucket_region.clone()))
.credentials_cache(CredentialsCache::lazy())
.credentials_provider(credentials_provider);
@@ -140,56 +200,25 @@ impl S3Bucket {
)
}
pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
let path_string = path
.get_path()
.to_string_lossy()
.trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR)
.to_string();
match &self.prefix_in_bucket {
Some(prefix) => prefix.clone() + "/" + &path_string,
None => path_string,
fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
let mut full_path = self.prefix_in_bucket.clone().unwrap_or_default();
for segment in path.0.iter() {
full_path.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
full_path.push_str(segment.to_str().unwrap_or_default());
}
full_path
}
async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
let started_at = start_counting_cancelled_wait(kind);
let permit = self
.concurrency_limiter
.acquire()
.await
.expect("semaphore is never closed");
let started_at = ScopeGuard::into_inner(started_at);
metrics::BUCKET_METRICS
.wait_seconds
.observe_elapsed(kind, started_at);
permit
}
async fn owned_permit(&self, kind: RequestKind) -> tokio::sync::OwnedSemaphorePermit {
let started_at = start_counting_cancelled_wait(kind);
async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
let permit = self
.concurrency_limiter
.clone()
.acquire_owned()
.await
.expect("semaphore is never closed");
.context("Concurrency limiter semaphore got closed during S3 download")
.map_err(DownloadError::Other)?;
let started_at = ScopeGuard::into_inner(started_at);
metrics::BUCKET_METRICS
.wait_seconds
.observe_elapsed(kind, started_at);
permit
}
async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
let kind = RequestKind::Get;
let permit = self.owned_permit(kind).await;
let started_at = start_measuring_requests(kind);
metrics::inc_get_object();
let get_object = self
.client
@@ -200,33 +229,26 @@ impl S3Bucket {
.send()
.await;
let started_at = ScopeGuard::into_inner(started_at);
if get_object.is_err() {
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
kind,
AttemptOutcome::Err,
started_at,
);
}
match get_object {
Ok(object_output) => {
let metadata = object_output.metadata().cloned().map(StorageMetadata);
Ok(Download {
metadata,
download_stream: Box::pin(io::BufReader::new(TimedDownload::new(
started_at,
RatelimitedAsyncRead::new(permit, object_output.body.into_async_read()),
download_stream: Box::pin(io::BufReader::new(RatelimitedAsyncRead::new(
permit,
object_output.body.into_async_read(),
))),
})
}
Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
Err(DownloadError::NotFound)
}
Err(e) => Err(DownloadError::Other(
anyhow::Error::new(e).context("download s3 object"),
)),
Err(e) => {
metrics::inc_get_object_fail();
Err(DownloadError::Other(anyhow::anyhow!(
"Failed to download S3 object: {e}"
)))
}
}
}
}
@@ -257,54 +279,6 @@ impl<S: AsyncRead> AsyncRead for RatelimitedAsyncRead<S> {
}
}
pin_project_lite::pin_project! {
    /// Times and tracks the outcome of the request.
    ///
    /// When the value is dropped, the time elapsed since `started_at` is
    /// observed in the `req_seconds` metric as a `RequestKind::Get` request
    /// labelled with the current `outcome`.
    struct TimedDownload<S> {
        // Reference point for the duration reported on drop.
        started_at: std::time::Instant,
        // Outcome reported on drop.
        outcome: metrics::AttemptOutcome,
        // The wrapped value; pinned so it can be polled through `project()`.
        #[pin]
        inner: S
    }

    impl<S> PinnedDrop for TimedDownload<S> {
        fn drop(mut this: Pin<&mut Self>) {
            // Runs on every drop, so exactly one observation is made per wrapper
            // regardless of which outcome was recorded.
            metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at);
        }
    }
}
impl<S: AsyncRead> TimedDownload<S> {
    /// Wraps `inner`, remembering when the request was started.
    ///
    /// The outcome starts out as `Cancelled`; it stays that way unless a later
    /// read replaces it.
    fn new(started_at: std::time::Instant, inner: S) -> Self {
        let outcome = metrics::AttemptOutcome::Cancelled;
        Self {
            inner,
            outcome,
            started_at,
        }
    }
}
/// Forwards reads to the wrapped reader while recording how the download ended.
impl<S: AsyncRead> AsyncRead for TimedDownload<S> {
    fn poll_read(
        self: std::pin::Pin<&mut Self>,
        cx: &mut std::task::Context<'_>,
        buf: &mut io::ReadBuf<'_>,
    ) -> std::task::Poll<std::io::Result<()>> {
        let this = self.project();
        // Remember how much of the buffer was filled so a zero-byte read can be detected.
        let before = buf.filled().len();
        // `ready!` returns `Pending` early, leaving the recorded outcome untouched.
        let read = std::task::ready!(this.inner.poll_read(cx, buf));
        // A successful read that added no bytes is treated as end of stream.
        // NOTE(review): a read into a buffer with no remaining capacity would also add
        // no bytes and be recorded as `Ok` -- confirm callers never poll with a full buffer.
        let read_eof = buf.filled().len() == before;
        match read {
            Ok(()) if read_eof => *this.outcome = AttemptOutcome::Ok,
            Ok(()) => { /* still in progress */ }
            Err(_) => *this.outcome = AttemptOutcome::Err,
        }
        std::task::Poll::Ready(read)
    }
}
#[async_trait::async_trait]
impl RemoteStorage for S3Bucket {
/// See the doc for `RemoteStorage::list_prefixes`
@@ -313,8 +287,6 @@ impl RemoteStorage for S3Bucket {
&self,
prefix: Option<&RemotePath>,
) -> Result<Vec<RemotePath>, DownloadError> {
let kind = RequestKind::List;
// get the passed prefix or if it is not set use prefix_in_bucket value
let list_prefix = prefix
.map(|p| self.relative_path_to_s3_object(p))
@@ -331,10 +303,15 @@ impl RemoteStorage for S3Bucket {
let mut document_keys = Vec::new();
let mut continuation_token = None;
loop {
let _guard = self.permit(kind).await;
let started_at = start_measuring_requests(kind);
let _guard = self
.concurrency_limiter
.acquire()
.await
.context("Concurrency limiter semaphore got closed during S3 list")
.map_err(DownloadError::Other)?;
metrics::inc_list_objects();
let fetch_response = self
.client
@@ -346,16 +323,12 @@ impl RemoteStorage for S3Bucket {
.set_max_keys(self.max_keys_per_list_response)
.send()
.await
.map_err(|e| {
metrics::inc_list_objects_fail();
e
})
.context("Failed to list S3 prefixes")
.map_err(DownloadError::Other);
let started_at = ScopeGuard::into_inner(started_at);
metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &fetch_response, started_at);
let fetch_response = fetch_response?;
.map_err(DownloadError::Other)?;
document_keys.extend(
fetch_response
@@ -365,10 +338,10 @@ impl RemoteStorage for S3Bucket {
.filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
);
continuation_token = match fetch_response.next_continuation_token {
Some(new_token) => Some(new_token),
match fetch_response.next_continuation_token {
Some(new_token) => continuation_token = Some(new_token),
None => break,
};
}
}
Ok(document_keys)
@@ -376,8 +349,6 @@ impl RemoteStorage for S3Bucket {
/// See the doc for `RemoteStorage::list_files`
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
let kind = RequestKind::List;
let folder_name = folder
.map(|p| self.relative_path_to_s3_object(p))
.or_else(|| self.prefix_in_bucket.clone());
@@ -386,8 +357,12 @@ impl RemoteStorage for S3Bucket {
let mut continuation_token = None;
let mut all_files = vec![];
loop {
let _guard = self.permit(kind).await;
let started_at = start_measuring_requests(kind);
let _guard = self
.concurrency_limiter
.acquire()
.await
.context("Concurrency limiter semaphore got closed during S3 list_files")?;
metrics::inc_list_objects();
let response = self
.client
@@ -398,14 +373,11 @@ impl RemoteStorage for S3Bucket {
.set_max_keys(self.max_keys_per_list_response)
.send()
.await
.context("Failed to list files in S3 bucket");
let started_at = ScopeGuard::into_inner(started_at);
metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &response, started_at);
let response = response?;
.map_err(|e| {
metrics::inc_list_objects_fail();
e
})
.context("Failed to list files in S3 bucket")?;
for object in response.contents().unwrap_or_default() {
let object_path = object.key().expect("response does not contain a key");
@@ -427,16 +399,18 @@ impl RemoteStorage for S3Bucket {
to: &RemotePath,
metadata: Option<StorageMetadata>,
) -> anyhow::Result<()> {
let kind = RequestKind::Put;
let _guard = self.permit(kind).await;
let _guard = self
.concurrency_limiter
.acquire()
.await
.context("Concurrency limiter semaphore got closed during S3 upload")?;
let started_at = start_measuring_requests(kind);
metrics::inc_put_object();
let body = Body::wrap_stream(ReaderStream::new(from));
let bytes_stream = ByteStream::new(SdkBody::from(body));
let res = self
.client
self.client
.put_object()
.bucket(self.bucket_name.clone())
.key(self.relative_path_to_s3_object(to))
@@ -444,25 +418,19 @@ impl RemoteStorage for S3Bucket {
.content_length(from_size_bytes.try_into()?)
.body(bytes_stream)
.send()
.await;
let started_at = ScopeGuard::into_inner(started_at);
metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &res, started_at);
res?;
.await
.map_err(|e| {
metrics::inc_put_object_fail();
e
})?;
Ok(())
}
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
// if prefix is not none then download file `prefix/from`
// if prefix is none then download file `from`
self.download_object(GetObjectRequest {
bucket: self.bucket_name.clone(),
key: self.relative_path_to_s3_object(from),
range: None,
..GetObjectRequest::default()
})
.await
}
@@ -489,8 +457,11 @@ impl RemoteStorage for S3Bucket {
.await
}
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
let kind = RequestKind::Delete;
let _guard = self.permit(kind).await;
let _guard = self
.concurrency_limiter
.acquire()
.await
.context("Concurrency limiter semaphore got closed during S3 delete")?;
let mut delete_objects = Vec::with_capacity(paths.len());
for path in paths {
@@ -501,7 +472,7 @@ impl RemoteStorage for S3Bucket {
}
for chunk in delete_objects.chunks(MAX_DELETE_OBJECTS_REQUEST_SIZE) {
let started_at = start_measuring_requests(kind);
metrics::inc_delete_objects(chunk.len() as u64);
let resp = self
.client
@@ -511,17 +482,10 @@ impl RemoteStorage for S3Bucket {
.send()
.await;
let started_at = ScopeGuard::into_inner(started_at);
metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &resp, started_at);
match resp {
Ok(resp) => {
metrics::BUCKET_METRICS
.deleted_objects_total
.inc_by(chunk.len() as u64);
if let Some(errors) = resp.errors {
metrics::inc_delete_objects_fail(errors.len() as u64);
return Err(anyhow::format_err!(
"Failed to delete {} objects",
errors.len()
@@ -529,6 +493,7 @@ impl RemoteStorage for S3Bucket {
}
}
Err(e) => {
metrics::inc_delete_objects_fail(chunk.len() as u64);
return Err(e.into());
}
}
@@ -537,89 +502,24 @@ impl RemoteStorage for S3Bucket {
}
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
let paths = std::array::from_ref(path);
self.delete_objects(paths).await
}
}
/// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
///
/// Returns a guard wrapping the `Instant` at which the wait started. Callers that
/// finish waiting take the instant back out with `ScopeGuard::into_inner`; if the
/// guard is dropped instead, the `cancelled_waits` counter for `kind` is incremented.
fn start_counting_cancelled_wait(
    kind: RequestKind,
) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
    // The start instant itself is not needed for counting, hence the ignored argument.
    scopeguard::guard_on_success(std::time::Instant::now(), move |_| {
        metrics::BUCKET_METRICS.cancelled_waits.get(kind).inc()
    })
}
/// On drop (cancellation) add time to [`metrics::BucketMetrics::req_seconds`].
///
/// Returns a guard wrapping the `Instant` at which the request started. Callers
/// that see the request complete take the instant back out with
/// `ScopeGuard::into_inner` and record the real outcome themselves; if the guard
/// is dropped instead, the elapsed time is observed with `AttemptOutcome::Cancelled`.
fn start_measuring_requests(
    kind: RequestKind,
) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
    scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| {
        metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
            kind,
            AttemptOutcome::Cancelled,
            started_at,
        )
    })
}
#[cfg(test)]
mod tests {
use std::num::NonZeroUsize;
use std::path::Path;
use crate::{RemotePath, S3Bucket, S3Config};
#[test]
fn relative_path() {
let all_paths = vec!["", "some/path", "some/path/"];
let all_paths: Vec<RemotePath> = all_paths
.iter()
.map(|x| RemotePath::new(Path::new(x)).expect("bad path"))
.collect();
let prefixes = [
None,
Some(""),
Some("test/prefix"),
Some("test/prefix/"),
Some("/test/prefix/"),
];
let expected_outputs = vec![
vec!["", "some/path", "some/path"],
vec!["/", "/some/path", "/some/path"],
vec![
"test/prefix/",
"test/prefix/some/path",
"test/prefix/some/path",
],
vec![
"test/prefix/",
"test/prefix/some/path",
"test/prefix/some/path",
],
vec![
"test/prefix/",
"test/prefix/some/path",
"test/prefix/some/path",
],
];
for (prefix_idx, prefix) in prefixes.iter().enumerate() {
let config = S3Config {
bucket_name: "bucket".to_owned(),
bucket_region: "region".to_owned(),
prefix_in_bucket: prefix.map(str::to_string),
endpoint: None,
concurrency_limit: NonZeroUsize::new(100).unwrap(),
max_keys_per_list_response: Some(5),
};
let storage = S3Bucket::new(&config).expect("remote storage init");
for (test_path_idx, test_path) in all_paths.iter().enumerate() {
let result = storage.relative_path_to_s3_object(test_path);
let expected = expected_outputs[prefix_idx][test_path_idx];
assert_eq!(result, expected);
}
}
let _guard = self
.concurrency_limiter
.acquire()
.await
.context("Concurrency limiter semaphore got closed during S3 delete")?;
metrics::inc_delete_object();
self.client
.delete_object()
.bucket(self.bucket_name.clone())
.key(self.relative_path_to_s3_object(path))
.send()
.await
.map_err(|e| {
metrics::inc_delete_object_fail();
e
})?;
Ok(())
}
}

View File

@@ -1,191 +0,0 @@
use metrics::{
register_histogram_vec, register_int_counter, register_int_counter_vec, Histogram, IntCounter,
};
use once_cell::sync::Lazy;
pub(super) static BUCKET_METRICS: Lazy<BucketMetrics> = Lazy::new(Default::default);
/// The kinds of S3 request that the metrics in this module distinguish.
///
/// The explicit discriminants double as positions in the four-element array
/// held by `RequestTyped` (see `RequestKind::as_index`), so they must stay
/// dense and start at zero.
#[derive(Clone, Copy, Debug)]
pub(super) enum RequestKind {
    Get = 0,
    Put = 1,
    Delete = 2,
    List = 3,
}
use RequestKind::*;
impl RequestKind {
    /// Returns the string used as the `request_type` label value for this kind.
    const fn as_str(&self) -> &'static str {
        match self {
            Get => "get_object",
            Put => "put_object",
            Delete => "delete_object",
            List => "list_objects",
        }
    }
    /// Returns the enum discriminant as an array index.
    const fn as_index(&self) -> usize {
        *self as usize
    }
}
/// Holds one value of type `C` per [`RequestKind`], stored at the kind's index.
pub(super) struct RequestTyped<C>([C; 4]);

impl<C> RequestTyped<C> {
    /// Returns the value stored for `kind`.
    pub(super) fn get(&self, kind: RequestKind) -> &C {
        let slot = kind.as_index();
        &self.0[slot]
    }

    /// Builds the container by calling `f` once for every request kind, in
    /// index order.
    ///
    /// Panics if a kind's index does not match its position in the array.
    fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
        use RequestKind::*;

        // The array length is part of the type, so listing a different number of
        // kinds than there are slots is rejected at compile time.
        let kinds: [RequestKind; 4] = [Get, Put, Delete, List];
        let values: [C; 4] = std::array::from_fn(|index| {
            let kind = kinds[index];
            assert_eq!(index, kind.as_index());
            f(kind)
        });

        RequestTyped(values)
    }
}
impl RequestTyped<Histogram> {
    /// Observes the seconds elapsed since `started_at` in the histogram for `kind`.
    pub(super) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) {
        let histogram = self.get(kind);
        let elapsed_seconds = started_at.elapsed().as_secs_f64();
        histogram.observe(elapsed_seconds)
    }
}
/// Three per-request-kind containers, one for each `AttemptOutcome`.
pub(super) struct PassFailCancelledRequestTyped<C> {
    // Values used for `AttemptOutcome::Ok`.
    success: RequestTyped<C>,
    // Values used for `AttemptOutcome::Err`.
    fail: RequestTyped<C>,
    // Values used for `AttemptOutcome::Cancelled`.
    cancelled: RequestTyped<C>,
}
/// How an attempted request ended.
#[derive(Debug, Clone, Copy)]
pub(super) enum AttemptOutcome {
    Ok,
    Err,
    Cancelled,
}

/// Maps a borrowed `Result` onto `Ok` / `Err`; `Cancelled` is never produced here.
impl<T, E> From<&Result<T, E>> for AttemptOutcome {
    fn from(value: &Result<T, E>) -> Self {
        if value.is_ok() {
            AttemptOutcome::Ok
        } else {
            AttemptOutcome::Err
        }
    }
}

impl AttemptOutcome {
    /// Returns the lowercase name of the outcome.
    pub(super) fn as_str(&self) -> &'static str {
        match *self {
            Self::Ok => "ok",
            Self::Err => "err",
            Self::Cancelled => "cancelled",
        }
    }
}
impl<C> PassFailCancelledRequestTyped<C> {
    /// Returns the value stored for the given request kind and outcome.
    pub(super) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C {
        match outcome {
            AttemptOutcome::Ok => self.success.get(kind),
            AttemptOutcome::Err => self.fail.get(kind),
            AttemptOutcome::Cancelled => self.cancelled.get(kind),
        }
    }

    /// Builds all three containers by calling `f` for every (kind, outcome)
    /// pair: first all kinds with `Ok`, then with `Err`, then with `Cancelled`.
    fn build_with(mut f: impl FnMut(RequestKind, AttemptOutcome) -> C) -> Self {
        let mut per_outcome =
            |outcome: AttemptOutcome| RequestTyped::build_with(|kind| f(kind, outcome));

        // Struct fields are evaluated in the order written, which keeps the
        // Ok -> Err -> Cancelled call order.
        Self {
            success: per_outcome(AttemptOutcome::Ok),
            fail: per_outcome(AttemptOutcome::Err),
            cancelled: per_outcome(AttemptOutcome::Cancelled),
        }
    }
}
impl PassFailCancelledRequestTyped<Histogram> {
    /// Observes the seconds elapsed since `started_at` in the histogram selected
    /// by `kind` and `outcome`.
    ///
    /// `outcome` may be anything convertible into an `AttemptOutcome`, for
    /// example a borrowed `Result`.
    pub(super) fn observe_elapsed(
        &self,
        kind: RequestKind,
        outcome: impl Into<AttemptOutcome>,
        started_at: std::time::Instant,
    ) {
        let histogram = self.get(kind, outcome.into());
        let elapsed_seconds = started_at.elapsed().as_secs_f64();
        histogram.observe(elapsed_seconds)
    }
}
/// All metrics recorded for S3 bucket requests, grouped in one lazily
/// initialized value (`BUCKET_METRICS`).
pub(super) struct BucketMetrics {
    /// Full request duration until successful completion, error or cancellation.
    pub(super) req_seconds: PassFailCancelledRequestTyped<Histogram>,
    /// Total amount of seconds waited on queue.
    pub(super) wait_seconds: RequestTyped<Histogram>,
    /// Track how many semaphore awaits were cancelled per request type.
    ///
    /// This is in case cancellations are happening more than expected.
    pub(super) cancelled_waits: RequestTyped<IntCounter>,
    /// Total amount of deleted objects in batches or single requests.
    pub(super) deleted_objects_total: IntCounter,
}
/// Registers every metric and resolves all label combinations up front.
///
/// Panics (via `unwrap`) if any of the registrations fails.
impl Default for BucketMetrics {
    fn default() -> Self {
        // Histogram bucket boundaries, shared by both histograms; observations are
        // made in seconds.
        let buckets = [0.01, 0.10, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0];
        let req_seconds = register_histogram_vec!(
            "remote_storage_s3_request_seconds",
            "Seconds to complete a request",
            &["request_type", "result"],
            buckets.to_vec(),
        )
        .unwrap();
        // Shadow the vec with one pre-resolved histogram per (request kind, outcome) pair.
        let req_seconds = PassFailCancelledRequestTyped::build_with(|kind, outcome| {
            req_seconds.with_label_values(&[kind.as_str(), outcome.as_str()])
        });
        let wait_seconds = register_histogram_vec!(
            "remote_storage_s3_wait_seconds",
            "Seconds rate limited",
            &["request_type"],
            buckets.to_vec(),
        )
        .unwrap();
        // One pre-resolved histogram per request kind.
        let wait_seconds =
            RequestTyped::build_with(|kind| wait_seconds.with_label_values(&[kind.as_str()]));
        let cancelled_waits = register_int_counter_vec!(
            "remote_storage_s3_cancelled_waits_total",
            "Times a semaphore wait has been cancelled per request type",
            &["request_type"],
        )
        .unwrap();
        // One pre-resolved counter per request kind.
        let cancelled_waits =
            RequestTyped::build_with(|kind| cancelled_waits.with_label_values(&[kind.as_str()]));
        let deleted_objects_total = register_int_counter!(
            "remote_storage_s3_deleted_objects_total",
            "Amount of deleted objects in total",
        )
        .unwrap();
        Self {
            req_seconds,
            wait_seconds,
            cancelled_waits,
            deleted_objects_total,
        }
    }
}

View File

@@ -71,13 +71,6 @@ impl UnreliableWrapper {
}
}
}
/// Deletes `path` through the wrapped storage.
///
/// When `attempt` is true the operation is first passed to `self.attempt` as a
/// `RemoteOp::Delete`, and an error from that call is returned without touching
/// the wrapped storage. When `attempt` is false that step is skipped.
async fn delete_inner(&self, path: &RemotePath, attempt: bool) -> anyhow::Result<()> {
    if attempt {
        self.attempt(RemoteOp::Delete(path.clone()))?;
    }
    self.inner.delete(path).await
}
}
#[async_trait::async_trait]
@@ -129,15 +122,15 @@ impl RemoteStorage for UnreliableWrapper {
}
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
self.delete_inner(path, true).await
self.attempt(RemoteOp::Delete(path.clone()))?;
self.inner.delete(path).await
}
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
self.attempt(RemoteOp::DeleteObjects(paths.to_vec()))?;
let mut error_counter = 0;
for path in paths {
// Dont record attempt because it was already recorded above
if (self.delete_inner(path, false).await).is_err() {
if (self.delete(path).await).is_err() {
error_counter += 1;
}
}

View File

@@ -19,7 +19,7 @@ static LOGGING_DONE: OnceCell<()> = OnceCell::new();
const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
const BASE_PREFIX: &str = "test";
const BASE_PREFIX: &str = "test/";
/// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries.
/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.

View File

@@ -1,188 +0,0 @@
use std::fmt::{Debug, Display};
use futures::Future;
pub const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
pub const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;
/// Sleeps for the duration given by [`exponential_backoff_duration_seconds`]
/// for the same arguments, logging the wait beforehand.
///
/// Returns immediately, without logging, when the computed duration is not
/// greater than zero.
pub async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) {
    let backoff_duration_seconds =
        exponential_backoff_duration_seconds(n, base_increment, max_seconds);

    // NaN is not greater than zero either, so it is skipped as well.
    if backoff_duration_seconds.is_nan() || backoff_duration_seconds <= 0.0 {
        return;
    }

    tracing::info!(
        "Backoff: waiting {backoff_duration_seconds} seconds before processing with the task",
    );
    let pause = std::time::Duration::from_secs_f64(backoff_duration_seconds);
    tokio::time::sleep(pause).await;
}
/// Computes the backoff, in seconds, to apply before retry number `n`.
///
/// Returns `0.0` for `n == 0`; otherwise `(1 + base_increment)^n`, capped at
/// `max_seconds`.
pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
    match n {
        0 => 0.0,
        attempt => {
            let growth_factor = 1.0 + base_increment;
            let uncapped = growth_factor.powf(f64::from(attempt));
            uncapped.min(max_seconds)
        }
    }
}
/// Retries the passed operation until one of the following conditions is met:
///
/// * the operation succeeds;
/// * the encountered error is considered permanent (non-retryable);
/// * retries have been exhausted.
///
/// Parameters:
///
/// * `op` is called once per attempt and must produce a fresh future each time.
/// * `is_permanent` distinguishes permanent from non-permanent errors; a
///   permanent error is returned immediately.
/// * `warn_threshold`: failures of attempts below this number are logged at
///   info level, later ones at warn level.
/// * `max_retries`: how many times a failed operation is retried, so `op` runs
///   at most `max_retries + 1` times. This limit is honored even when
///   `warn_threshold` is larger than `max_retries`.
/// * `description` is added to log messages and should identify what `op` is doing.
///
/// Between attempts the function sleeps using [`exponential_backoff`] with the
/// default base and maximum; the first retry happens without a delay.
///
/// Returns the result of the last attempt.
pub async fn retry<T, O, F, E>(
    mut op: O,
    is_permanent: impl Fn(&E) -> bool,
    warn_threshold: u32,
    max_retries: u32,
    description: &str,
) -> Result<T, E>
where
    // Not std::error::Error because anyhow::Error doesn't implement it.
    // For context see https://github.com/dtolnay/anyhow/issues/63
    E: Display + Debug,
    O: FnMut() -> F,
    F: Future<Output = Result<T, E>>,
{
    let mut attempts = 0;
    loop {
        let result = op().await;
        match result {
            Ok(_) => {
                if attempts > 0 {
                    tracing::info!("{description} succeeded after {attempts} retries");
                }
                return result;
            }
            // These are "permanent" errors that should not be retried.
            Err(ref e) if is_permanent(e) => {
                return result;
            }
            // The retry budget is used up. This is checked before the logging
            // tiers below so that `warn_threshold > max_retries` cannot extend
            // the number of retries.
            Err(ref err) if attempts >= max_retries => {
                tracing::warn!(
                    "{description} still failed after {attempts} retries, giving up: {err:?}"
                );
                return result;
            }
            // Assume that any other failure might be transient, and the operation might
            // succeed if we just keep trying.
            Err(err) if attempts < warn_threshold => {
                tracing::info!("{description} failed, will retry (attempt {attempts}): {err:#}");
            }
            Err(err) => {
                tracing::warn!("{description} failed, will retry (attempt {attempts}): {err:#}");
            }
        }
        // sleep and retry
        exponential_backoff(
            attempts,
            DEFAULT_BASE_BACKOFF_SECONDS,
            DEFAULT_MAX_BACKOFF_SECONDS,
        )
        .await;
        attempts += 1;
    }
}
/// Unit tests for the backoff computation and for `retry`.
#[cfg(test)]
mod tests {
    use std::io;
    use tokio::sync::Mutex;
    use super::*;

    /// With the default parameters the backoff never decreases as `n` grows and
    /// ends up at the configured maximum.
    #[test]
    fn backoff_defaults_produce_growing_backoff_sequence() {
        let mut current_backoff_value = None;
        for i in 0..10_000 {
            let new_backoff_value = exponential_backoff_duration_seconds(
                i,
                DEFAULT_BASE_BACKOFF_SECONDS,
                DEFAULT_MAX_BACKOFF_SECONDS,
            );
            // `replace` stores the new value and hands back the previous one for comparison.
            if let Some(old_backoff_value) = current_backoff_value.replace(new_backoff_value) {
                assert!(
                    old_backoff_value <= new_backoff_value,
                    "{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}"
                )
            }
        }
        assert_eq!(
            current_backoff_value.expect("Should have produced backoff values to compare"),
            DEFAULT_MAX_BACKOFF_SECONDS,
            "Given big enough of retries, backoff should reach its allowed max value"
        );
    }

    /// An operation that always fails is run twice with `max_retries == 1`:
    /// the initial attempt plus one retry.
    #[tokio::test(start_paused = true)]
    async fn retry_always_error() {
        let count = Mutex::new(0);
        let err_result = retry(
            || async {
                *count.lock().await += 1;
                Result::<(), io::Error>::Err(io::Error::from(io::ErrorKind::Other))
            },
            |_e| false,
            1,
            1,
            "work",
        )
        .await;
        assert!(err_result.is_err());
        assert_eq!(*count.lock().await, 2);
    }

    /// An operation that fails twice and then succeeds is reported as a success
    /// when two retries are allowed.
    #[tokio::test(start_paused = true)]
    async fn retry_ok_after_err() {
        let count = Mutex::new(0);
        retry(
            || async {
                let mut locked = count.lock().await;
                if *locked > 1 {
                    Ok(())
                } else {
                    *locked += 1;
                    Err(io::Error::from(io::ErrorKind::Other))
                }
            },
            |_e| false,
            2,
            2,
            "work",
        )
        .await
        .unwrap();
    }

    /// When every error is classified as permanent the operation runs exactly
    /// once and its error is returned.
    #[tokio::test(start_paused = true)]
    async fn dont_retry_permanent_errors() {
        let count = Mutex::new(0);
        let _ = retry(
            || async {
                let mut locked = count.lock().await;
                if *locked > 1 {
                    Ok(())
                } else {
                    *locked += 1;
                    Err(io::Error::from(io::ErrorKind::Other))
                }
            },
            |_e| true,
            2,
            2,
            "work",
        )
        .await
        .unwrap_err();
        assert_eq!(*count.lock().await, 1);
    }
}

View File

@@ -111,10 +111,6 @@ pub fn fsync(path: &Path) -> io::Result<()> {
.map_err(|e| io::Error::new(e.kind(), format!("Failed to fsync file {path:?}: {e}")))
}
/// Opens the file at `path` with `tokio::fs::File::open` and calls `sync_all` on it.
///
/// Returns the I/O error from either the open or the sync step.
pub async fn fsync_async(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
    tokio::fs::File::open(path).await?.sync_all().await
}
#[cfg(test)]
mod tests {
use tempfile::tempdir;

View File

@@ -24,42 +24,11 @@ pub async fn is_directory_empty(path: impl AsRef<Path>) -> anyhow::Result<bool>
Ok(dir.next_entry().await?.is_none())
}
/// Returns the names of all entries directly inside the directory at `path`.
///
/// Names are converted with `to_string_lossy`. No ordering is applied to the
/// returned names. Fails if the directory cannot be opened (the error then
/// mentions the path) or if reading an entry fails.
pub async fn list_dir(path: impl AsRef<Path>) -> anyhow::Result<Vec<String>> {
    let dir_path = path.as_ref();
    let mut entries = tokio::fs::read_dir(dir_path)
        .await
        .with_context(|| format!("read_dir({})", dir_path.display()))?;

    let mut names = Vec::new();
    while let Some(entry) = entries.next_entry().await? {
        names.push(entry.file_name().to_string_lossy().into_owned());
    }
    Ok(names)
}
/// Turns a `NotFound` I/O error into success and passes every other error through.
pub fn ignore_not_found(e: io::Error) -> io::Result<()> {
    match e.kind() {
        io::ErrorKind::NotFound => Ok(()),
        _ => Err(e),
    }
}
/// Runs `fs_operation` and treats a `NotFound` error as success.
///
/// Any other error is returned unchanged. The closure is called exactly once,
/// so `FnOnce` is enough as a bound; every closure accepted by the previous
/// `Fn` bound is still accepted.
pub fn ignore_absent_files<F>(fs_operation: F) -> io::Result<()>
where
    F: FnOnce() -> io::Result<()>,
{
    fs_operation().or_else(ignore_not_found)
}
#[cfg(test)]
mod test {
use std::path::PathBuf;
use crate::fs_ext::{is_directory_empty, list_dir};
use super::ignore_absent_files;
use crate::fs_ext::is_directory_empty;
#[test]
fn is_empty_dir() {
@@ -106,42 +75,4 @@ mod test {
std::fs::remove_file(&file_path).unwrap();
assert!(is_directory_empty(file_path).await.is_err());
}
/// Removing a file through `ignore_absent_files` succeeds both when the file
/// is missing and when it exists.
#[test]
fn ignore_absent_files_works() {
    let dir = tempfile::tempdir().unwrap();
    let dir_path = dir.path();
    let file_path: PathBuf = dir_path.join("testfile");
    // The file does not exist yet: the NotFound error must be swallowed.
    ignore_absent_files(|| std::fs::remove_file(&file_path)).expect("should execute normally");
    let f = std::fs::File::create(&file_path).unwrap();
    drop(f);
    // The file exists now: it must actually be removed.
    ignore_absent_files(|| std::fs::remove_file(&file_path)).expect("should execute normally");
    assert!(!file_path.exists());
}
/// `list_dir` reports an empty directory as empty and returns the names of
/// both files and subdirectories.
#[tokio::test]
async fn list_dir_works() {
    let dir = tempfile::tempdir().unwrap();
    let dir_path = dir.path();
    assert!(list_dir(dir_path).await.unwrap().is_empty());
    let file_path: PathBuf = dir_path.join("testfile");
    let _ = std::fs::File::create(&file_path).unwrap();
    assert_eq!(&list_dir(dir_path).await.unwrap(), &["testfile"]);
    let another_dir_path: PathBuf = dir_path.join("testdir");
    std::fs::create_dir(another_dir_path).unwrap();
    let expected = &["testdir", "testfile"];
    let mut actual = list_dir(dir_path).await.unwrap();
    // Sort before comparing so the assertion does not depend on listing order.
    actual.sort();
    assert_eq!(actual, expected);
}
}

View File

@@ -1,7 +1,5 @@
use std::ffi::OsStr;
use std::{fmt, str::FromStr};
use anyhow::Context;
use hex::FromHex;
use rand::Rng;
use serde::{Deserialize, Serialize};
@@ -215,18 +213,6 @@ pub struct TimelineId(Id);
id_newtype!(TimelineId);
/// Parses a `TimelineId` from an optional OS string, e.g. a path component.
///
/// A missing value, or one that is not valid UTF-8, is replaced by the empty
/// string before parsing (presumably that then fails to parse -- confirm against
/// the `FromStr` implementation). Parse errors carry the original value as context.
impl TryFrom<Option<&OsStr>> for TimelineId {
    type Error = anyhow::Error;
    fn try_from(value: Option<&OsStr>) -> Result<Self, Self::Error> {
        value
            .and_then(OsStr::to_str)
            .unwrap_or_default()
            .parse::<TimelineId>()
            .with_context(|| format!("Could not parse timeline id from {:?}", value))
    }
}
/// Neon Tenant Id represents the identifier of a particular tenant.
/// Is used for distinguishing requests and data belonging to different users.
///

View File

@@ -1,8 +1,6 @@
//! `utils` is intended to be a place to put code that is shared
//! between other crates in this repository.
pub mod backoff;
/// `Lsn` type implements common tasks on Log Sequence Numbers
pub mod lsn;
/// SeqWait allows waiting for a future sequence number to arrive
@@ -68,6 +66,44 @@ pub mod completion;
/// Reporting utilities
pub mod error;
// Holds the `failpoint_sleep_millis_async!` macro and the helper function it expands to.
mod failpoint_macro_helpers {
    /// Use with fail::cfg("$name", "return(2000)")
    ///
    /// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
    /// specified time (in milliseconds). The main difference is that we use async
    /// tokio sleep function. Another difference is that we print lines to the log,
    /// which can be useful in tests to check that the failpoint was hit.
    ///
    /// The expansion contains an `.await`, so the macro can only be used in an
    /// async context.
    #[macro_export]
    macro_rules! failpoint_sleep_millis_async {
        ($name:literal) => {{
            // If the failpoint is used with a "return" action, set should_sleep to the
            // returned value (as string). Otherwise it's set to None.
            let should_sleep = (|| {
                ::fail::fail_point!($name, |x| x);
                ::std::option::Option::None
            })();
            // Sleep if the action was a returned value
            if let ::std::option::Option::Some(duration_str) = should_sleep {
                $crate::failpoint_sleep_helper($name, duration_str).await
            }
        }};
    }
    // Helper function used by the macro. (A function has nicer scoping so we
    // don't need to decorate everything with "::")
    //
    // `duration_str` is the sleep time in milliseconds; the function panics if it
    // does not parse as a `u64`. A log line is written before and after the sleep.
    pub async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
        let millis = duration_str.parse::<u64>().unwrap();
        let d = std::time::Duration::from_millis(millis);
        tracing::info!("failpoint {:?}: sleeping for {:?}", name, d);
        tokio::time::sleep(d).await;
        tracing::info!("failpoint {:?}: sleep done", name);
    }
}
pub use failpoint_macro_helpers::failpoint_sleep_helper;
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
///
/// we have several cases:

View File

@@ -35,8 +35,6 @@ humantime-serde.workspace = true
hyper.workspace = true
itertools.workspace = true
nix.workspace = true
# hack to get the number of worker threads tokio uses
num_cpus = { version = "1.15" }
num-traits.workspace = true
once_cell.workspace = true
pin-project-lite.workspace = true

View File

@@ -13,7 +13,6 @@ clap = { workspace = true, features = ["string"] }
git-version.workspace = true
pageserver = { path = ".." }
postgres_ffi.workspace = true
tokio.workspace = true
utils.workspace = true
svg_fmt.workspace = true
workspace_hack.workspace = true

View File

@@ -23,7 +23,6 @@
//! <https://grafana.com/tutorials/build-a-panel-plugin/>
use anyhow::Result;
use pageserver::repository::Key;
use pageserver::METADATA_FILE_NAME;
use std::cmp::Ordering;
use std::io::{self, BufRead};
use std::path::PathBuf;
@@ -72,10 +71,6 @@ pub fn main() -> Result<()> {
let line = PathBuf::from_str(&line).unwrap();
let filename = line.file_name().unwrap();
let filename = filename.to_str().unwrap();
if filename == METADATA_FILE_NAME {
// Don't try and parse "metadata" like a key-lsn range
continue;
}
let range = parse_filename(filename);
ranges.push(range);
}

View File

@@ -95,7 +95,7 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
}
// Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH
async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
let file = FileBlockReader::new(VirtualFile::open(path)?);
let summary_blk = file.read_blk(0)?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
@@ -107,31 +107,29 @@ async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
// min-heap (reserve space for one more element added before eviction)
let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
let mut prev_key: Option<Key> = None;
tree_reader
.visit(
&[0u8; DELTA_KEY_SIZE],
VisitDirection::Forwards,
|key, _value| {
let curr = Key::from_slice(&key[..KEY_SIZE]);
if let Some(prev) = prev_key {
if curr.to_i128() - prev.to_i128() >= MIN_HOLE_LENGTH {
heap.push(Hole(prev..curr));
if heap.len() > max_holes {
heap.pop(); // remove smallest hole
}
tree_reader.visit(
&[0u8; DELTA_KEY_SIZE],
VisitDirection::Forwards,
|key, _value| {
let curr = Key::from_slice(&key[..KEY_SIZE]);
if let Some(prev) = prev_key {
if curr.to_i128() - prev.to_i128() >= MIN_HOLE_LENGTH {
heap.push(Hole(prev..curr));
if heap.len() > max_holes {
heap.pop(); // remove smallest hole
}
}
prev_key = Some(curr.next());
true
},
)
.await?;
}
prev_key = Some(curr.next());
true
},
)?;
let mut holes = heap.into_vec();
holes.sort_by_key(|hole| hole.0.start);
Ok(holes)
}
pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
pub(crate) fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
let storage_path = &cmd.path;
let max_holes = cmd.max_holes.unwrap_or(DEFAULT_MAX_HOLES);
@@ -162,7 +160,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
parse_filename(&layer.file_name().into_string().unwrap())
{
if layer_file.is_delta {
layer_file.holes = get_holes(&layer.path(), max_holes).await?;
layer_file.holes = get_holes(&layer.path(), max_holes)?;
n_deltas += 1;
}
layers.push(layer_file);

View File

@@ -43,7 +43,8 @@ pub(crate) enum LayerCmd {
},
}
async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
use pageserver::tenant::blob_io::BlobCursor;
use pageserver::tenant::block_io::BlockReader;
let path = path.as_ref();
@@ -59,27 +60,25 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
);
// TODO(chi): dedup w/ `delta_layer.rs` by exposing the API.
let mut all = vec![];
tree_reader
.visit(
&[0u8; DELTA_KEY_SIZE],
VisitDirection::Forwards,
|key, value_offset| {
let curr = Key::from_slice(&key[..KEY_SIZE]);
all.push((curr, BlobRef(value_offset)));
true
},
)
.await?;
let cursor = BlockCursor::new(&file);
tree_reader.visit(
&[0u8; DELTA_KEY_SIZE],
VisitDirection::Forwards,
|key, value_offset| {
let curr = Key::from_slice(&key[..KEY_SIZE]);
all.push((curr, BlobRef(value_offset)));
true
},
)?;
let mut cursor = BlockCursor::new(&file);
for (k, v) in all {
let value = cursor.read_blob(v.pos()).await?;
let value = cursor.read_blob(v.pos())?;
println!("key:{} value_len:{}", k, value.len());
}
// TODO(chi): special handling for last key?
Ok(())
}
pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
pub(crate) fn main(cmd: &LayerCmd) -> Result<()> {
match cmd {
LayerCmd::List { path } => {
for tenant in fs::read_dir(path.join("tenants"))? {
@@ -154,7 +153,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
);
if layer_file.is_delta {
read_delta_file(layer.path()).await?;
read_delta_file(layer.path())?;
} else {
anyhow::bail!("not supported yet :(");
}

View File

@@ -72,13 +72,12 @@ struct AnalyzeLayerMapCmd {
max_holes: Option<usize>,
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
fn main() -> anyhow::Result<()> {
let cli = CliOpts::parse();
match cli.command {
Commands::Layer(cmd) => {
layers::main(&cmd).await?;
layers::main(&cmd)?;
}
Commands::Metadata(cmd) => {
handle_metadata(&cmd)?;
@@ -87,7 +86,7 @@ async fn main() -> anyhow::Result<()> {
draw_timeline_dir::main()?;
}
Commands::AnalyzeLayerMap(cmd) => {
layer_map_analyzer::main(&cmd).await?;
layer_map_analyzer::main(&cmd)?;
}
Commands::PrintLayerFile(cmd) => {
if let Err(e) = read_pg_control_file(&cmd.path) {
@@ -95,7 +94,7 @@ async fn main() -> anyhow::Result<()> {
"Failed to read input file as a pg control one: {e:#}\n\
Attempting to read it as layer file"
);
print_layerfile(&cmd.path).await?;
print_layerfile(&cmd.path)?;
}
}
};
@@ -114,12 +113,12 @@ fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> {
Ok(())
}
async fn print_layerfile(path: &Path) -> anyhow::Result<()> {
fn print_layerfile(path: &Path) -> anyhow::Result<()> {
// Basic initialization of things that don't change after startup
virtual_file::init(10);
page_cache::init(100);
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
dump_layerfile_from_path(path, true, &ctx).await
dump_layerfile_from_path(path, true, &ctx)
}
fn handle_metadata(

View File

@@ -6,13 +6,11 @@ use std::{env, ops::ControlFlow, path::Path, str::FromStr};
use anyhow::{anyhow, Context};
use clap::{Arg, ArgAction, Command};
use fail::FailScenario;
use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
use remote_storage::GenericRemoteStorage;
use tokio::time::Instant;
use tracing::*;
use metrics::set_build_info_metric;
@@ -40,6 +38,8 @@ const PID_FILE_NAME: &str = "pageserver.pid";
// Feature-name strings compiled into this binary: each entry is only part of the
// slice when the `cfg` attribute directly above it is enabled.
const FEATURES: &[&str] = &[
#[cfg(feature = "testing")]
"testing",
// NOTE(review): `fail/failpoints` is Cargo's syntax for enabling a feature of the
// `fail` dependency, not the name of a feature of this crate, so this `cfg` may never
// evaluate to true -- confirm against this crate's `[features]` table.
#[cfg(feature = "fail/failpoints")]
"fail/failpoints",
];
fn version() -> String {
@@ -121,7 +121,7 @@ fn main() -> anyhow::Result<()> {
}
// Initialize failpoints support
let scenario = pageserver::failpoint_support::init();
let scenario = FailScenario::setup();
// Basic initialization of things that don't change after startup
virtual_file::init(conf.max_file_descriptors);
@@ -226,19 +226,6 @@ fn start_pageserver(
launch_ts: &'static LaunchTimestamp,
conf: &'static PageServerConf,
) -> anyhow::Result<()> {
// Monotonic time for later calculating startup duration
let started_startup_at = Instant::now();
let startup_checkpoint = move |phase: &str, human_phase: &str| {
let elapsed = started_startup_at.elapsed();
let secs = elapsed.as_secs_f64();
STARTUP_DURATION.with_label_values(&[phase]).set(secs);
info!(
elapsed_ms = elapsed.as_millis(),
"{human_phase} ({secs:.3}s since start)"
)
};
// Print version and launch timestamp to the log,
// and expose them as prometheus metrics.
// A changed version string indicates changed software.
@@ -348,11 +335,6 @@ fn start_pageserver(
// Set up remote storage client
let remote_storage = create_remote_storage_client(conf)?;
// Up to this point no significant I/O has been done: this should have been fast. Record
// duration prior to starting I/O intensive phase of startup.
startup_checkpoint("initial", "Starting loading tenants");
STARTUP_IS_LOADING.set(1);
// Startup staging or optimizing:
//
// We want to minimize downtime for `page_service` connections, and trying not to overload
@@ -373,11 +355,12 @@ fn start_pageserver(
let order = pageserver::InitializationOrder {
initial_tenant_load: Some(init_done_tx),
initial_logical_size_can_start: init_done_rx.clone(),
initial_logical_size_attempt: Some(init_logical_size_done_tx),
initial_logical_size_attempt: init_logical_size_done_tx,
background_jobs_can_start: background_jobs_barrier.clone(),
};
// Scan the local 'tenants/' directory and start loading the tenants
let init_started_at = std::time::Instant::now();
let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
@@ -395,13 +378,18 @@ fn start_pageserver(
let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial load completed"));
init_done_rx.wait().await;
startup_checkpoint("initial_tenant_load", "Initial load completed");
STARTUP_IS_LOADING.set(0);
// initial logical sizes can now start, as they were waiting on init_done_rx.
scopeguard::ScopeGuard::into_inner(guard);
let init_done = std::time::Instant::now();
let elapsed = init_done - init_started_at;
tracing::info!(
elapsed_millis = elapsed.as_millis(),
"Initial load completed"
);
let mut init_sizes_done = std::pin::pin!(init_logical_size_done_rx.wait());
let timeout = conf.background_task_maximum_delay;
@@ -410,7 +398,12 @@ fn start_pageserver(
let init_sizes_done = match tokio::time::timeout(timeout, &mut init_sizes_done).await {
Ok(_) => {
startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed");
let now = std::time::Instant::now();
tracing::info!(
from_init_done_millis = (now - init_done).as_millis(),
from_init_millis = (now - init_started_at).as_millis(),
"Initial logical sizes completed"
);
None
}
Err(_) => {
@@ -426,7 +419,6 @@ fn start_pageserver(
// allow background jobs to start
drop(background_jobs_can_start);
startup_checkpoint("background_jobs_can_start", "Starting background jobs");
if let Some(init_sizes_done) = init_sizes_done {
// ending up here is not a bug; at the latest logical sizes will be queried by
@@ -436,11 +428,14 @@ fn start_pageserver(
scopeguard::ScopeGuard::into_inner(guard);
startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed after timeout (background jobs already started)");
let now = std::time::Instant::now();
tracing::info!(
from_init_done_millis = (now - init_done).as_millis(),
from_init_millis = (now - init_started_at).as_millis(),
"Initial logical sizes completed after timeout (background jobs already started)"
);
}
startup_checkpoint("complete", "Startup complete");
};
async move {

View File

@@ -31,12 +31,9 @@ use utils::{
use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
use crate::tenant::config::TenantConf;
use crate::tenant::config::TenantConfOpt;
use crate::tenant::{
TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
};
use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME};
use crate::{
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
TIMELINE_UNINIT_MARK_SUFFIX,
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_UNINIT_MARK_SUFFIX,
};
pub mod defaults {
@@ -604,22 +601,6 @@ impl PageServerConf {
)
}
/// Path of the delete mark belonging to a timeline: the timeline's own path with
/// `TIMELINE_DELETE_MARK_SUFFIX` applied through `path_with_suffix_extension`.
pub fn timeline_delete_mark_file_path(
    &self,
    tenant_id: TenantId,
    timeline_id: TimelineId,
) -> PathBuf {
    let timeline_path = self.timeline_path(&tenant_id, &timeline_id);
    path_with_suffix_extension(timeline_path, TIMELINE_DELETE_MARK_SUFFIX)
}
/// Path of the `TENANT_DELETED_MARKER_FILE_NAME` entry located under the tenant's path.
pub fn tenant_deleted_mark_file_path(&self, tenant_id: &TenantId) -> PathBuf {
    let tenant_path = self.tenant_path(tenant_id);
    tenant_path.join(TENANT_DELETED_MARKER_FILE_NAME)
}
/// Path of the `traces` entry directly under the configured `workdir`.
pub fn traces_path(&self) -> PathBuf {
    let workdir = &self.workdir;
    workdir.join("traces")
}

View File

@@ -7,23 +7,27 @@ use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
use crate::tenant::{mgr, LogicalSizeCalculationCause};
use anyhow;
use chrono::{DateTime, Utc};
use chrono::Utc;
use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
use pageserver_api::models::TenantState;
use reqwest::Url;
use serde::Serialize;
use serde_with::{serde_as, DisplayFromStr};
use std::collections::HashMap;
use std::sync::Arc;
use std::time::{Duration, SystemTime};
use std::time::Duration;
use tracing::*;
use utils::id::{NodeId, TenantId, TimelineId};
use utils::lsn::Lsn;
const WRITTEN_SIZE: &str = "written_size";
const SYNTHETIC_STORAGE_SIZE: &str = "synthetic_storage_size";
const RESIDENT_SIZE: &str = "resident_size";
const REMOTE_STORAGE_SIZE: &str = "remote_storage_size";
const TIMELINE_LOGICAL_SIZE: &str = "timeline_logical_size";
const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
#[serde_as]
#[derive(Serialize, Debug, Clone, Copy)]
#[derive(Serialize, Debug)]
struct Ids {
#[serde_as(as = "DisplayFromStr")]
tenant_id: TenantId,
@@ -34,142 +38,10 @@ struct Ids {
/// Key that uniquely identifies the object, this metric describes.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct MetricsKey {
tenant_id: TenantId,
timeline_id: Option<TimelineId>,
metric: &'static str,
}
impl MetricsKey {
    /// Wraps the key in an [`IncrementalValueFactory`], the helper that only produces
    /// incremental values.
    const fn incremental_values(self) -> IncrementalValueFactory {
        IncrementalValueFactory(self)
    }

    /// Wraps the key in an [`AbsoluteValueFactory`], the helper that only produces
    /// absolute values.
    const fn absolute_values(self) -> AbsoluteValueFactory {
        AbsoluteValueFactory(self)
    }
}
/// Wrapper around a [`MetricsKey`] handed out by the metric kinds that must only ever
/// produce absolute values.
struct AbsoluteValueFactory(MetricsKey);

impl AbsoluteValueFactory {
    /// Pairs the wrapped key with an `EventType::Absolute` taken at `time` and the
    /// measured value `val`.
    fn at(self, time: DateTime<Utc>, val: u64) -> (MetricsKey, (EventType, u64)) {
        let Self(key) = self;
        let kind = EventType::Absolute { time };
        (key, (kind, val))
    }
}
/// Wrapper around a [`MetricsKey`] handed out by the metric kinds that must only ever
/// produce incremental values.
struct IncrementalValueFactory(MetricsKey);

impl IncrementalValueFactory {
    /// Borrows the wrapped key, e.g. for looking it up in a map without consuming the
    /// factory.
    fn key(&self) -> &MetricsKey {
        let Self(key) = self;
        key
    }

    /// Pairs the wrapped key with an `EventType::Incremental` spanning
    /// `prev_end..up_to` and the value `val`.
    #[allow(clippy::wrong_self_convention)]
    fn from_previous_up_to(
        self,
        prev_end: DateTime<Utc>,
        up_to: DateTime<Utc>,
        val: u64,
    ) -> (MetricsKey, (EventType, u64)) {
        // cannot assert prev_end < up_to because these are realtime clock based
        let Self(key) = self;
        let kind = EventType::Incremental {
            start_time: prev_end,
            stop_time: up_to,
        };
        (key, (kind, val))
    }
}
// the static part of a MetricsKey
impl MetricsKey {
/// Per-timeline `written_size` key; its absolute value is
/// [`Timeline::get_last_record_lsn`].
///
/// [`Timeline::get_last_record_lsn`]: crate::tenant::Timeline::get_last_record_lsn
const fn written_size(tenant_id: TenantId, timeline_id: TimelineId) -> AbsoluteValueFactory {
    let key = MetricsKey {
        tenant_id,
        timeline_id: Some(timeline_id),
        metric: "written_size",
    };
    key.absolute_values()
}
/// Per-timeline incremental key: the values are the difference between the latest
/// [`MetricsKey::written_size`] and what was previously sent, with each time range
/// starting where the previously sent incremental range ended and finishing at the
/// latest absolute measurement.
const fn written_size_delta(
    tenant_id: TenantId,
    timeline_id: TimelineId,
) -> IncrementalValueFactory {
    // the name here is correctly about data not size, because that is what is wanted by
    // downstream pipeline
    const METRIC: &str = "written_data_bytes_delta";

    let key = MetricsKey {
        tenant_id,
        timeline_id: Some(timeline_id),
        metric: METRIC,
    };
    key.incremental_values()
}
/// Per-timeline `timeline_logical_size` key; its absolute value is the exact
/// [`Timeline::get_current_logical_size`].
///
/// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
const fn timeline_logical_size(
    tenant_id: TenantId,
    timeline_id: TimelineId,
) -> AbsoluteValueFactory {
    let key = MetricsKey {
        tenant_id,
        timeline_id: Some(timeline_id),
        metric: "timeline_logical_size",
    };
    key.absolute_values()
}
/// Tenant-wide `remote_storage_size` key (no timeline id); its absolute value is
/// [`Tenant::remote_size`].
///
/// [`Tenant::remote_size`]: crate::tenant::Tenant::remote_size
const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory {
    let key = MetricsKey {
        tenant_id,
        timeline_id: None,
        metric: "remote_storage_size",
    };
    key.absolute_values()
}
/// Tenant-wide `resident_size` key (no timeline id); its absolute value is the sum of
/// [`Timeline::resident_physical_size`] over the timelines of a `Tenant`.
///
/// [`Timeline::resident_physical_size`]: crate::tenant::Timeline::resident_physical_size
const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory {
    let key = MetricsKey {
        tenant_id,
        timeline_id: None,
        metric: "resident_size",
    };
    key.absolute_values()
}
/// Tenant-wide `synthetic_storage_size` key (no timeline id); its absolute value is
/// [`Tenant::cached_synthetic_size`], which [`calculate_synthetic_size_worker`] refreshes.
///
/// [`Tenant::cached_synthetic_size`]: crate::tenant::Tenant::cached_synthetic_size
const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory {
    let key = MetricsKey {
        tenant_id,
        timeline_id: None,
        metric: "synthetic_storage_size",
    };
    key.absolute_values()
}
pub struct PageserverConsumptionMetricsKey {
pub tenant_id: TenantId,
pub timeline_id: Option<TimelineId>,
pub metric: &'static str,
}
/// Main thread that serves metrics collection
@@ -207,7 +79,7 @@ pub async fn collect_metrics(
.timeout(DEFAULT_HTTP_REPORTING_TIMEOUT)
.build()
.expect("Failed to create http client with timeout");
let mut cached_metrics = HashMap::new();
let mut cached_metrics: HashMap<PageserverConsumptionMetricsKey, u64> = HashMap::new();
let mut prev_iteration_time: std::time::Instant = std::time::Instant::now();
loop {
@@ -247,15 +119,15 @@ pub async fn collect_metrics(
///
/// TODO
/// - refactor this function (chunking+sending part) to reuse it in proxy module;
async fn collect_metrics_iteration(
pub async fn collect_metrics_iteration(
client: &reqwest::Client,
cached_metrics: &mut HashMap<MetricsKey, (EventType, u64)>,
cached_metrics: &mut HashMap<PageserverConsumptionMetricsKey, u64>,
metric_collection_endpoint: &reqwest::Url,
node_id: NodeId,
ctx: &RequestContext,
send_cached: bool,
) {
let mut current_metrics: Vec<(MetricsKey, (EventType, u64))> = Vec::new();
let mut current_metrics: Vec<(PageserverConsumptionMetricsKey, u64)> = Vec::new();
trace!(
"starting collect_metrics_iteration. metric_collection_endpoint: {}",
metric_collection_endpoint
@@ -289,65 +161,99 @@ async fn collect_metrics_iteration(
let mut tenant_resident_size = 0;
// iterate through list of timelines in tenant
for timeline in tenant.list_timelines() {
for timeline in tenant.list_timelines().iter() {
// collect per-timeline metrics only for active timelines
if timeline.is_active() {
let timeline_written_size = u64::from(timeline.get_last_record_lsn());
let timeline_id = timeline.timeline_id;
match TimelineSnapshot::collect(&timeline, ctx) {
Ok(Some(snap)) => {
snap.to_metrics(
current_metrics.push((
PageserverConsumptionMetricsKey {
tenant_id,
timeline_id,
Utc::now(),
&mut current_metrics,
cached_metrics,
);
}
Ok(None) => {}
Err(e) => {
error!(
"failed to get metrics values for tenant {tenant_id} timeline {}: {e:#?}",
timeline.timeline_id
);
continue;
}
timeline_id: Some(timeline.timeline_id),
metric: WRITTEN_SIZE,
},
timeline_written_size,
));
let span = info_span!("collect_metrics_iteration", tenant_id = %timeline.tenant_id, timeline_id = %timeline.timeline_id);
match span.in_scope(|| timeline.get_current_logical_size(ctx)) {
// Only send timeline logical size when it is fully calculated.
Ok((size, is_exact)) if is_exact => {
current_metrics.push((
PageserverConsumptionMetricsKey {
tenant_id,
timeline_id: Some(timeline.timeline_id),
metric: TIMELINE_LOGICAL_SIZE,
},
size,
));
}
Ok((_, _)) => {}
Err(err) => {
error!(
"failed to get current logical size for timeline {}: {err:?}",
timeline.timeline_id
);
continue;
}
};
}
tenant_resident_size += timeline.resident_physical_size();
let timeline_resident_size = timeline.get_resident_physical_size();
tenant_resident_size += timeline_resident_size;
}
current_metrics
.push(MetricsKey::remote_storage_size(tenant_id).at(Utc::now(), tenant.remote_size()));
match tenant.get_remote_size().await {
Ok(tenant_remote_size) => {
current_metrics.push((
PageserverConsumptionMetricsKey {
tenant_id,
timeline_id: None,
metric: REMOTE_STORAGE_SIZE,
},
tenant_remote_size,
));
}
Err(err) => {
error!(
"failed to get remote size for tenant {}: {err:?}",
tenant_id
);
}
}
current_metrics
.push(MetricsKey::resident_size(tenant_id).at(Utc::now(), tenant_resident_size));
current_metrics.push((
PageserverConsumptionMetricsKey {
tenant_id,
timeline_id: None,
metric: RESIDENT_SIZE,
},
tenant_resident_size,
));
// Note that this metric is calculated in a separate bgworker
// Here we only use cached value, which may lag behind the real latest one
let synthetic_size = tenant.cached_synthetic_size();
let tenant_synthetic_size = tenant.get_cached_synthetic_size();
if synthetic_size != 0 {
if tenant_synthetic_size != 0 {
// only send non-zeroes because otherwise these show up as errors in logs
current_metrics
.push(MetricsKey::synthetic_size(tenant_id).at(Utc::now(), synthetic_size));
current_metrics.push((
PageserverConsumptionMetricsKey {
tenant_id,
timeline_id: None,
metric: SYNTHETIC_STORAGE_SIZE,
},
tenant_synthetic_size,
));
}
}
// Filter metrics, unless we want to send all metrics, including cached ones.
// See: https://github.com/neondatabase/neon/issues/3485
if !send_cached {
current_metrics.retain(|(curr_key, (kind, curr_val))| {
if kind.is_incremental() {
// incremental values (currently only written_size_delta) should not get any cache
// deduplication because they will be used by upstream for "is still alive."
true
} else {
match cached_metrics.get(curr_key) {
Some((_, val)) => val != curr_val,
None => true,
}
}
current_metrics.retain(|(curr_key, curr_val)| match cached_metrics.get(curr_key) {
Some(val) => val != curr_val,
None => true,
});
}
@@ -362,16 +268,14 @@ async fn collect_metrics_iteration(
let mut chunk_to_send: Vec<Event<Ids>> = Vec::with_capacity(CHUNK_SIZE);
let node_id = node_id.to_string();
for chunk in chunks {
chunk_to_send.clear();
// enrich metrics with type,timestamp and idempotency key before sending
chunk_to_send.extend(chunk.iter().map(|(curr_key, (when, curr_val))| Event {
kind: *when,
chunk_to_send.extend(chunk.iter().map(|(curr_key, curr_val)| Event {
kind: EventType::Absolute { time: Utc::now() },
metric: curr_key.metric,
idempotency_key: idempotency_key(&node_id),
idempotency_key: idempotency_key(node_id.to_string()),
value: *curr_val,
extra: Ids {
tenant_id: curr_key.tenant_id,
@@ -379,14 +283,17 @@ async fn collect_metrics_iteration(
},
}));
let chunk_json = serde_json::value::to_raw_value(&EventChunk {
events: &chunk_to_send,
})
.expect("PageserverConsumptionMetric should not fail serialization");
const MAX_RETRIES: u32 = 3;
for attempt in 0..MAX_RETRIES {
let res = client
.post(metric_collection_endpoint.clone())
.json(&EventChunk {
events: (&chunk_to_send).into(),
})
.json(&chunk_json)
.send()
.await;
@@ -422,130 +329,6 @@ async fn collect_metrics_iteration(
}
}
/// Plain-value snapshot of everything a timeline contributes to its consumption metrics.
///
/// Metric production works on this value instead of on a live timeline, so tests can
/// build one by hand with whatever values they need.
struct TimelineSnapshot {
    /// Copied from the timeline's `loaded_at`; serves as the comparison point for the
    /// written size delta while no earlier `written_size` is cached.
    loaded_at: (Lsn, SystemTime),
    /// Result of `get_last_record_lsn` at collection time.
    last_record_lsn: Lsn,
    /// Logical size, present only if it was reported as exact.
    current_exact_logical_size: Option<u64>,
}

impl TimelineSnapshot {
    /// Take a snapshot of an actual timeline; gives `Ok(None)` when it is not active.
    ///
    /// Fails currently only when [`Timeline::get_current_logical_size`] fails.
    ///
    /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
    fn collect(
        t: &Arc<crate::tenant::Timeline>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Option<Self>> {
        use anyhow::Context;

        // no collection for broken or stopping needed, we will still keep the cached values
        // though at the caller.
        if !t.is_active() {
            return Ok(None);
        }

        let loaded_at = t.loaded_at;
        let last_record_lsn = t.get_last_record_lsn();

        let span = info_span!("collect_metrics_iteration", tenant_id = %t.tenant_id, timeline_id = %t.timeline_id);
        let (size, is_exact) = span
            .in_scope(|| t.get_current_logical_size(ctx))
            .context("get_current_logical_size")?;
        // Only send timeline logical size when it is fully calculated.
        let current_exact_logical_size = if is_exact { Some(size) } else { None };

        Ok(Some(TimelineSnapshot {
            loaded_at,
            last_record_lsn,
            current_exact_logical_size,
        }))
    }

    /// Append this snapshot's consumption metrics to `metrics`, using `cache` (the
    /// previously produced values) as the basis for the written size delta.
    fn to_metrics(
        &self,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        now: DateTime<Utc>,
        metrics: &mut Vec<(MetricsKey, (EventType, u64))>,
        cache: &HashMap<MetricsKey, (EventType, u64)>,
    ) {
        let (key, written_size_now) = MetricsKey::written_size(tenant_id, timeline_id)
            .at(now, u64::from(self.last_record_lsn));

        // last_record_lsn can only go up, right now at least, TODO: #2592 or related
        // features might change this.

        let written_size_delta_key = MetricsKey::written_size_delta(tenant_id, timeline_id);

        // use this when available, because in a stream of incremental values, it will be
        // accurate where as when last_record_lsn stops moving, we will only cache the last
        // one of those.
        let last_stop_time = match cache.get(written_size_delta_key.key()) {
            Some((until, _val)) => Some(
                until
                    .incremental_timerange()
                    .expect("never create EventType::Absolute for written_size_delta")
                    .end,
            ),
            None => None,
        };

        // by default, use the last sent written_size as the basis for
        // calculating the delta. if we don't yet have one, use the load time value.
        let (prev_time, prev_value) = match cache.get(&key) {
            Some((cached_at, cached_value)) => {
                // use the prev time from our last incremental update, or default to latest
                // absolute update on the first round.
                let absolute_at = cached_at
                    .absolute_time()
                    .expect("never create EventType::Incremental for written_size");
                (*last_stop_time.unwrap_or(absolute_at), *cached_value)
            }
            None => {
                // if we don't have a previous point of comparison, compare to the load time
                // lsn.
                let (disk_consistent_lsn, loaded_at) = &self.loaded_at;
                (DateTime::from(*loaded_at), disk_consistent_lsn.0)
            }
        };

        // written_size_bytes_delta: produced only when the subtraction does not underflow
        if let Some(delta) = written_size_now.1.checked_sub(prev_value) {
            let up_to = written_size_now
                .0
                .absolute_time()
                .expect("never create EventType::Incremental for written_size");
            metrics.push(written_size_delta_key.from_previous_up_to(prev_time, *up_to, delta));
        }

        // written_size
        metrics.push((key, written_size_now));

        if let Some(size) = self.current_exact_logical_size {
            let logical_size = MetricsKey::timeline_logical_size(tenant_id, timeline_id);
            metrics.push(logical_size.at(now, size));
        }
    }
}
/// Calculate synthetic size for each active tenant
pub async fn calculate_synthetic_size_worker(
synthetic_size_calculation_interval: Duration,
@@ -560,7 +343,7 @@ pub async fn calculate_synthetic_size_worker(
_ = task_mgr::shutdown_watcher() => {
return Ok(());
},
tick_at = ticker.tick() => {
tick_at = ticker.tick() => {
let tenants = match mgr::list_tenants().await {
Ok(tenants) => tenants,
@@ -596,149 +379,3 @@ pub async fn calculate_synthetic_size_worker(
}
}
}
#[cfg(test)]
mod tests {
use std::collections::HashMap;
use std::time::SystemTime;
use utils::{
id::{TenantId, TimelineId},
lsn::Lsn,
};
use crate::consumption_metrics::MetricsKey;
use super::TimelineSnapshot;
use chrono::{DateTime, Utc};
/// First collection after startup with an empty cache and a timeline that has not
/// advanced past its load-time LSN: the delta is zero and spans load time up to `now`.
#[test]
fn startup_collected_timeline_metrics_before_advancing() {
    let tenant_id = TenantId::generate();
    let timeline_id = TimelineId::generate();

    // loaded at twice the initdb lsn, no progress since
    let initdb_lsn = Lsn(0x10000);
    let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
    let snap = TimelineSnapshot {
        loaded_at: (disk_consistent_lsn, SystemTime::now()),
        last_record_lsn: disk_consistent_lsn,
        current_exact_logical_size: Some(0x42000),
    };

    let now = DateTime::<Utc>::from(SystemTime::now());

    let cache = HashMap::new();
    let mut metrics = Vec::new();
    snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);

    let expected = [
        MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
            snap.loaded_at.1.into(),
            now,
            0,
        ),
        MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
        MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000),
    ];
    assert_eq!(metrics, &expected);
}
/// Second collection round: only an absolute `written_size` is cached, so the delta
/// range starts at the time of that cached measurement.
#[test]
fn startup_collected_timeline_metrics_second_round() {
    let tenant_id = TenantId::generate();
    let timeline_id = TimelineId::generate();

    let [now, before, init] = time_backwards();
    let now = DateTime::<Utc>::from(now);
    let before = DateTime::<Utc>::from(before);

    let initdb_lsn = Lsn(0x10000);
    let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);

    let snap = TimelineSnapshot {
        loaded_at: (disk_consistent_lsn, init),
        last_record_lsn: disk_consistent_lsn,
        current_exact_logical_size: Some(0x42000),
    };

    let previously_sent =
        MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0);
    let cache = HashMap::from([previously_sent]);

    let mut metrics = Vec::new();
    snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);

    let expected = [
        MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(before, now, 0),
        MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
        MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000),
    ];
    assert_eq!(metrics, &expected);
}
/// Later round with both an absolute and an incremental entry cached while the LSN has
/// not moved: the new delta range continues from the end of the cached incremental one.
#[test]
fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
    let tenant_id = TenantId::generate();
    let timeline_id = TimelineId::generate();

    let [now, just_before, before, init] = time_backwards();
    let now = DateTime::<Utc>::from(now);
    let just_before = DateTime::<Utc>::from(just_before);
    let before = DateTime::<Utc>::from(before);

    let initdb_lsn = Lsn(0x10000);
    let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);

    let snap = TimelineSnapshot {
        loaded_at: (disk_consistent_lsn, init),
        last_record_lsn: disk_consistent_lsn,
        current_exact_logical_size: Some(0x42000),
    };

    // at t=before was the last time the last_record_lsn changed
    let cached_absolute =
        MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0);
    // end time of this event is used for the next ones
    let cached_incremental = MetricsKey::written_size_delta(tenant_id, timeline_id)
        .from_previous_up_to(before, just_before, 0);
    let cache = HashMap::from([cached_absolute, cached_incremental]);

    let mut metrics = Vec::new();
    snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);

    let expected = [
        MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
            just_before,
            now,
            0,
        ),
        MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
        MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000),
    ];
    assert_eq!(metrics, &expected);
}
/// Builds `N` timestamps going back in time: index 0 is the current time and each
/// following index `i` lies exactly `i` seconds before index 0.
fn time_backwards<const N: usize>() -> [std::time::SystemTime; N] {
    let mut times = [std::time::SystemTime::UNIX_EPOCH; N];
    times[0] = std::time::SystemTime::now();
    let newest = times[0];
    for (seconds_behind, slot) in times.iter_mut().enumerate().skip(1) {
        *slot = newest - std::time::Duration::from_secs(seconds_behind as u64);
    }
    times
}
}

View File

@@ -85,7 +85,6 @@
//! The solution is that all code paths are infected with precisely one
//! [`RequestContext`] argument. Functions in the middle of the call chain
//! only need to pass it on.
use crate::task_mgr::TaskKind;
// The main structure of this module, see module-level comment.
@@ -93,7 +92,6 @@ use crate::task_mgr::TaskKind;
pub struct RequestContext {
task_kind: TaskKind,
download_behavior: DownloadBehavior,
access_stats_behavior: AccessStatsBehavior,
}
/// Desired behavior if the operation requires an on-demand download
@@ -111,67 +109,6 @@ pub enum DownloadBehavior {
Error,
}
/// Whether this request should update access times used in LRU eviction
///
/// Carried per request in `RequestContext::access_stats_behavior`;
/// `RequestContextBuilder::new` starts out with [`AccessStatsBehavior::Update`].
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub(crate) enum AccessStatsBehavior {
/// Update access times: this request's access to data should be taken
/// as a hint that the accessed layer is likely to be accessed again
Update,
/// Do not update access times: this request is accessing the layer
/// but does not want to indicate that the layer should be retained in cache,
/// perhaps because the requestor is a compaction routine that will soon cover
/// this layer with another.
Skip,
}
/// Builder that assembles a [`RequestContext`] one setting at a time.
pub struct RequestContextBuilder {
    inner: RequestContext,
}

impl RequestContextBuilder {
    /// A new builder with default settings
    pub fn new(task_kind: TaskKind) -> Self {
        let inner = RequestContext {
            task_kind,
            download_behavior: DownloadBehavior::Download,
            access_stats_behavior: AccessStatsBehavior::Update,
        };
        Self { inner }
    }

    /// A builder that starts from the settings of an existing context.
    pub fn extend(original: &RequestContext) -> Self {
        // This is like a Copy, but avoid implementing Copy because ordinary users of
        // RequestContext should always move or ref it.
        let &RequestContext {
            task_kind,
            download_behavior,
            access_stats_behavior,
        } = original;
        Self {
            inner: RequestContext {
                task_kind,
                download_behavior,
                access_stats_behavior,
            },
        }
    }

    /// Configure the DownloadBehavior of the context: whether to
    /// download missing layers, and/or warn on the download.
    pub fn download_behavior(self, b: DownloadBehavior) -> Self {
        let mut builder = self;
        builder.inner.download_behavior = b;
        builder
    }

    /// Configure the AccessStatsBehavior of the context: whether layer
    /// accesses should update the access time of the layer.
    pub(crate) fn access_stats_behavior(self, b: AccessStatsBehavior) -> Self {
        let mut builder = self;
        builder.inner.access_stats_behavior = b;
        builder
    }

    /// Finish building and hand out the configured context.
    pub fn build(self) -> RequestContext {
        let Self { inner } = self;
        inner
    }
}
impl RequestContext {
/// Create a new RequestContext that has no parent.
///
@@ -186,9 +123,10 @@ impl RequestContext {
/// because someone explicitly canceled it.
/// It has no parent, so it cannot inherit cancellation from there.
pub fn new(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
RequestContextBuilder::new(task_kind)
.download_behavior(download_behavior)
.build()
RequestContext {
task_kind,
download_behavior,
}
}
/// Create a detached child context for a task that may outlive `self`.
@@ -249,7 +187,10 @@ impl RequestContext {
}
fn child_impl(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
Self::new(task_kind, download_behavior)
RequestContext {
task_kind,
download_behavior,
}
}
pub fn task_kind(&self) -> TaskKind {
@@ -259,8 +200,4 @@ impl RequestContext {
pub fn download_behavior(&self) -> DownloadBehavior {
self.download_behavior
}
pub(crate) fn access_stats_behavior(&self) -> AccessStatsBehavior {
self.access_stats_behavior
}
}

View File

@@ -304,18 +304,17 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
// Debug-log the list of candidates
let now = SystemTime::now();
for (i, (partition, candidate)) in candidates.iter().enumerate() {
let desc = candidate.layer.layer_desc();
debug!(
"cand {}/{}: size={}, no_access_for={}us, partition={:?}, {}/{}/{}",
i + 1,
candidates.len(),
desc.file_size,
candidate.layer.file_size(),
now.duration_since(candidate.last_activity_ts)
.unwrap()
.as_micros(),
partition,
desc.tenant_id,
desc.timeline_id,
candidate.layer.get_tenant_id(),
candidate.layer.get_timeline_id(),
candidate.layer,
);
}
@@ -347,7 +346,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
warned = Some(usage_planned);
}
usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);
usage_planned.add_available_bytes(candidate.layer.file_size());
batched
.entry(TimelineKey(candidate.timeline))
@@ -390,16 +389,15 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
Ok(results) => {
assert_eq!(results.len(), batch.len());
for (result, layer) in results.into_iter().zip(batch.iter()) {
let file_size = layer.layer_desc().file_size;
match result {
Some(Ok(())) => {
usage_assumed.add_available_bytes(file_size);
usage_assumed.add_available_bytes(layer.file_size());
}
Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
unreachable!("get_local_layers_for_disk_usage_eviction finds only local layers")
}
Some(Err(EvictionError::FileNotFound)) => {
evictions_failed.file_sizes += file_size;
evictions_failed.file_sizes += layer.file_size();
evictions_failed.count += 1;
}
Some(Err(
@@ -408,7 +406,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
)) => {
let e = utils::error::report_compact_sources(&e);
warn!(%layer, "failed to evict layer: {e}");
evictions_failed.file_sizes += file_size;
evictions_failed.file_sizes += layer.file_size();
evictions_failed.count += 1;
}
None => {
@@ -547,12 +545,12 @@ async fn collect_eviction_candidates(
// We could be better here, e.g., sum of all L0 layers + most recent L1 layer.
// That's what's typically used by the various background loops.
//
// The default can be overridden with a fixed value in the tenant conf.
// The default can be overriden with a fixed value in the tenant conf.
// A default override can be put in the default tenant conf in the pageserver.toml.
let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() {
debug!(
tenant_id=%tenant.tenant_id(),
overridden_size=s,
overriden_size=s,
"using overridden min resident size for tenant"
);
s

View File

@@ -1,86 +0,0 @@
/// Asynchronously sleep at a failpoint; use with fail::cfg("$name", "return(2000)")
///
/// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
/// specified time (in milliseconds). The main difference is that we use async
/// tokio sleep function. Another difference is that we print lines to the log,
/// which can be useful in tests to check that the failpoint was hit.
///
/// The expansion contains an `.await`, so the macro can only be used inside
/// async code. `$name` must be a string literal.
#[macro_export]
macro_rules! __failpoint_sleep_millis_async {
($name:literal) => {{
// If the failpoint is used with a "return" action, set should_sleep to the
// returned value (as string). Otherwise it's set to None.
//
// NOTE(review): the immediately-invoked closure presumably exists so that
// the early return performed by `fail_point!` only leaves the closure and
// not the caller's function -- confirm against the `fail` crate docs.
let should_sleep = (|| {
::fail::fail_point!($name, |x| x);
::std::option::Option::None
})();
// Sleep if the action was a returned value
if let ::std::option::Option::Some(duration_str) = should_sleep {
$crate::failpoint_support::failpoint_sleep_helper($name, duration_str).await
}
}};
}
pub use __failpoint_sleep_millis_async as sleep_millis_async;
// Helper function used by the macro. (A function has nicer scoping so we
// don't need to decorate everything with "::")
//
// `duration_str` is the value given to the failpoint's "return" action and
// must be a non-negative integer number of milliseconds. Logs one line before
// and one line after the sleep.
//
// Panics if `duration_str` is not a valid `u64`; the panic message names the
// failpoint and the offending value so a misconfigured test is easy to spot.
#[doc(hidden)]
pub(crate) async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
    let millis = duration_str.parse::<u64>().unwrap_or_else(|e| {
        panic!(
            "failpoint {name:?}: invalid sleep duration {duration_str:?}, expected milliseconds as an unsigned integer: {e}"
        )
    });
    let d = std::time::Duration::from_millis(millis);

    tracing::info!("failpoint {:?}: sleeping for {:?}", name, d);
    tokio::time::sleep(d).await;
    tracing::info!("failpoint {:?}: sleep done", name);
}
/// Set up the failpoint scenario and apply the failpoints listed in the
/// `FAILPOINTS` environment variable.
///
/// Returns the `fail::FailScenario` created by `fail::FailScenario::setup()`.
///
/// Panics if an entry has no `=` or if applying an entry fails.
pub fn init() -> fail::FailScenario<'static> {
    // The failpoints lib provides support for parsing the `FAILPOINTS` env var.
    // We want non-default behavior for `exit`, though, so, we handle it separately.
    //
    // Format for FAILPOINTS is "name=actions" separated by ";".
    let actions = std::env::var("FAILPOINTS");
    if actions.is_ok() {
        // Hide the variable from the library so it does not configure the
        // same failpoints a second time with its default semantics.
        std::env::remove_var("FAILPOINTS");
    } else {
        // let the library handle non-utf8, or nothing for not present
    }

    let scenario = fail::FailScenario::setup();

    if let Ok(val) = actions {
        // Enumerate before filtering so the position reported in the panic
        // message still matches the position in the original string.
        for (i, s) in val.split(';').enumerate() {
            // Tolerate a trailing ';' and whitespace around entries instead of
            // panicking with a confusing "missing action" for an empty entry.
            // NOTE(review): the `fail` crate's own FAILPOINTS parser is
            // believed to be equally lenient -- confirm.
            let s = s.trim();
            if s.is_empty() {
                continue;
            }

            let (name, actions) = match s.split_once('=') {
                Some(pair) => pair,
                None => {
                    panic!(
                        "startup failpoints: missing action on the {}th failpoint; try `{s}=return`",
                        i + 1,
                    );
                }
            };

            if let Err(e) = apply_failpoint(name, actions) {
                panic!("startup failpoints: failed to apply failpoint {name}={actions}: {e}");
            }
        }
    }

    scenario
}
/// Configure the failpoint `name` with `actions`.
///
/// The action string `exit` is handled here by installing `exit_failpoint`
/// as a callback; every other action string is passed to `fail::cfg` as is.
/// Returns whatever the underlying `fail` call returns.
pub(crate) fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> {
    match actions {
        "exit" => fail::cfg_callback(name, exit_failpoint),
        other => fail::cfg(name, other),
    }
}
/// Callback for the `exit` failpoint action: logs a line and terminates the
/// process with exit status 1.
#[inline(never)]
fn exit_failpoint() {
    const EXIT_STATUS: i32 = 1;

    tracing::info!("Exit requested by failpoint");
    std::process::exit(EXIT_STATUS)
}

View File

@@ -93,47 +93,6 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/Error"
delete:
description: |
Attempts to delete the specified tenant. 500 and 409 errors should be retried until 404 is returned.
404 means that the deletion finished successfully.
responses:
"400":
description: Error when no tenant id found in path
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"404":
description: Tenant not found
content:
application/json:
schema:
$ref: "#/components/schemas/NotFoundError"
"409":
description: Deletion is already in progress, continue polling
content:
application/json:
schema:
$ref: "#/components/schemas/ConflictError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}/timeline:
parameters:
@@ -861,7 +820,6 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/config:
put:
description: |

View File

@@ -187,7 +187,7 @@ impl From<crate::tenant::DeleteTimelineError> for ApiError {
format!("Cannot delete timeline which has child timelines: {children:?}")
.into_boxed_str(),
),
a @ AlreadyInProgress(_) => ApiError::Conflict(a.to_string()),
a @ AlreadyInProgress => ApiError::Conflict(a.to_string()),
Other(e) => ApiError::InternalServerError(e),
}
}
@@ -208,19 +208,6 @@ impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
}
}
/// Maps tenant-deletion failures onto HTTP API errors.
///
/// - `Get` and `Timeline` delegate to the conversions of their payloads.
/// - `AlreadyInProgress` becomes `ApiError::Conflict`.
/// - `InvalidState` becomes `ApiError::PreconditionFailed`.
/// - `Other` becomes `ApiError::InternalServerError`.
impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
    fn from(value: crate::tenant::delete::DeleteTenantError) -> Self {
        use crate::tenant::delete::DeleteTenantError as E;

        match value {
            // Variants that wrap another error with its own conversion.
            E::Get(get_err) => Self::from(get_err),
            E::Timeline(timeline_err) => Self::from(timeline_err),
            // Variants reported via their Display text.
            in_progress @ E::AlreadyInProgress => Self::Conflict(in_progress.to_string()),
            bad_state @ E::InvalidState(_) => {
                let message = bad_state.to_string();
                Self::PreconditionFailed(message.into_boxed_str())
            }
            // Everything else is an internal error.
            E::Other(other) => Self::InternalServerError(other),
        }
    }
}
// Helper function to construct a TimelineInfo struct for a timeline
async fn build_timeline_info(
timeline: &Arc<Timeline>,
@@ -517,6 +504,7 @@ async fn timeline_delete_handler(
.instrument(info_span!("timeline_delete", %tenant_id, %timeline_id))
.await?;
// FIXME: needs to be an error for console to retry it. Ideally Accepted should be used and retried until 404.
json_response(StatusCode::ACCEPTED, ())
}
@@ -629,23 +617,6 @@ async fn tenant_status(
json_response(StatusCode::OK, tenant_info)
}
/// HTTP handler that deletes a tenant.
///
/// Parses `tenant_id` from the request path, checks the caller's permission
/// for that tenant, then awaits `mgr::delete_tenant` inside a
/// `tenant_delete_handler` tracing span. Responds with `202 Accepted` and an
/// empty JSON body on success; any error is propagated as an `ApiError`.
async fn tenant_delete_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
    // TODO openapi spec
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

    let state = get_state(&request);

    let deletion = mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_id);
    let span = info_span!("tenant_delete_handler", %tenant_id);
    deletion.instrument(span).await?;

    json_response(StatusCode::ACCEPTED, ())
}
/// HTTP endpoint to query the current tenant_size of a tenant.
///
/// This is not used by consumption metrics under [`crate::consumption_metrics`], but can be used
@@ -979,7 +950,14 @@ async fn failpoints_handler(
// We recognize one extra "action" that's not natively recognized
// by the failpoints crate: exit, to immediately kill the process
let cfg_result = crate::failpoint_support::apply_failpoint(&fp.name, &fp.actions);
let cfg_result = if fp.actions == "exit" {
fail::cfg_callback(fp.name, || {
info!("Exit requested by failpoint");
std::process::exit(1);
})
} else {
fail::cfg(fp.name, &fp.actions)
};
if let Err(err_msg) = cfg_result {
return Err(ApiError::BadRequest(anyhow!(
@@ -1016,29 +994,31 @@ async fn timeline_gc_handler(
// Run compaction immediately on given timeline.
async fn timeline_compact_handler(
request: Request<Body>,
cancel: CancellationToken,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_id))?;
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
timeline
.compact(&cancel, &ctx)
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())
}
.instrument(info_span!("manual_compaction", %tenant_id, %timeline_id))
.await
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let result_receiver = mgr::immediate_compact(tenant_id, timeline_id, &ctx)
.await
.context("spawn compaction task")
.map_err(ApiError::InternalServerError)?;
let result: anyhow::Result<()> = result_receiver
.await
.context("receive compaction result")
.map_err(ApiError::InternalServerError)?;
result.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())
}
// Run checkpoint immediately on given timeline.
async fn timeline_checkpoint_handler(
request: Request<Body>,
cancel: CancellationToken,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -1051,13 +1031,13 @@ async fn timeline_checkpoint_handler(
.await
.map_err(ApiError::InternalServerError)?;
timeline
.compact(&cancel, &ctx)
.compact(&ctx)
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())
}
.instrument(info_span!("manual_checkpoint", %tenant_id, %timeline_id))
.instrument(info_span!("manual_checkpoint", tenant_id = %tenant_id, timeline_id = %timeline_id))
.await
}
@@ -1367,9 +1347,6 @@ pub fn make_router(
.get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
.post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
.get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status))
.delete("/v1/tenant/:tenant_id", |r| {
api_handler(r, tenant_delete_handler)
})
.get("/v1/tenant/:tenant_id/synthetic_size", |r| {
api_handler(r, tenant_size_handler)
})

View File

@@ -7,7 +7,7 @@ pub mod disk_usage_eviction_task;
pub mod http;
pub mod import_datadir;
pub mod keyspace;
pub mod metrics;
pub(crate) mod metrics;
pub mod page_cache;
pub mod page_service;
pub mod pgdatadir_mapping;
@@ -21,8 +21,6 @@ pub mod walingest;
pub mod walrecord;
pub mod walredo;
pub mod failpoint_support;
use std::path::Path;
use crate::task_mgr::TaskKind;
@@ -49,54 +47,50 @@ pub use crate::metrics::preinitialize_metrics;
#[tracing::instrument]
pub async fn shutdown_pageserver(exit_code: i32) {
use std::time::Duration;
// Shut down the libpq endpoint task. This prevents new connections from
// being accepted.
timed(
task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None),
"shutdown LibpqEndpointListener",
Duration::from_secs(1),
)
.await;
task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None).await;
// Shut down any page service tasks.
timed(
task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
"shutdown PageRequestHandlers",
Duration::from_secs(1),
)
.await;
task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None).await;
// Shut down all the tenants. This flushes everything to disk and kills
// the checkpoint and GC tasks.
timed(
tenant::mgr::shutdown_all_tenants(),
"shutdown all tenants",
Duration::from_secs(5),
)
.await;
tenant::mgr::shutdown_all_tenants().await;
// Shut down the HTTP endpoint last, so that you can still check the server's
// status while it's shutting down.
// FIXME: We should probably stop accepting commands like attach/detach earlier.
timed(
task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None),
"shutdown http",
Duration::from_secs(1),
)
.await;
task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None).await;
// There should be nothing left, but let's be sure
timed(
task_mgr::shutdown_tasks(None, None, None),
"shutdown leftovers",
Duration::from_secs(1),
)
.await;
task_mgr::shutdown_tasks(None, None, None).await;
info!("Shut down successfully completed");
std::process::exit(exit_code);
}
const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;
/// Sleep for the backoff duration of retry attempt `n`.
///
/// The duration comes from `exponential_backoff_duration_seconds`; when it is
/// not greater than zero, this returns immediately without logging. Otherwise
/// it logs the wait and sleeps for that many seconds.
async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) {
    match exponential_backoff_duration_seconds(n, base_increment, max_seconds) {
        backoff_duration_seconds if backoff_duration_seconds > 0.0 => {
            info!(
                "Backoff: waiting {backoff_duration_seconds} seconds before processing with the task",
            );
            let delay = std::time::Duration::from_secs_f64(backoff_duration_seconds);
            tokio::time::sleep(delay).await;
        }
        _ => {}
    }
}
/// Compute the backoff, in seconds, for retry attempt `n`.
///
/// Attempt 0 never waits. For later attempts the result is
/// `(1 + base_increment)^n`, capped at `max_seconds`.
pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
    // The first attempt is not a retry, so there is nothing to back off from.
    if n == 0 {
        return 0.0;
    }

    let growth_factor = 1.0 + base_increment;
    let uncapped_seconds = growth_factor.powf(f64::from(n));
    uncapped_seconds.min(max_seconds)
}
/// The name of the metadata file pageserver creates per timeline.
/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>/metadata`.
pub const METADATA_FILE_NAME: &str = "metadata";
@@ -115,8 +109,6 @@ pub const TEMP_FILE_SUFFIX: &str = "___temp";
/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>___uninit`.
pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
pub const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";
/// A marker file to prevent pageserver from loading a certain tenant on restart.
/// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding
/// `ignore` management API command, that expects the ignored tenant to be properly loaded
@@ -131,30 +123,15 @@ pub fn is_temporary(path: &Path) -> bool {
}
}
fn ends_with_suffix(path: &Path, suffix: &str) -> bool {
pub fn is_uninit_mark(path: &Path) -> bool {
match path.file_name() {
Some(name) => name.to_string_lossy().ends_with(suffix),
Some(name) => name
.to_string_lossy()
.ends_with(TIMELINE_UNINIT_MARK_SUFFIX),
None => false,
}
}
pub fn is_uninit_mark(path: &Path) -> bool {
ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
}
pub fn is_delete_mark(path: &Path) -> bool {
ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX)
}
/// Returns true when `e` carries an underlying I/O error whose kind is
/// `std::io::ErrorKind::NotFound`; false for any other I/O error kind and for
/// walkdir errors without an I/O error.
fn is_walkdir_io_not_found(e: &walkdir::Error) -> bool {
    matches!(
        e.io_error(),
        Some(io_err) if io_err.kind() == std::io::ErrorKind::NotFound
    )
}
/// During pageserver startup, we need to order operations not to exhaust tokio worker threads by
/// blocking.
///
@@ -170,7 +147,7 @@ pub struct InitializationOrder {
/// Each timeline owns a clone of this to be consumed on the initial logical size calculation
/// attempt. It is important to drop this once the attempt has completed.
pub initial_logical_size_attempt: Option<utils::completion::Completion>,
pub initial_logical_size_attempt: utils::completion::Completion,
/// Barrier for when we can start any background jobs.
///
@@ -178,75 +155,33 @@ pub struct InitializationOrder {
pub background_jobs_can_start: utils::completion::Barrier,
}
/// Drive `fut` to completion and log how long it took.
///
/// If the future finishes within `warn_at`, a single info line is logged.
/// Otherwise an info line is logged as soon as the threshold passes, the
/// future keeps running, and a warning is logged once it finally completes.
/// The future is never cancelled; its output is returned in both cases.
/// `name` is attached to every log line as the `task` field.
async fn timed<Fut: std::future::Future>(
    fut: Fut,
    name: &str,
    warn_at: std::time::Duration,
) -> <Fut as std::future::Future>::Output {
    let started = std::time::Instant::now();

    // Pin in place so the same future can be polled again after the timeout.
    let mut fut = std::pin::pin!(fut);

    if let Ok(ret) = tokio::time::timeout(warn_at, &mut fut).await {
        tracing::info!(
            task = name,
            elapsed_ms = started.elapsed().as_millis(),
            "completed"
        );
        return ret;
    }

    tracing::info!(
        task = name,
        elapsed_ms = started.elapsed().as_millis(),
        "still waiting, taking longer than expected..."
    );

    let ret = fut.await;

    // this has a global allowed_errors
    tracing::warn!(
        task = name,
        elapsed_ms = started.elapsed().as_millis(),
        "completed, took longer than expected"
    );

    ret
}
#[cfg(test)]
mod timed_tests {
use super::timed;
use std::time::Duration;
mod backoff_defaults_tests {
use super::*;
#[tokio::test]
async fn timed_completes_when_inner_future_completes() {
// A future that completes on time should have its result returned
let r1 = timed(
async move {
tokio::time::sleep(Duration::from_millis(10)).await;
123
},
"test 1",
Duration::from_millis(50),
)
.await;
assert_eq!(r1, 123);
#[test]
fn backoff_defaults_produce_growing_backoff_sequence() {
let mut current_backoff_value = None;
// A future that completes too slowly should also have its result returned
let r1 = timed(
async move {
tokio::time::sleep(Duration::from_millis(50)).await;
456
},
"test 1",
Duration::from_millis(10),
)
.await;
assert_eq!(r1, 456);
for i in 0..10_000 {
let new_backoff_value = exponential_backoff_duration_seconds(
i,
DEFAULT_BASE_BACKOFF_SECONDS,
DEFAULT_MAX_BACKOFF_SECONDS,
);
if let Some(old_backoff_value) = current_backoff_value.replace(new_backoff_value) {
assert!(
old_backoff_value <= new_backoff_value,
"{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}"
)
}
}
assert_eq!(
current_backoff_value.expect("Should have produced backoff values to compare"),
DEFAULT_MAX_BACKOFF_SECONDS,
"Given big enough of retries, backoff should reach its allowed max value"
);
}
}

View File

@@ -1,11 +1,12 @@
use metrics::metric_vec_duration::DurationResultObserver;
use metrics::{
register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
register_int_counter, register_int_counter_vec, register_int_gauge, register_int_gauge_vec,
register_uint_gauge, register_uint_gauge_vec, Counter, CounterVec, GaugeVec, Histogram,
HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
register_counter_vec, register_histogram, register_histogram_vec, register_int_counter,
register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge,
register_uint_gauge_vec, Counter, CounterVec, Histogram, HistogramVec, IntCounter,
IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
};
use once_cell::sync::Lazy;
use pageserver_api::models::TenantState;
use strum::VariantNames;
use strum_macros::{EnumVariantNames, IntoStaticStr};
use utils::id::{TenantId, TimelineId};
@@ -73,7 +74,7 @@ pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
// Buckets for background operations like compaction, GC, size calculation
const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0];
pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
pub static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_storage_operations_seconds_global",
"Time spent on storage operations",
@@ -83,17 +84,18 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static READ_NUM_FS_LAYERS: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
static READ_NUM_FS_LAYERS: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_read_num_fs_layers",
"Number of persistent layers accessed for processing a read request, including those in the cache",
&["tenant_id", "timeline_id"],
vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0],
)
.expect("failed to define a metric")
});
// Metrics collected on operations on the storage repository.
pub(crate) static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
pub static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_getpage_reconstruct_seconds",
"Time spent in reconstruct_value (reconstruct a page from deltas)",
@@ -102,7 +104,7 @@ pub(crate) static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
pub static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_materialized_cache_hits_direct_total",
"Number of cache hits from materialized page cache without redo",
@@ -110,16 +112,17 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::n
.expect("failed to define a metric")
});
pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
static GET_RECONSTRUCT_DATA_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_getpage_get_reconstruct_data_seconds",
"Time spent in get_reconstruct_value_data",
&["tenant_id", "timeline_id"],
CRITICAL_OP_BUCKETS.into(),
)
.expect("failed to define a metric")
});
pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
pub static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_materialized_cache_hits_total",
"Number of cache hits from materialized page cache",
@@ -243,10 +246,11 @@ pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheS
},
});
pub(crate) static WAIT_LSN_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
static WAIT_LSN_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_wait_lsn_seconds",
"Time spent waiting for WAL to arrive",
&["tenant_id", "timeline_id"],
CRITICAL_OP_BUCKETS.into(),
)
.expect("failed to define a metric")
@@ -280,7 +284,7 @@ static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
pub static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_remote_ondemand_downloaded_layers_total",
"Total on-demand downloaded layers"
@@ -288,7 +292,7 @@ pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::ne
.unwrap()
});
pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
pub static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_remote_ondemand_downloaded_bytes_total",
"Total bytes of layers on-demand downloaded",
@@ -305,29 +309,16 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
.expect("failed to define current logical size metric")
});
pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
pub static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_tenant_states_count",
"Count of tenants per state",
&["state"]
&["tenant_id", "state"]
)
.expect("Failed to register pageserver_tenant_states_count metric")
});
/// A set of broken tenants.
///
/// These are expected to be so rare that a set is fine. Set as in a new timeseries per each broken
/// tenant.
///
/// Registered as `pageserver_broken_tenants_count` with a single `tenant_id` label.
pub(crate) static BROKEN_TENANTS_SET: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_broken_tenants_count",
        "Set of broken tenants",
        &["tenant_id"]
    )
    // Name the metric that is actually being registered, so a registration
    // panic points at the right definition.
    .expect("Failed to register pageserver_broken_tenants_count metric")
});
pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
pub static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_tenant_synthetic_cached_size_bytes",
"Synthetic size of each tenant in bytes",
@@ -385,7 +376,7 @@ static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy<IntCounterVec> = Lazy::new(||
.expect("failed to define a metric")
});
pub(crate) static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
pub static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_unexpected_ondemand_downloads_count",
"Number of unexpected on-demand downloads. \
@@ -394,35 +385,6 @@ pub(crate) static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(||
.expect("failed to define a metric")
});
/// How long did we take to start up? Broken down by labels to describe
/// different phases of startup.
///
/// Registered as `pageserver_startup_duration_seconds`, a gauge vector with a
/// single `phase` label. Registration happens lazily on first access and
/// panics if it fails.
pub static STARTUP_DURATION: Lazy<GaugeVec> = Lazy::new(|| {
register_gauge_vec!(
"pageserver_startup_duration_seconds",
"Time taken by phases of pageserver startup, in seconds",
&["phase"]
)
.expect("Failed to register pageserver_startup_duration_seconds metric")
});
/// Flag-style gauge: 1 while in the initial startup load of tenants, 0 at
/// other times.
///
/// Registered as `pageserver_startup_is_loading`. Registration happens lazily
/// on first access and panics if it fails.
pub static STARTUP_IS_LOADING: Lazy<UIntGauge> = Lazy::new(|| {
register_uint_gauge!(
"pageserver_startup_is_loading",
"1 while in initial startup load of tenants, 0 at other times"
)
.expect("Failed to register pageserver_startup_is_loading")
});
/// How long did tenants take to go from construction to active state?
///
/// Registered as `pageserver_tenant_activation_seconds`, a histogram using
/// `CRITICAL_OP_BUCKETS` as its buckets. Registration happens lazily on first
/// access and panics if it fails.
pub(crate) static TENANT_ACTIVATION: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_tenant_activation_seconds",
"Time taken by tenants to activate, in seconds",
CRITICAL_OP_BUCKETS.into()
)
.expect("Failed to register pageserver_tenant_activation_seconds metric")
});
/// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
#[derive(Debug)]
pub struct EvictionsWithLowResidenceDuration {
@@ -537,31 +499,23 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
30.000, // 30000 ms
];
/// Tracks time taken by fs operations near VirtualFile.
///
/// Operations:
/// - open ([`std::fs::OpenOptions::open`])
/// - close (dropping [`std::fs::File`])
/// - close-by-replace (close by replacement algorithm)
/// - read (`read_at`)
/// - write (`write_at`)
/// - seek (modify internal position or file length query)
/// - fsync ([`std::fs::File::sync_all`])
/// - metadata ([`std::fs::File::metadata`])
pub(crate) static STORAGE_IO_TIME: Lazy<HistogramVec> = Lazy::new(|| {
const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[
"open", "close", "read", "write", "seek", "fsync", "gc", "metadata",
];
const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
pub static STORAGE_IO_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_io_operations_seconds",
"Time spent in IO operations",
&["operation"],
&["operation", "tenant_id", "timeline_id"],
STORAGE_IO_TIME_BUCKETS.into()
)
.expect("failed to define a metric")
});
const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
// Needed for the https://neonprod.grafana.net/d/5uK9tHL4k/picking-tenant-for-relocation?orgId=1
pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
pub static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_io_operations_bytes_total",
"Total amount of bytes read/written in IO operations",
@@ -651,7 +605,7 @@ static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new
at a given instant. It gives you a better idea of the queue depth \
than plotting the gauge directly, since operations may complete faster \
than the sampling interval.",
&["file_kind", "op_kind"],
&["tenant_id", "timeline_id", "file_kind", "op_kind"],
// The calls_unfinished gauge is an integer gauge, hence we have integer buckets.
vec![0.0, 1.0, 2.0, 4.0, 6.0, 8.0, 10.0, 15.0, 20.0, 40.0, 60.0, 80.0, 100.0, 500.0],
)
@@ -708,18 +662,18 @@ impl RemoteOpFileKind {
}
}
pub(crate) static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
pub static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_remote_operation_seconds",
"Time spent on remote storage operations. \
Grouped by tenant, timeline, operation_kind and status. \
Does not account for time spent waiting in remote timeline client's queues.",
&["file_kind", "op_kind", "status"]
&["tenant_id", "timeline_id", "file_kind", "op_kind", "status"]
)
.expect("failed to define a metric")
});
pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
pub static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_task_events",
"Number of task start/stop/fail events.",
@@ -728,7 +682,7 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
.expect("Failed to register tenant_task_events metric")
});
pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
pub static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_background_loop_period_overrun_count",
"Incremented whenever warn_when_period_overrun() logs a warning.",
@@ -739,7 +693,7 @@ pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = La
// walreceiver metrics
pub(crate) static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
pub static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_walreceiver_started_connections_total",
"Number of started walreceiver connections"
@@ -747,7 +701,7 @@ pub(crate) static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(
.expect("failed to define a metric")
});
pub(crate) static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
pub static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
register_int_gauge!(
"pageserver_walreceiver_active_managers",
"Number of active walreceiver managers"
@@ -755,7 +709,7 @@ pub(crate) static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
pub static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_walreceiver_switches_total",
"Number of walreceiver manager change_connection calls",
@@ -764,7 +718,7 @@ pub(crate) static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
pub static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_walreceiver_broker_updates_total",
"Number of received broker updates in walreceiver"
@@ -772,7 +726,7 @@ pub(crate) static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
pub static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_walreceiver_candidates_events_total",
"Number of walreceiver candidate events",
@@ -781,10 +735,10 @@ pub(crate) static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new
.expect("failed to define a metric")
});
pub(crate) static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
pub static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["add"]));
pub(crate) static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
pub static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"]));
// Metrics collected on WAL redo operations
@@ -831,7 +785,7 @@ macro_rules! redo_bytes_histogram_count_buckets {
};
}
pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
pub static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_seconds",
"Time spent on WAL redo",
@@ -840,7 +794,7 @@ pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
pub static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_wait_seconds",
"Time spent waiting for access to the Postgres WAL redo process",
@@ -849,7 +803,7 @@ pub(crate) static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_records_histogram",
"Histogram of number of records replayed per redo in the Postgres WAL redo process",
@@ -858,7 +812,7 @@ pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
pub static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_bytes_histogram",
"Histogram of number of records replayed per redo sent to Postgres",
@@ -867,8 +821,7 @@ pub(crate) static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
.expect("failed to define a metric")
});
// FIXME: isn't this already included by WAL_REDO_RECORDS_HISTOGRAM which has _count?
pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
pub static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_replayed_wal_records_total",
"Number of WAL records replayed in WAL redo process"
@@ -944,6 +897,7 @@ impl StorageTimeMetrics {
pub struct TimelineMetrics {
tenant_id: String,
timeline_id: String,
pub get_reconstruct_data_time_histo: Histogram,
pub flush_time_histo: StorageTimeMetrics,
pub compact_time_histo: StorageTimeMetrics,
pub create_images_time_histo: StorageTimeMetrics,
@@ -952,7 +906,9 @@ pub struct TimelineMetrics {
pub load_layer_map_histo: StorageTimeMetrics,
pub garbage_collect_histo: StorageTimeMetrics,
pub last_record_gauge: IntGauge,
pub wait_lsn_time_histo: Histogram,
pub resident_physical_size_gauge: UIntGauge,
pub read_num_fs_layers: Histogram,
/// copy of LayeredTimeline.current_logical_size
pub current_logical_size_gauge: UIntGauge,
pub num_persistent_files_created: IntCounter,
@@ -969,6 +925,9 @@ impl TimelineMetrics {
) -> Self {
let tenant_id = tenant_id.to_string();
let timeline_id = timeline_id.to_string();
let get_reconstruct_data_time_histo = GET_RECONSTRUCT_DATA_TIME
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let flush_time_histo =
StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
let compact_time_histo =
@@ -989,6 +948,9 @@ impl TimelineMetrics {
let last_record_gauge = LAST_RECORD_LSN
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let wait_lsn_time_histo = WAIT_LSN_TIME
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
@@ -1004,12 +966,16 @@ impl TimelineMetrics {
let evictions = EVICTIONS
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let read_num_fs_layers = READ_NUM_FS_LAYERS
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let evictions_with_low_residence_duration =
evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);
TimelineMetrics {
tenant_id,
timeline_id,
get_reconstruct_data_time_histo,
flush_time_histo,
compact_time_histo,
create_images_time_histo,
@@ -1018,6 +984,7 @@ impl TimelineMetrics {
garbage_collect_histo,
load_layer_map_histo,
last_record_gauge,
wait_lsn_time_histo,
resident_physical_size_gauge,
current_logical_size_gauge,
num_persistent_files_created,
@@ -1026,6 +993,7 @@ impl TimelineMetrics {
evictions_with_low_residence_duration: std::sync::RwLock::new(
evictions_with_low_residence_duration,
),
read_num_fs_layers,
}
}
}
@@ -1034,12 +1002,15 @@ impl Drop for TimelineMetrics {
fn drop(&mut self) {
let tenant_id = &self.tenant_id;
let timeline_id = &self.timeline_id;
let _ = GET_RECONSTRUCT_DATA_TIME.remove_label_values(&[tenant_id, timeline_id]);
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]);
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]);
let _ = READ_NUM_FS_LAYERS.remove_label_values(&[tenant_id, timeline_id]);
self.evictions_with_low_residence_duration
.write()
@@ -1051,6 +1022,9 @@ impl Drop for TimelineMetrics {
let _ =
STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
}
for op in STORAGE_IO_TIME_OPERATIONS {
let _ = STORAGE_IO_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
}
for op in STORAGE_IO_SIZE_OPERATIONS {
let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, timeline_id]);
@@ -1065,7 +1039,9 @@ impl Drop for TimelineMetrics {
pub fn remove_tenant_metrics(tenant_id: &TenantId) {
let tid = tenant_id.to_string();
let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
// we leave the BROKEN_TENANTS_SET entry if any
for state in TenantState::VARIANTS {
let _ = TENANT_STATE_METRIC.remove_label_values(&[&tid, state]);
}
}
use futures::Future;
@@ -1080,7 +1056,9 @@ pub struct RemoteTimelineClientMetrics {
tenant_id: String,
timeline_id: String,
remote_physical_size_gauge: Mutex<Option<UIntGauge>>,
remote_operation_time: Mutex<HashMap<(&'static str, &'static str, &'static str), Histogram>>,
calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
calls_started_hist: Mutex<HashMap<(&'static str, &'static str), Histogram>>,
bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
}
@@ -1090,13 +1068,14 @@ impl RemoteTimelineClientMetrics {
RemoteTimelineClientMetrics {
tenant_id: tenant_id.to_string(),
timeline_id: timeline_id.to_string(),
remote_operation_time: Mutex::new(HashMap::default()),
calls_unfinished_gauge: Mutex::new(HashMap::default()),
calls_started_hist: Mutex::new(HashMap::default()),
bytes_started_counter: Mutex::new(HashMap::default()),
bytes_finished_counter: Mutex::new(HashMap::default()),
remote_physical_size_gauge: Mutex::new(None),
}
}
pub fn remote_physical_size_gauge(&self) -> UIntGauge {
let mut guard = self.remote_physical_size_gauge.lock().unwrap();
guard
@@ -1110,17 +1089,26 @@ impl RemoteTimelineClientMetrics {
})
.clone()
}
pub fn remote_operation_time(
&self,
file_kind: &RemoteOpFileKind,
op_kind: &RemoteOpKind,
status: &'static str,
) -> Histogram {
let mut guard = self.remote_operation_time.lock().unwrap();
let key = (file_kind.as_str(), op_kind.as_str(), status);
REMOTE_OPERATION_TIME
.get_metric_with_label_values(&[key.0, key.1, key.2])
.unwrap()
let metric = guard.entry(key).or_insert_with(move || {
REMOTE_OPERATION_TIME
.get_metric_with_label_values(&[
&self.tenant_id.to_string(),
&self.timeline_id.to_string(),
key.0,
key.1,
key.2,
])
.unwrap()
});
metric.clone()
}
fn calls_unfinished_gauge(
@@ -1148,10 +1136,19 @@ impl RemoteTimelineClientMetrics {
file_kind: &RemoteOpFileKind,
op_kind: &RemoteOpKind,
) -> Histogram {
let mut guard = self.calls_started_hist.lock().unwrap();
let key = (file_kind.as_str(), op_kind.as_str());
REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST
.get_metric_with_label_values(&[key.0, key.1])
.unwrap()
let metric = guard.entry(key).or_insert_with(move || {
REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST
.get_metric_with_label_values(&[
&self.tenant_id.to_string(),
&self.timeline_id.to_string(),
key.0,
key.1,
])
.unwrap()
});
metric.clone()
}
fn bytes_started_counter(
@@ -1331,10 +1328,15 @@ impl Drop for RemoteTimelineClientMetrics {
tenant_id,
timeline_id,
remote_physical_size_gauge,
remote_operation_time,
calls_unfinished_gauge,
calls_started_hist,
bytes_started_counter,
bytes_finished_counter,
} = self;
for ((a, b, c), _) in remote_operation_time.get_mut().unwrap().drain() {
let _ = REMOTE_OPERATION_TIME.remove_label_values(&[tenant_id, timeline_id, a, b, c]);
}
for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() {
let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[
tenant_id,
@@ -1343,6 +1345,14 @@ impl Drop for RemoteTimelineClientMetrics {
b,
]);
}
for ((a, b), _) in calls_started_hist.get_mut().unwrap().drain() {
let _ = REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST.remove_label_values(&[
tenant_id,
timeline_id,
a,
b,
]);
}
for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() {
let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[
tenant_id,
@@ -1424,51 +1434,15 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
}
pub fn preinitialize_metrics() {
// Python tests need these and on some we do alerting.
//
// FIXME(4813): make it so that we have no top level metrics as this fn will easily fall out of
// order:
// - global metrics reside in a Lazy<PageserverMetrics>
// - access via crate::metrics::PS_METRICS.materialized_page_cache_hit.inc()
// - could move the statics into TimelineMetrics::new()?
// We want to alert on this metric increasing.
// Initialize it eagerly, so that our alert rule can distinguish absence of the metric from metric value 0.
assert_eq!(UNEXPECTED_ONDEMAND_DOWNLOADS.get(), 0);
UNEXPECTED_ONDEMAND_DOWNLOADS.reset();
// counters
[
&MATERIALIZED_PAGE_CACHE_HIT,
&MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
&UNEXPECTED_ONDEMAND_DOWNLOADS,
&WALRECEIVER_STARTED_CONNECTIONS,
&WALRECEIVER_BROKER_UPDATES,
&WALRECEIVER_CANDIDATES_ADDED,
&WALRECEIVER_CANDIDATES_REMOVED,
]
.into_iter()
.for_each(|c| {
Lazy::force(c);
});
// Same as above for this metric, but, it's a Vec-type metric for which we don't know all the labels.
BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT.reset();
// countervecs
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
.into_iter()
.for_each(|c| {
Lazy::force(c);
});
// gauges
WALRECEIVER_ACTIVE_MANAGERS.get();
// histograms
[
&READ_NUM_FS_LAYERS,
&RECONSTRUCT_TIME,
&WAIT_LSN_TIME,
&WAL_REDO_TIME,
&WAL_REDO_WAIT_TIME,
&WAL_REDO_RECORDS_HISTOGRAM,
&WAL_REDO_BYTES_HISTOGRAM,
]
.into_iter()
.for_each(|h| {
Lazy::force(h);
});
// Python tests need these.
MATERIALIZED_PAGE_CACHE_HIT_DIRECT.get();
MATERIALIZED_PAGE_CACHE_HIT.get();
}

View File

@@ -10,42 +10,6 @@
//! PostgreSQL buffer size, and a Slot struct for each buffer to contain
//! information about what's stored in the buffer.
//!
//! # Types Of Pages
//!
//! [`PageCache`] only supports immutable pages.
//! Hence there is no need to worry about coherency.
//!
//! Two types of pages are supported:
//!
//! * **Materialized pages**, filled & used by page reconstruction
//! * **Immutable File pages**, filled & used by [`crate::tenant::block_io`] and [`crate::tenant::ephemeral_file`].
//!
//! Note that [`crate::tenant::ephemeral_file::EphemeralFile`] is generally mutable, but, it's append-only.
//! It uses the page cache only for the blocks that are already fully written and immutable.
//!
//! # Filling The Page Cache
//!
//! Page cache maps from a cache key to a buffer slot.
//! The cache key uniquely identifies the piece of data that is being cached.
//!
//! The cache key for **materialized pages** is [`TenantId`], [`TimelineId`], [`Key`], and [`Lsn`].
//! Use [`PageCache::memorize_materialized_page`] and [`PageCache::lookup_materialized_page`] for fill & access.
//!
//! The cache key for **immutable file** pages is [`FileId`] and a block number.
//! Users of page cache that wish to page-cache an arbitrary (immutable!) on-disk file do the following:
//! * Have a mechanism to deterministically associate the on-disk file with a [`FileId`].
//! * Get a [`FileId`] using [`next_file_id`].
//! * Use the mechanism to associate the on-disk file with the returned [`FileId`].
//! * Use [`PageCache::read_immutable_buf`] to get a [`ReadBufResult`].
//! * If the page was already cached, it'll be the [`ReadBufResult::Found`] variant that contains
//! a read guard for the page. Just use it.
//! * If the page was not cached, it'll be the [`ReadBufResult::NotFound`] variant that contains
//! a write guard for the page. Fill the page with the contents of the on-disk file.
//! Then call [`PageWriteGuard::mark_valid`] to mark the page as valid.
//! Then try again to [`PageCache::read_immutable_buf`].
//! Unless there's high cache pressure, the page should now be cached.
//! (TODO: allow downgrading the write guard to a read guard to ensure forward progress.)
//!
//! # Locking
//!
//! There are two levels of locking involved: There's one lock for the "mapping"
@@ -76,18 +40,20 @@ use std::{
collections::{hash_map::Entry, HashMap},
convert::TryInto,
sync::{
atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
atomic::{AtomicU8, AtomicUsize, Ordering},
RwLock, RwLockReadGuard, RwLockWriteGuard, TryLockError,
},
};
use anyhow::Context;
use once_cell::sync::OnceCell;
use tracing::error;
use utils::{
id::{TenantId, TimelineId},
lsn::Lsn,
};
use crate::tenant::writeback_ephemeral_file;
use crate::{metrics::PageCacheSizeMetrics, repository::Key};
static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
@@ -121,17 +87,6 @@ pub fn get() -> &'static PageCache {
pub const PAGE_SZ: usize = postgres_ffi::BLCKSZ as usize;
const MAX_USAGE_COUNT: u8 = 5;
/// See module-level comment.
///
/// Opaque identifier wrapping a `u64`; values are only produced by [`next_file_id`].
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub struct FileId(u64);
// Counter backing `next_file_id`; starts at 1 and is advanced by one per call.
static NEXT_ID: AtomicU64 = AtomicU64::new(1);
/// See module-level comment.
///
/// Returns a `FileId` holding the current counter value and atomically
/// increments the counter (relaxed ordering) for the next caller.
pub fn next_file_id() -> FileId {
FileId(NEXT_ID.fetch_add(1, Ordering::Relaxed))
}
///
/// CacheKey uniquely identifies a "thing" to cache in the page cache.
///
@@ -142,8 +97,12 @@ enum CacheKey {
hash_key: MaterializedPageHashKey,
lsn: Lsn,
},
EphemeralPage {
file_id: u64,
blkno: u32,
},
ImmutableFilePage {
file_id: FileId,
file_id: u64,
blkno: u32,
},
}
@@ -169,6 +128,7 @@ struct Slot {
struct SlotInner {
key: Option<CacheKey>,
buf: &'static mut [u8; PAGE_SZ],
dirty: bool,
}
impl Slot {
@@ -217,7 +177,9 @@ pub struct PageCache {
/// can have a separate mapping map, next to this field.
materialized_page_map: RwLock<HashMap<MaterializedPageHashKey, Vec<Version>>>,
immutable_page_map: RwLock<HashMap<(FileId, u32), usize>>,
ephemeral_page_map: RwLock<HashMap<(u64, u32), usize>>,
immutable_page_map: RwLock<HashMap<(u64, u32), usize>>,
/// The actual buffers with their metadata.
slots: Box<[Slot]>,
@@ -296,6 +258,14 @@ impl PageWriteGuard<'_> {
);
self.valid = true;
}
/// Flags the page held by this write guard as dirty.
///
/// # Panics
///
/// Panics if the slot's key is not a `CacheKey::EphemeralPage`.
pub fn mark_dirty(&mut self) {
// only ephemeral pages can be dirty ATM.
assert!(matches!(
self.inner.key,
Some(CacheKey::EphemeralPage { .. })
));
// The eviction path checks this flag and writes the page back before reusing the slot.
self.inner.dirty = true;
}
}
impl Drop for PageWriteGuard<'_> {
@@ -310,6 +280,7 @@ impl Drop for PageWriteGuard<'_> {
let self_key = self.inner.key.as_ref().unwrap();
PAGE_CACHE.get().unwrap().remove_mapping(self_key);
self.inner.key = None;
self.inner.dirty = false;
}
}
}
@@ -417,16 +388,50 @@ impl PageCache {
Ok(())
}
// Section 1.2: Public interface functions for working with immutable file pages.
// Section 1.2: Public interface functions for working with Ephemeral pages.
pub fn read_immutable_buf(&self, file_id: FileId, blkno: u32) -> anyhow::Result<ReadBufResult> {
/// Looks up block `blkno` of the ephemeral file `file_id` for reading.
///
/// Builds a `CacheKey::EphemeralPage` and returns whatever `lock_for_read`
/// yields for it, including any error.
pub fn read_ephemeral_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result<ReadBufResult> {
// `lock_for_read` takes the key by mutable reference, hence the `mut` binding.
let mut cache_key = CacheKey::EphemeralPage { file_id, blkno };
self.lock_for_read(&mut cache_key)
}
/// Looks up block `blkno` of the ephemeral file `file_id` for writing.
///
/// Builds a `CacheKey::EphemeralPage` and returns whatever `lock_for_write`
/// yields for it, including any error.
pub fn write_ephemeral_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result<WriteBufResult> {
let cache_key = CacheKey::EphemeralPage { file_id, blkno };
self.lock_for_write(&cache_key)
}
/// Immediately drop all buffers belonging to given file, without writeback
///
/// Visits every slot under its write lock. Slots whose key is an
/// `EphemeralPage` of `drop_file_id` lose their mapping, their key and their
/// dirty flag; all other slots are left untouched.
pub fn drop_buffers_for_ephemeral(&self, drop_file_id: u64) {
    for slot in self.slots.iter() {
        let mut guard = slot.inner.write().unwrap();

        let owned_by_file = matches!(
            &guard.key,
            Some(CacheKey::EphemeralPage { file_id, .. }) if *file_id == drop_file_id
        );
        if !owned_by_file {
            continue;
        }

        // remove mapping for old buffer, then release the slot itself
        if let Some(stale_key) = &guard.key {
            self.remove_mapping(stale_key);
        }
        guard.key = None;
        guard.dirty = false;
    }
}
// Section 1.3: Public interface functions for working with immutable file pages.
/// Looks up block `blkno` of the immutable file `file_id` for reading.
///
/// Builds a `CacheKey::ImmutableFilePage` and returns whatever
/// `lock_for_read` yields for it, including any error.
pub fn read_immutable_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result<ReadBufResult> {
// `lock_for_read` takes the key by mutable reference, hence the `mut` binding.
let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno };
self.lock_for_read(&mut cache_key)
}
/// Immediately drop all buffers belonging to given file
pub fn drop_buffers_for_immutable(&self, drop_file_id: FileId) {
/// Immediately drop all buffers belonging to given file, without writeback
pub fn drop_buffers_for_immutable(&self, drop_file_id: u64) {
for slot_idx in 0..self.slots.len() {
let slot = &self.slots[slot_idx];
@@ -439,6 +444,7 @@ impl PageCache {
// remove mapping for old buffer
self.remove_mapping(key);
inner.key = None;
inner.dirty = false;
}
_ => {}
}
@@ -516,6 +522,10 @@ impl PageCache {
CacheKey::MaterializedPage { .. } => {
unreachable!("Materialized pages use lookup_materialized_page")
}
CacheKey::EphemeralPage { .. } => (
&crate::metrics::PAGE_CACHE.read_accesses_ephemeral,
&crate::metrics::PAGE_CACHE.read_hits_ephemeral,
),
CacheKey::ImmutableFilePage { .. } => (
&crate::metrics::PAGE_CACHE.read_accesses_immutable,
&crate::metrics::PAGE_CACHE.read_hits_immutable,
@@ -556,6 +566,7 @@ impl PageCache {
// Make the slot ready
let slot = &self.slots[slot_idx];
inner.key = Some(cache_key.clone());
inner.dirty = false;
slot.usage_count.store(1, Ordering::Relaxed);
return Ok(ReadBufResult::NotFound(PageWriteGuard {
@@ -617,6 +628,7 @@ impl PageCache {
// Make the slot ready
let slot = &self.slots[slot_idx];
inner.key = Some(cache_key.clone());
inner.dirty = false;
slot.usage_count.store(1, Ordering::Relaxed);
return Ok(WriteBufResult::NotFound(PageWriteGuard {
@@ -655,6 +667,10 @@ impl PageCache {
*lsn = version.lsn;
Some(version.slot_idx)
}
CacheKey::EphemeralPage { file_id, blkno } => {
let map = self.ephemeral_page_map.read().unwrap();
Some(*map.get(&(*file_id, *blkno))?)
}
CacheKey::ImmutableFilePage { file_id, blkno } => {
let map = self.immutable_page_map.read().unwrap();
Some(*map.get(&(*file_id, *blkno))?)
@@ -678,6 +694,10 @@ impl PageCache {
None
}
}
CacheKey::EphemeralPage { file_id, blkno } => {
let map = self.ephemeral_page_map.read().unwrap();
Some(*map.get(&(*file_id, *blkno))?)
}
CacheKey::ImmutableFilePage { file_id, blkno } => {
let map = self.immutable_page_map.read().unwrap();
Some(*map.get(&(*file_id, *blkno))?)
@@ -711,6 +731,12 @@ impl PageCache {
panic!("could not find old key in mapping")
}
}
CacheKey::EphemeralPage { file_id, blkno } => {
let mut map = self.ephemeral_page_map.write().unwrap();
map.remove(&(*file_id, *blkno))
.expect("could not find old key in mapping");
self.size_metrics.current_bytes_ephemeral.sub_page_sz(1);
}
CacheKey::ImmutableFilePage { file_id, blkno } => {
let mut map = self.immutable_page_map.write().unwrap();
map.remove(&(*file_id, *blkno))
@@ -750,7 +776,17 @@ impl PageCache {
}
}
}
CacheKey::EphemeralPage { file_id, blkno } => {
let mut map = self.ephemeral_page_map.write().unwrap();
match map.entry((*file_id, *blkno)) {
Entry::Occupied(entry) => Some(*entry.get()),
Entry::Vacant(entry) => {
entry.insert(slot_idx);
self.size_metrics.current_bytes_ephemeral.add_page_sz(1);
None
}
}
}
CacheKey::ImmutableFilePage { file_id, blkno } => {
let mut map = self.immutable_page_map.write().unwrap();
match map.entry((*file_id, *blkno)) {
@@ -801,8 +837,25 @@ impl PageCache {
}
};
if let Some(old_key) = &inner.key {
if inner.dirty {
if let Err(err) = Self::writeback(old_key, inner.buf) {
// Writing the page to disk failed.
//
// FIXME: What to do here, when? We could propagate the error to the
// caller, but victim buffer is generally unrelated to the original
// call. It can even belong to a different tenant. Currently, we
// report the error to the log and continue the clock sweep to find
// a different victim. But if the problem persists, the page cache
// could fill up with dirty pages that we cannot evict, and we will
// loop retrying the writebacks indefinitely.
error!("writeback of buffer {:?} failed: {}", old_key, err);
continue;
}
}
// remove mapping for old buffer
self.remove_mapping(old_key);
inner.dirty = false;
inner.key = None;
}
return Ok((slot_idx, inner));
@@ -810,6 +863,28 @@ impl PageCache {
}
}
/// Write the contents of a dirty buffer back to its backing storage.
///
/// Only `EphemeralPage` keys can be written back (through
/// `writeback_ephemeral_file`). Any other key kind yields an
/// `std::io::ErrorKind::Other` error, because such pages are never expected
/// to be dirty.
fn writeback(cache_key: &CacheKey, buf: &[u8]) -> Result<(), std::io::Error> {
    // Shared constructor for the "this page kind must never be dirty" errors.
    let unexpected_dirty =
        |what: &'static str| std::io::Error::new(std::io::ErrorKind::Other, what);

    match cache_key {
        CacheKey::EphemeralPage { file_id, blkno } => {
            writeback_ephemeral_file(*file_id, *blkno, buf)
        }
        CacheKey::MaterializedPage { .. } => {
            Err(unexpected_dirty("unexpected dirty materialized page"))
        }
        CacheKey::ImmutableFilePage { .. } => {
            Err(unexpected_dirty("unexpected dirty immutable page"))
        }
    }
}
/// Initialize a new page cache
///
/// This should be called only once at page server startup.
@@ -820,6 +895,7 @@ impl PageCache {
let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
size_metrics.max_bytes.set_page_sz(num_pages);
size_metrics.current_bytes_ephemeral.set_page_sz(0);
size_metrics.current_bytes_immutable.set_page_sz(0);
size_metrics.current_bytes_materialized_page.set_page_sz(0);
@@ -829,7 +905,11 @@ impl PageCache {
let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap();
Slot {
inner: RwLock::new(SlotInner { key: None, buf }),
inner: RwLock::new(SlotInner {
key: None,
buf,
dirty: false,
}),
usage_count: AtomicU8::new(0),
}
})
@@ -837,6 +917,7 @@ impl PageCache {
Self {
materialized_page_map: Default::default(),
ephemeral_page_map: Default::default(),
immutable_page_map: Default::default(),
slots,
next_evict_slot: AtomicUsize::new(0),

View File

@@ -130,25 +130,11 @@ pub static WALRECEIVER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
pub static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("background op worker")
// if you change the number of worker threads please change the constant below
.enable_all()
.build()
.expect("Failed to create background op runtime")
});
/// Number of worker threads assumed for `BACKGROUND_RUNTIME`.
///
/// Taken from the `TOKIO_WORKER_THREADS` environment variable when it can be
/// read, otherwise from the CPU count (at least 1).
pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy<usize> = Lazy::new(|| {
    // force init and thus panics
    let _ = BACKGROUND_RUNTIME.handle();

    // replicates tokio-1.28.1::loom::sys::num_cpus which is not available publicly
    // tokio would have already panicked for parsing errors or NotUnicode
    //
    // this will be wrong if any of the runtimes gets their worker threads configured to something
    // else, but that has not been needed in a long time.
    match std::env::var("TOKIO_WORKER_THREADS") {
        Ok(configured) => configured.parse::<usize>().unwrap(),
        Err(_) => usize::max(1, num_cpus::get()),
    }
});
#[derive(Debug, Clone, Copy)]
pub struct PageserverTaskId(u64);
@@ -559,7 +545,7 @@ pub fn current_task_id() -> Option<PageserverTaskId> {
pub async fn shutdown_watcher() {
let token = SHUTDOWN_TOKEN
.try_with(|t| t.clone())
.expect("shutdown_watcher() called in an unexpected task or thread");
.expect("shutdown_requested() called in an unexpected task or thread");
token.cancelled().await;
}

File diff suppressed because it is too large Load Diff

View File

@@ -16,20 +16,30 @@ use crate::tenant::block_io::{BlockCursor, BlockReader};
use std::cmp::min;
use std::io::{Error, ErrorKind};
impl<R> BlockCursor<R>
/// For reading
///
/// Implementors supply `read_blob_into_buf`; `read_blob` is provided on top
/// of it.
pub trait BlobCursor {
    /// Read a blob into a new buffer.
    ///
    /// Allocates an empty `Vec`, lets `read_blob_into_buf` fill it and hands
    /// it back; any error from `read_blob_into_buf` is returned as is.
    fn read_blob(&mut self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
        let mut blob = Vec::new();
        self.read_blob_into_buf(offset, &mut blob).map(|()| blob)
    }

    /// Read blob into the given buffer. Any previous contents in the buffer
    /// are overwritten.
    fn read_blob_into_buf(
        &mut self,
        offset: u64,
        dstbuf: &mut Vec<u8>,
    ) -> Result<(), std::io::Error>;
}
impl<R> BlobCursor for BlockCursor<R>
where
R: BlockReader,
{
/// Read a blob into a new buffer.
pub async fn read_blob(&self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
let mut buf = Vec::new();
self.read_blob_into_buf(offset, &mut buf).await?;
Ok(buf)
}
/// Read blob into the given buffer. Any previous contents in the buffer
/// are overwritten.
pub async fn read_blob_into_buf(
&self,
fn read_blob_into_buf(
&mut self,
offset: u64,
dstbuf: &mut Vec<u8>,
) -> Result<(), std::io::Error> {

View File

@@ -2,10 +2,12 @@
//! Low-level Block-oriented I/O functions
//!
use crate::page_cache::{self, PageReadGuard, ReadBufResult, PAGE_SZ};
use crate::page_cache;
use crate::page_cache::{ReadBufResult, PAGE_SZ};
use bytes::Bytes;
use std::ops::{Deref, DerefMut};
use std::os::unix::fs::FileExt;
use std::sync::atomic::AtomicU64;
/// This is implemented by anything that can read 8 kB (PAGE_SZ)
/// blocks, using the page cache
@@ -13,12 +15,14 @@ use std::os::unix::fs::FileExt;
/// There are currently two implementations: EphemeralFile, and FileBlockReader
/// below.
pub trait BlockReader {
type BlockLease: Deref<Target = [u8; PAGE_SZ]> + 'static;
///
/// Read a block. Returns a "lease" object that can be used to
/// access to the contents of the page. (For the page cache, the
/// lease object represents a lock on the buffer.)
///
fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error>;
fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error>;
///
/// Create a new "cursor" for reading from this reader.
@@ -37,45 +41,13 @@ impl<B> BlockReader for &B
where
B: BlockReader,
{
fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
type BlockLease = B::BlockLease;
fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
(*self).read_blk(blknum)
}
}
/// Reference to an in-memory copy of an immutable on-disk block.
///
/// Every variant dereferences to a `[u8; PAGE_SZ]` page through the `Deref`
/// impl below.
pub enum BlockLease<'a> {
// The page is held through a page cache read guard.
PageReadGuard(PageReadGuard<'static>),
// The page is a plain borrowed buffer.
// NOTE(review): presumably the not-yet-flushed tail block of an ephemeral file — confirm at the construction site.
EphemeralFileMutableTail(&'a [u8; PAGE_SZ]),
// Test-only variant backed by a reference-counted page.
#[cfg(test)]
Rc(std::rc::Rc<[u8; PAGE_SZ]>),
}
/// Wraps a page cache read guard in `BlockLease::PageReadGuard`.
impl From<PageReadGuard<'static>> for BlockLease<'static> {
fn from(value: PageReadGuard<'static>) -> BlockLease<'static> {
BlockLease::PageReadGuard(value)
}
}
/// Test-only: wraps a reference-counted page in `BlockLease::Rc`.
#[cfg(test)]
impl<'a> From<std::rc::Rc<[u8; PAGE_SZ]>> for BlockLease<'a> {
fn from(value: std::rc::Rc<[u8; PAGE_SZ]>) -> Self {
BlockLease::Rc(value)
}
}
/// Gives read access to the leased page regardless of which variant holds it.
impl<'a> Deref for BlockLease<'a> {
type Target = [u8; PAGE_SZ];
fn deref(&self) -> &Self::Target {
match self {
BlockLease::PageReadGuard(v) => v.deref(),
BlockLease::EphemeralFileMutableTail(v) => v,
#[cfg(test)]
BlockLease::Rc(v) => v.deref(),
}
}
}
///
/// A "cursor" for efficiently reading multiple pages from a BlockReader
///
@@ -108,10 +80,11 @@ where
BlockCursor { reader }
}
pub fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
pub fn read_blk(&mut self, blknum: u32) -> Result<R::BlockLease, std::io::Error> {
self.reader.read_blk(blknum)
}
}
static NEXT_ID: AtomicU64 = AtomicU64::new(1);
/// An adapter for reading a (virtual) file using the page cache.
///
@@ -121,7 +94,7 @@ pub struct FileBlockReader<F> {
pub file: F,
/// Unique ID of this file, used as key in the page cache.
file_id: page_cache::FileId,
file_id: u64,
}
impl<F> FileBlockReader<F>
@@ -129,7 +102,7 @@ where
F: FileExt,
{
pub fn new(file: F) -> Self {
let file_id = page_cache::next_file_id();
let file_id = NEXT_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
FileBlockReader { file_id, file }
}
@@ -145,7 +118,10 @@ impl<F> BlockReader for FileBlockReader<F>
where
F: FileExt,
{
fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
type BlockLease = page_cache::PageReadGuard<'static>;
fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
// Look up the right page
let cache = page_cache::get();
loop {
match cache
@@ -156,7 +132,7 @@ where
format!("Failed to read immutable buf: {e:#}"),
)
})? {
ReadBufResult::Found(guard) => break Ok(guard.into()),
ReadBufResult::Found(guard) => break Ok(guard),
ReadBufResult::NotFound(mut write_guard) => {
// Read the page from disk into the buffer
self.fill_buffer(write_guard.deref_mut(), blknum)?;

View File

@@ -1,610 +0,0 @@
use std::{
path::{Path, PathBuf},
sync::Arc,
};
use anyhow::Context;
use pageserver_api::models::TenantState;
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
use tokio::sync::OwnedMutexGuard;
use tracing::{error, info, instrument, warn, Instrument, Span};
use utils::{
backoff, completion, crashsafe, fs_ext,
id::{TenantId, TimelineId},
};
use crate::{
config::PageServerConf,
context::RequestContext,
task_mgr::{self, TaskKind},
InitializationOrder,
};
use super::{
mgr::{GetTenantError, TenantsMap},
remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
span,
timeline::delete::DeleteTimelineFlow,
tree_sort_timelines, DeleteTimelineError, Tenant,
};
const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u32 = 3;
/// Errors returned by the tenant deletion functions in this file.
#[derive(Debug, thiserror::Error)]
pub enum DeleteTenantError {
/// A `GetTenantError`, converted automatically via `#[from]`.
#[error("GetTenant {0}")]
Get(#[from] GetTenantError),
/// The tenant was in the contained state, which is neither Active nor Broken.
#[error("Invalid state {0}. Expected Active or Broken")]
InvalidState(TenantState),
/// A deletion of this tenant is already running.
#[error("Tenant deletion is already in progress")]
AlreadyInProgress,
/// A `DeleteTimelineError`, converted automatically via `#[from]`.
#[error("Timeline {0}")]
Timeline(#[from] DeleteTimelineError),
/// Any other failure; displayed transparently as the wrapped `anyhow::Error`.
#[error(transparent)]
Other(#[from] anyhow::Error),
}
type DeletionGuard = tokio::sync::OwnedMutexGuard<DeleteTenantFlow>;
/// Computes the remote path of the tenant's deletion marker.
///
/// The local tenant directory is made relative to `conf.workdir`, turned into
/// a `RemotePath`, and extended with a final `deleted` component.
///
/// Fails if the tenant directory is not located under the workdir or if
/// `RemotePath::new` rejects the relative path.
fn remote_tenant_delete_mark_path(
    conf: &PageServerConf,
    tenant_id: &TenantId,
) -> anyhow::Result<RemotePath> {
    let local_tenant_dir = conf.tenant_path(tenant_id);
    let relative_dir = local_tenant_dir
        .strip_prefix(&conf.workdir)
        .context("Failed to strip workdir prefix");
    let tenant_remote_path = relative_dir
        .and_then(RemotePath::new)
        .context("tenant path")?;

    let mark_name = Path::new("deleted");
    Ok(tenant_remote_path.join(mark_name))
}
/// Uploads an empty object to the tenant's remote deletion-marker path.
///
/// The upload goes through `backoff::retry`, using the
/// `FAILED_UPLOAD_WARN_THRESHOLD` and `FAILED_REMOTE_OP_RETRIES` limits; no
/// error is classified as permanent. A final failure is returned with the
/// `mark_upload` context attached.
async fn create_remote_delete_mark(
    conf: &PageServerConf,
    remote_storage: &GenericRemoteStorage,
    tenant_id: &TenantId,
) -> Result<(), DeleteTenantError> {
    let mark_path = remote_tenant_delete_mark_path(conf, tenant_id)?;

    // The marker carries no payload: a zero-length body is uploaded.
    let empty_body: &[u8] = &[];
    let upload_mark = || async {
        remote_storage
            .upload(empty_body, 0, &mark_path, None)
            .await
    };

    let upload_outcome = backoff::retry(
        upload_mark,
        |_e| false,
        FAILED_UPLOAD_WARN_THRESHOLD,
        FAILED_REMOTE_OP_RETRIES,
        "mark_upload",
    )
    .await;
    upload_outcome.context("mark_upload")?;

    Ok(())
}
/// Creates the tenant's local deletion-marker file and makes it durable.
///
/// The file is opened for writing with `create(true)`, closed again right
/// away, and then the file and its parent directory are fsynced via
/// `crashsafe::fsync_file_and_parent`.
async fn create_local_delete_mark(
    conf: &PageServerConf,
    tenant_id: &TenantId,
) -> Result<(), DeleteTenantError> {
    let marker_path = conf.tenant_deleted_mark_file_path(tenant_id);

    // Note: we're ok with the marker file already existing.
    let mut open_options = std::fs::OpenOptions::new();
    open_options.write(true).create(true);
    let marker_file = open_options
        .open(&marker_path)
        .with_context(|| format!("could not create delete marker file {marker_path:?}"))?;
    // Nothing is written to the marker; only its existence matters.
    drop(marker_file);

    crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?;
    Ok(())
}
/// Runs `DeleteTimelineFlow::run` for every timeline of `tenant`, walking the
/// tree-sorted timelines in reverse order.
///
/// Returns the guards and ids of timelines whose deletion was already in
/// progress. Timelines that have vanished in the meantime are skipped; any
/// other timeline deletion error aborts the loop and is returned as
/// `DeleteTenantError::Timeline`.
async fn schedule_ordered_timeline_deletions(
    tenant: &Arc<Tenant>,
) -> Result<Vec<(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>, TimelineId)>, DeleteTenantError> {
    // Tenant is stopping at this point. We know it will be deleted.
    // No new timelines should be created.
    // Tree sort timelines to delete from leaves to the root.
    //
    // NOTE: cloning the map releases the mutex at the end of the statement, which opens a race:
    // a pending deletion can complete and remove its timeline from the map between the clone
    // and `DeleteTimelineFlow::run`, so `run` won't find the timeline in the `timelines` map.
    // `timelines.lock` is currently synchronous, so we can't hold it across an await point.
    // Hence a NotFound error from `run` is simply ignored.
    // Beware: should the lock become async and be held here, `run` also takes it, which can
    // create a deadlock.
    let snapshot = tenant.timelines.lock().unwrap().clone();
    let tree_sorted =
        tree_sort_timelines(snapshot, |t| t.get_ancestor_timeline_id()).context("tree sort")?;

    let mut already_running_deletions = Vec::new();
    for (timeline_id, _) in tree_sorted.into_iter().rev() {
        match DeleteTimelineFlow::run(tenant, timeline_id, true).await {
            Ok(_) => {}
            // Timeline deletion finished after the clone above but before the call
            // to `DeleteTimelineFlow::run` and removed the timeline from the map.
            Err(DeleteTimelineError::NotFound) => {}
            Err(DeleteTimelineError::AlreadyInProgress(guard)) => {
                already_running_deletions.push((guard, timeline_id));
            }
            Err(other) => return Err(DeleteTenantError::Timeline(other)),
        }
    }

    Ok(already_running_deletions)
}
/// Verifies that the timelines directory contains no entries.
///
/// Returns `DeleteTenantError::Other` listing up to the first 10 directory
/// entries when the directory is not empty. Errors from checking or listing
/// the directory are propagated.
async fn ensure_timelines_dir_empty(timelines_path: &Path) -> Result<(), DeleteTenantError> {
    // Assert timelines dir is empty.
    if !fs_ext::is_directory_empty(timelines_path).await? {
        let entries = fs_ext::list_dir(timelines_path)
            .await
            .context("list_dir")?;
        // Display at most the first 10 items in directory. The upper bound must be
        // clamped to the actual length: an unconditional `[..10]` panics whenever
        // the directory holds fewer than 10 entries, which is the common case here.
        let list = &entries[..entries.len().min(10)];
        return Err(DeleteTenantError::Other(anyhow::anyhow!(
            "Timelines directory is not empty after all timelines deletion: {list:?}"
        )));
    }
    Ok(())
}
/// Deletes the tenant deletion mark from remote storage.
///
/// Does nothing when no remote storage is configured. The delete call goes
/// through `backoff::retry` with the same limits as the mark upload.
async fn remove_tenant_remote_delete_mark(
    conf: &PageServerConf,
    remote_storage: Option<&GenericRemoteStorage>,
    tenant_id: &TenantId,
) -> Result<(), DeleteTenantError> {
    let remote_storage = match remote_storage {
        Some(remote_storage) => remote_storage,
        // Without remote storage there is no remote mark to remove.
        None => return Ok(()),
    };

    let mark_path = remote_tenant_delete_mark_path(conf, tenant_id)?;

    let delete_outcome = backoff::retry(
        || async { remote_storage.delete(&mark_path).await },
        |_e| false,
        FAILED_UPLOAD_WARN_THRESHOLD,
        FAILED_REMOTE_OP_RETRIES,
        "remove_tenant_remote_delete_mark",
    )
    .await;

    delete_outcome.context("remove_tenant_remote_delete_mark")?;
    Ok(())
}
// Cleanup fs traces: tenant config, timelines dir, local delete mark, tenant dir.
//
// The removals happen in exactly that order. The tenant directory is fsynced
// before the local delete mark is removed (see the comment further down), and a
// failpoint sits before each step after the first so tests can interrupt the
// sequence at every stage.
async fn cleanup_remaining_fs_traces(
    conf: &PageServerConf,
    tenant_id: &TenantId,
) -> Result<(), DeleteTenantError> {
    // Removes a single file, or a directory via the non-recursive `remove_dir`.
    // Errors are passed through `fs_ext::ignore_not_found` (presumably turning
    // "not found" into success so that retries stay idempotent -- confirm in
    // `fs_ext`) and then annotated with the offending path.
    let rm = |p: PathBuf, is_dir: bool| async move {
        if is_dir {
            tokio::fs::remove_dir(&p).await
        } else {
            tokio::fs::remove_file(&p).await
        }
        .or_else(fs_ext::ignore_not_found)
        .with_context(|| {
            let to_display = p.display();
            format!("failed to delete {to_display}")
        })
    };
    // Step 1: tenant config file.
    rm(conf.tenant_config_path(tenant_id), false).await?;
    fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| {
        Err(anyhow::anyhow!(
            "failpoint: tenant-delete-before-remove-timelines-dir"
        ))?
    });
    // Step 2: timelines directory.
    rm(conf.timelines_path(tenant_id), true).await?;
    fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| {
        Err(anyhow::anyhow!(
            "failpoint: tenant-delete-before-remove-deleted-mark"
        ))?
    });
    // Make sure previous deletions are ordered before mark removal.
    // Otherwise there is no guarantee that they reach the disk before mark deletion.
    // So its possible for mark to reach disk first and for other deletions
    // to be reordered later and thus missed if a crash occurs.
    // Note that we dont need to sync after mark file is removed
    // because we can tolerate the case when mark file reappears on startup.
    let tenant_path = &conf.tenant_path(tenant_id);
    // The tenant directory may already be gone when a previous attempt got this far.
    if tenant_path.exists() {
        crashsafe::fsync_async(&conf.tenant_path(tenant_id))
            .await
            .context("fsync_pre_mark_remove")?;
    }
    // Step 3: local delete mark.
    rm(conf.tenant_deleted_mark_file_path(tenant_id), false).await?;
    fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| {
        Err(anyhow::anyhow!(
            "failpoint: tenant-delete-before-remove-tenant-dir"
        ))?
    });
    // Step 4: the tenant directory itself.
    rm(conf.tenant_path(tenant_id), true).await?;
    Ok(())
}
/// Checks whether the tenant deletion mark is present in remote storage.
///
/// The mark is probed by downloading it through `backoff::retry`. A successful
/// download yields `Ok(true)`, `DownloadError::NotFound` yields `Ok(false)`, and
/// any other download error is returned with the `remote_delete_mark_exists`
/// context attached.
pub(crate) async fn remote_delete_mark_exists(
    conf: &PageServerConf,
    tenant_id: &TenantId,
    remote_storage: &GenericRemoteStorage,
) -> anyhow::Result<bool> {
    // If remote storage is there we rely on it
    let mark_path = remote_tenant_delete_mark_path(conf, tenant_id).context("path")?;

    let download_outcome = backoff::retry(
        || async { remote_storage.download(&mark_path).await },
        |e| matches!(e, DownloadError::NotFound),
        SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
        SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
        "fetch_tenant_deletion_mark",
    )
    .await;

    match download_outcome {
        Ok(_) => Ok(true),
        Err(DownloadError::NotFound) => Ok(false),
        Err(e) => Err(anyhow::anyhow!(e).context("remote_delete_mark_exists")),
    }
}
/// Orchestrates tenant shut down of all tasks, removes its in-memory structures,
/// and deletes its data from both disk and s3.
/// The sequence of steps:
/// 1. Upload remote deletion mark.
/// 2. Create local mark file.
/// 3. Shutdown tasks
/// 4. Run ordered timeline deletions
/// 5. Wait for timeline deletion operations that were scheduled before tenant deletion was requested
/// 6. Remove remote mark
/// 7. Cleanup remaining fs traces: tenant config, timelines dir, local delete mark, tenant dir
/// It is resumable from any step in case a crash/restart occurs.
/// There are three entrypoints to the process:
/// 1. [`DeleteTenantFlow::run`] this is the main one called by a management api handler.
/// 2. [`DeleteTenantFlow::resume_from_load`] is called during restarts when local or remote deletion marks are still there.
/// 3. [`DeleteTenantFlow::resume_from_attach`] is called to resume deletion when the tenant is found to be deleted during the attach process.
/// Note the only other place that messes around timeline delete mark is the `Tenant::spawn_load` function.
///
/// The value itself only records how far the deletion got; it is accessed through
/// the tenant's `delete_progress` lock.
#[derive(Default)]
pub enum DeleteTenantFlow {
    /// No deletion has been requested yet.
    #[default]
    NotStarted,
    /// Set by `mark_in_progress` once a deletion attempt (or a retry of one) starts.
    InProgress,
    /// Set at the very end of the background part, after the tenant was removed
    /// from the tenants map.
    Finished,
}
impl DeleteTenantFlow {
// Main entry point; runs in the context of a management api request handler.
// The long running part of the deletion continues in the background.
//
// NB: If this fails half-way through and is retried, the retry goes through
// all the same steps again. Keep the code here idempotent, and don't
// error out if some of the shutdown tasks have already been completed!
//
// NOTE: `'static` references are required by the background part.
// We assume that calling code sets up the span with tenant_id.
#[instrument(skip_all)]
pub(crate) async fn run(
    conf: &'static PageServerConf,
    remote_storage: Option<GenericRemoteStorage>,
    tenants: &'static tokio::sync::RwLock<TenantsMap>,
    tenant_id: TenantId,
) -> Result<(), DeleteTenantError> {
    span::debug_assert_current_span_has_tenant_id();

    let (tenant, mut guard) = Self::prepare(tenants, tenant_id).await?;

    let foreground_outcome =
        Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await;

    match foreground_outcome {
        Ok(()) => {
            // Foreground steps succeeded: hand the rest over to a background task.
            Self::schedule_background(guard, conf, remote_storage, tenants, tenant);
            Ok(())
        }
        Err(e) => {
            // Mark the tenant broken so that the deletion can be retried.
            tenant.set_broken(format!("{e:#}")).await;
            Err(e)
        }
    }
}
// Helper function needed to be able to match once on returned error and transition tenant into broken state.
// This is needed because tenant.shutdown is not idempotent. If tenant state is set to stopping another call to tenant.shutdown
// will result in an error, but here we need to be able to retry shutdown when tenant deletion is retried.
// So the solution is to set tenant state to broken.
//
// Performs, in order: marking the flow as in progress, uploading the remote
// deletion mark (only when remote storage is configured) and creating the local
// deletion mark. A failpoint precedes each mark creation and one more follows
// the local mark, right before the caller schedules the background part.
async fn run_inner(
    guard: &mut OwnedMutexGuard<Self>,
    conf: &'static PageServerConf,
    remote_storage: Option<&GenericRemoteStorage>,
    tenant: &Tenant,
) -> Result<(), DeleteTenantError> {
    // Fails only if the flow is already `Finished`.
    guard.mark_in_progress()?;
    fail::fail_point!("tenant-delete-before-create-remote-mark", |_| {
        Err(anyhow::anyhow!(
            "failpoint: tenant-delete-before-create-remote-mark"
        ))?
    });
    // IDEA: implement detach as delete without remote storage. Then they would use the same lock (deletion_progress) so wont contend.
    // Though sounds scary, different mark name?
    // Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
    if let Some(remote_storage) = &remote_storage {
        create_remote_delete_mark(conf, remote_storage, &tenant.tenant_id)
            .await
            .context("remote_mark")?
    }
    fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
        Err(anyhow::anyhow!(
            "failpoint: tenant-delete-before-create-local-mark"
        ))?
    });
    create_local_delete_mark(conf, &tenant.tenant_id)
        .await
        .context("local delete mark")?;
    fail::fail_point!("tenant-delete-before-background", |_| {
        Err(anyhow::anyhow!(
            "failpoint: tenant-delete-before-background"
        ))?
    });
    Ok(())
}
/// Moves the flow into the `InProgress` state.
///
/// Both a fresh start (`NotStarted`) and a retry (`InProgress`) are accepted.
/// Being called on a `Finished` flow is a bug and reported as an error.
fn mark_in_progress(&mut self) -> anyhow::Result<()> {
    if matches!(self, Self::Finished) {
        anyhow::bail!("Bug. Is in finished state");
    }
    // Either a fresh start or a retry: the resulting state is the same.
    *self = Self::InProgress;
    Ok(())
}
/// Decides whether an interrupted tenant deletion has to be resumed.
///
/// Returns the acquired deletion guard when a deletion mark is found, locally
/// or in remote storage, and `None` otherwise.
///
/// Panics if the tenant's `delete_progress` lock is already held while a mark
/// is present; during init nobody else is expected to own it.
pub async fn should_resume_deletion(
    conf: &'static PageServerConf,
    remote_storage: Option<&GenericRemoteStorage>,
    tenant: &Tenant,
) -> Result<Option<DeletionGuard>, DeleteTenantError> {
    let tenant_id = tenant.tenant_id;

    // Check local mark first, if its there there is no need to go to s3 to
    // check whether remote one exists.
    let mark_found = if conf.tenant_deleted_mark_file_path(&tenant_id).exists() {
        true
    } else {
        match remote_storage {
            Some(remote_storage) => {
                remote_delete_mark_exists(conf, &tenant_id, remote_storage).await?
            }
            None => false,
        }
    };

    if !mark_found {
        return Ok(None);
    }

    let guard = Arc::clone(&tenant.delete_progress)
        .try_lock_owned()
        .expect("we're the only owner during init");
    Ok(Some(guard))
}
/// Resumes a tenant deletion found during startup.
///
/// The tenant is switched to stopping, then the function waits for the
/// `background_jobs_can_start` barrier (when an initialization order is given),
/// loads the tenant if its timelines directory still exists, and finally runs
/// the background part of the deletion inline.
pub(crate) async fn resume_from_load(
    guard: DeletionGuard,
    tenant: &Arc<Tenant>,
    init_order: Option<&InitializationOrder>,
    tenants: &'static tokio::sync::RwLock<TenantsMap>,
    ctx: &RequestContext,
) -> Result<(), DeleteTenantError> {
    let (_, progress) = completion::channel();

    tenant
        .set_stopping(progress, true, false)
        .await
        .expect("cant be stopping or broken");

    // Do not consume valuable resources during the load phase, continue
    // deletion once init phase is complete.
    if let Some(order) = init_order {
        let background = &order.background_jobs_can_start;
        info!("waiting for backgound jobs barrier");
        background.clone().wait().await;
        info!("ready for backgound jobs barrier");
    }

    // Tenant may not be loadable if we fail late in
    // cleanup_remaining_fs_traces (e g remove timelines dir)
    let timelines_dir_present = tenant.conf.timelines_path(&tenant.tenant_id).exists();
    if timelines_dir_present {
        tenant.load(init_order, ctx).await.context("load")?;
    }

    let conf = tenant.conf;
    let remote_storage = tenant.remote_storage.clone();
    Self::background(guard, conf, remote_storage, tenants, tenant).await
}
/// Resumes a tenant deletion discovered while attaching the tenant.
///
/// The tenant is switched to stopping and attached, after which the background
/// part of the deletion runs inline.
pub(crate) async fn resume_from_attach(
    guard: DeletionGuard,
    tenant: &Arc<Tenant>,
    tenants: &'static tokio::sync::RwLock<TenantsMap>,
    ctx: &RequestContext,
) -> Result<(), DeleteTenantError> {
    let (_, progress) = completion::channel();

    tenant
        .set_stopping(progress, false, true)
        .await
        .expect("cant be stopping or broken");

    tenant.attach(ctx).await.context("attach")?;

    let conf = tenant.conf;
    let remote_storage = tenant.remote_storage.clone();
    Self::background(guard, conf, remote_storage, tenants, tenant).await
}
// Looks up the tenant, validates its state, takes the deletion lock and shuts
// the tenant down. Returns the tenant together with the acquired deletion guard.
//
// Errors:
// - `GetTenantError::NotFound` (converted by `?`) when the tenant is not in the map,
// - `DeleteTenantError::InvalidState` when the tenant is neither Active nor Broken,
// - `DeleteTenantError::AlreadyInProgress` when the deletion lock is already held,
// - `DeleteTenantError::Other` when `tenant.shutdown` reports an error.
async fn prepare(
    tenants: &tokio::sync::RwLock<TenantsMap>,
    tenant_id: TenantId,
) -> Result<(Arc<Tenant>, tokio::sync::OwnedMutexGuard<Self>), DeleteTenantError> {
    // The read lock on the tenants map stays held until this function returns.
    let m = tenants.read().await;
    let tenant = m
        .get(&tenant_id)
        .ok_or(GetTenantError::NotFound(tenant_id))?;
    // FIXME: unsure about active only. Our init jobs may not be cancellable properly,
    // so at least for now allow deletions only for active tenants. TODO recheck
    // Broken is needed for retries.
    // NOTE(review): this comment used to mention Stopping as well, but the check
    // below only lets Active and Broken through -- confirm which one is intended.
    if !matches!(
        tenant.current_state(),
        TenantState::Active | TenantState::Broken { .. }
    ) {
        return Err(DeleteTenantError::InvalidState(tenant.current_state()));
    }
    // `try_lock_owned` never waits: a held lock means another deletion is running.
    let guard = Arc::clone(&tenant.delete_progress)
        .try_lock_owned()
        .map_err(|_| DeleteTenantError::AlreadyInProgress)?;
    fail::fail_point!("tenant-delete-before-shutdown", |_| {
        Err(anyhow::anyhow!("failpoint: tenant-delete-before-shutdown"))?
    });
    // make pageserver shutdown not to wait for our completion
    let (_, progress) = completion::channel();
    // It would be good to only set stopping here and continue shutdown in the background, but shutdown is not idempotent.
    // i e it is an error to do:
    // tenant.set_stopping
    // tenant.shutdown
    // Its also bad that we're holding tenants.read here.
    // TODO relax set_stopping to be idempotent?
    if tenant.shutdown(progress, false).await.is_err() {
        return Err(DeleteTenantError::Other(anyhow::anyhow!(
            "tenant shutdown is already in progress"
        )));
    }
    Ok((Arc::clone(tenant), guard))
}
/// Spawns the background part of the deletion as a `task_mgr` task.
///
/// If the background part fails, the error is logged and the tenant is marked
/// broken; the spawned task itself always completes with `Ok(())`.
fn schedule_background(
    guard: OwnedMutexGuard<Self>,
    conf: &'static PageServerConf,
    remote_storage: Option<GenericRemoteStorage>,
    tenants: &'static tokio::sync::RwLock<TenantsMap>,
    tenant: Arc<Tenant>,
) {
    let tenant_id = tenant.tenant_id;

    // A fresh root span for the task, linked to the span of the request that
    // triggered the deletion.
    let task_span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_id);
    task_span.follows_from(Span::current());

    task_mgr::spawn(
        task_mgr::BACKGROUND_RUNTIME.handle(),
        TaskKind::TimelineDeletionWorker,
        Some(tenant_id),
        None,
        "tenant_delete",
        false,
        async move {
            let outcome =
                Self::background(guard, conf, remote_storage, tenants, &tenant).await;
            if let Err(err) = outcome {
                error!("Error: {err:#}");
                tenant.set_broken(format!("{err:#}")).await;
            }
            Ok(())
        }
        .instrument(task_span),
    );
}
// Long running part of the tenant deletion. Performs, in order:
// 1. deletion of all timelines, leafs first,
// 2. waiting for timeline deletions that were already running,
// 3. a check that the timelines directory (if it still exists) is empty,
// 4. removal of the remote deletion mark,
// 5. cleanup of the remaining local files and directories,
// 6. removal of the tenant from the tenants map,
// and finally sets the flow state to `Finished`.
// Any error aborts the sequence and is returned to the caller.
async fn background(
    mut guard: OwnedMutexGuard<Self>,
    conf: &PageServerConf,
    remote_storage: Option<GenericRemoteStorage>,
    tenants: &'static tokio::sync::RwLock<TenantsMap>,
    tenant: &Arc<Tenant>,
) -> Result<(), DeleteTenantError> {
    // Tree sort timelines, schedule delete for them. Mention retries from the console side.
    // Note that if deletion fails we dont mark timelines as broken,
    // the whole tenant will become broken as by `Self::schedule_background` logic
    let already_running_timeline_deletions = schedule_ordered_timeline_deletions(tenant)
        .await
        .context("schedule_ordered_timeline_deletions")?;
    fail::fail_point!("tenant-delete-before-polling-ongoing-deletions", |_| {
        Err(anyhow::anyhow!(
            "failpoint: tenant-delete-before-polling-ongoing-deletions"
        ))?
    });
    // Wait for deletions that were already running at the moment when tenant deletion was requested.
    // When we can lock deletion guard it means that corresponding timeline deletion finished.
    // (The loop variable `guard` shadows the tenant deletion guard only inside the loop.)
    for (guard, timeline_id) in already_running_timeline_deletions {
        let flow = guard.lock().await;
        // The lock was released without the timeline flow reaching its finished state.
        if !flow.is_finished() {
            return Err(DeleteTenantError::Other(anyhow::anyhow!(
                "already running timeline deletion failed: {timeline_id}"
            )));
        }
    }
    let timelines_path = conf.timelines_path(&tenant.tenant_id);
    // May not exist if we fail in cleanup_remaining_fs_traces after removing it
    if timelines_path.exists() {
        // sanity check to guard against layout changes
        ensure_timelines_dir_empty(&timelines_path)
            .await
            .context("timelines dir not empty")?;
    }
    remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_id).await?;
    fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| {
        Err(anyhow::anyhow!(
            "failpoint: tenant-delete-before-cleanup-remaining-fs-traces"
        ))?
    });
    cleanup_remaining_fs_traces(conf, &tenant.tenant_id)
        .await
        .context("cleanup_remaining_fs_traces")?;
    // Drop the in-memory entry; a missing entry is only worth a warning.
    let mut locked = tenants.write().await;
    if locked.remove(&tenant.tenant_id).is_none() {
        warn!("Tenant got removed from tenants map during deletion");
    };
    // This is the tenant deletion guard again, not the per-timeline one from the loop above.
    *guard = Self::Finished;
    Ok(())
}
}

View File

@@ -20,7 +20,6 @@
//!
use byteorder::{ReadBytesExt, BE};
use bytes::{BufMut, Bytes, BytesMut};
use either::Either;
use hex;
use std::{cmp::Ordering, io, result};
use thiserror::Error;
@@ -231,15 +230,14 @@ where
///
/// Read the value for given key. Returns the value, or None if it doesn't exist.
///
pub async fn get(&self, search_key: &[u8; L]) -> Result<Option<u64>> {
pub fn get(&self, search_key: &[u8; L]) -> Result<Option<u64>> {
let mut result: Option<u64> = None;
self.visit(search_key, VisitDirection::Forwards, |key, value| {
if key == search_key {
result = Some(value);
}
false
})
.await?;
})?;
Ok(result)
}
@@ -248,7 +246,7 @@ where
/// will be called for every key >= 'search_key' (or <= 'search_key', if scanning
/// backwards)
///
pub async fn visit<V>(
pub fn visit<V>(
&self,
search_key: &[u8; L],
dir: VisitDirection,
@@ -257,77 +255,77 @@ where
where
V: FnMut(&[u8], u64) -> bool,
{
let mut stack = Vec::new();
stack.push((self.root_blk, None));
while let Some((node_blknum, opt_iter)) = stack.pop() {
// Locate the node.
let node_buf = self.reader.read_blk(self.start_blk + node_blknum)?;
self.search_recurse(self.root_blk, search_key, dir, &mut visitor)
}
let node = OnDiskNode::deparse(node_buf.as_ref())?;
let prefix_len = node.prefix_len as usize;
let suffix_len = node.suffix_len as usize;
fn search_recurse<V>(
&self,
node_blknum: u32,
search_key: &[u8; L],
dir: VisitDirection,
visitor: &mut V,
) -> Result<bool>
where
V: FnMut(&[u8], u64) -> bool,
{
// Locate the node.
let blk = self.reader.read_blk(self.start_blk + node_blknum)?;
assert!(node.num_children > 0);
// Search all entries on this node
self.search_node(blk.as_ref(), search_key, dir, visitor)
}
let mut keybuf = Vec::new();
keybuf.extend(node.prefix);
keybuf.resize(prefix_len + suffix_len, 0);
fn search_node<V>(
&self,
node_buf: &[u8],
search_key: &[u8; L],
dir: VisitDirection,
visitor: &mut V,
) -> Result<bool>
where
V: FnMut(&[u8], u64) -> bool,
{
let node = OnDiskNode::deparse(node_buf)?;
let prefix_len = node.prefix_len as usize;
let suffix_len = node.suffix_len as usize;
let mut iter = if let Some(iter) = opt_iter {
iter
} else if dir == VisitDirection::Forwards {
// Locate the first match
let idx = match node.binary_search(search_key, keybuf.as_mut_slice()) {
Ok(idx) => idx,
Err(idx) => {
if node.level == 0 {
// Imagine that the node contains the following keys:
//
// 1
// 3 <-- idx
// 5
//
// If the search key is '2' and there is exact match,
// the binary search would return the index of key
// '3'. That's cool, '3' is the first key to return.
idx
} else {
// This is an internal page, so each key represents a lower
// bound for what's in the child page. If there is no exact
// match, we have to return the *previous* entry.
//
// 1 <-- return this
// 3 <-- idx
// 5
idx.saturating_sub(1)
}
}
};
Either::Left(idx..node.num_children.into())
} else {
let idx = match node.binary_search(search_key, keybuf.as_mut_slice()) {
Ok(idx) => {
// Exact match. That's the first entry to return, and walk
// backwards from there.
assert!(node.num_children > 0);
let mut keybuf = Vec::new();
keybuf.extend(node.prefix);
keybuf.resize(prefix_len + suffix_len, 0);
if dir == VisitDirection::Forwards {
// Locate the first match
let mut idx = match node.binary_search(search_key, keybuf.as_mut_slice()) {
Ok(idx) => idx,
Err(idx) => {
if node.level == 0 {
// Imagine that the node contains the following keys:
//
// 1
// 3 <-- idx
// 5
//
// If the search key is '2' and there is exact match,
// the binary search would return the index of key
// '3'. That's cool, '3' is the first key to return.
idx
} else {
// This is an internal page, so each key represents a lower
// bound for what's in the child page. If there is no exact
// match, we have to return the *previous* entry.
//
// 1 <-- return this
// 3 <-- idx
// 5
idx.saturating_sub(1)
}
Err(idx) => {
// No exact match. The binary search returned the index of the
// first key that's > search_key. Back off by one, and walk
// backwards from there.
if let Some(idx) = idx.checked_sub(1) {
idx
} else {
return Ok(false);
}
}
};
Either::Right((0..=idx).rev())
}
};
// idx points to the first match now. Keep going from there
while let Some(idx) = iter.next() {
let key_off = idx * suffix_len;
let mut key_off = idx * suffix_len;
while idx < node.num_children as usize {
let suffix = &node.keys[key_off..key_off + suffix_len];
keybuf[prefix_len..].copy_from_slice(suffix);
let value = node.value(idx);
@@ -338,8 +336,52 @@ where
return Ok(false);
}
} else {
stack.push((node_blknum, Some(iter)));
stack.push((value.to_blknum(), None));
#[allow(clippy::collapsible_if)]
if !self.search_recurse(value.to_blknum(), search_key, dir, visitor)? {
return Ok(false);
}
}
idx += 1;
key_off += suffix_len;
}
} else {
let mut idx = match node.binary_search(search_key, keybuf.as_mut_slice()) {
Ok(idx) => {
// Exact match. That's the first entry to return, and walk
// backwards from there. (The loop below starts from 'idx -
// 1', so add one here to compensate.)
idx + 1
}
Err(idx) => {
// No exact match. The binary search returned the index of the
// first key that's > search_key. Back off by one, and walk
// backwards from there. (The loop below starts from idx - 1,
// so we don't need to subtract one here)
idx
}
};
// idx points to the first match + 1 now. Keep going from there.
let mut key_off = idx * suffix_len;
while idx > 0 {
idx -= 1;
key_off -= suffix_len;
let suffix = &node.keys[key_off..key_off + suffix_len];
keybuf[prefix_len..].copy_from_slice(suffix);
let value = node.value(idx);
#[allow(clippy::collapsible_if)]
if node.level == 0 {
// leaf
if !visitor(&keybuf, value.to_u64()) {
return Ok(false);
}
} else {
#[allow(clippy::collapsible_if)]
if !self.search_recurse(value.to_blknum(), search_key, dir, visitor)? {
return Ok(false);
}
}
if idx == 0 {
break;
}
}
@@ -348,42 +390,39 @@ where
}
#[allow(dead_code)]
pub async fn dump(&self) -> Result<()> {
let mut stack = Vec::new();
pub fn dump(&self) -> Result<()> {
self.dump_recurse(self.root_blk, &[], 0)
}
stack.push((self.root_blk, String::new(), 0, 0, 0));
fn dump_recurse(&self, blknum: u32, path: &[u8], depth: usize) -> Result<()> {
let blk = self.reader.read_blk(self.start_blk + blknum)?;
let buf: &[u8] = blk.as_ref();
while let Some((blknum, path, depth, child_idx, key_off)) = stack.pop() {
let blk = self.reader.read_blk(self.start_blk + blknum)?;
let buf: &[u8] = blk.as_ref();
let node = OnDiskNode::<L>::deparse(buf)?;
let node = OnDiskNode::<L>::deparse(buf)?;
if child_idx == 0 {
print!("{:indent$}", "", indent = depth * 2);
let path_prefix = stack
.iter()
.map(|(_blknum, path, ..)| path.as_str())
.collect::<String>();
println!(
"blk #{blknum}: path {path_prefix}{path}: prefix {}, suffix_len {}",
hex::encode(node.prefix),
node.suffix_len
);
}
print!("{:indent$}", "", indent = depth * 2);
println!(
"blk #{}: path {}: prefix {}, suffix_len {}",
blknum,
hex::encode(path),
hex::encode(node.prefix),
node.suffix_len
);
if child_idx + 1 < node.num_children {
let key_off = key_off + node.suffix_len as usize;
stack.push((blknum, path.clone(), depth, child_idx + 1, key_off));
}
let mut idx = 0;
let mut key_off = 0;
while idx < node.num_children {
let key = &node.keys[key_off..key_off + node.suffix_len as usize];
let val = node.value(child_idx as usize);
let val = node.value(idx as usize);
print!("{:indent$}", "", indent = depth * 2 + 2);
println!("{}: {}", hex::encode(key), hex::encode(val.0));
if node.level > 0 {
stack.push((val.to_blknum(), hex::encode(node.prefix), depth + 1, 0, 0));
let child_path = [path, node.prefix].concat();
self.dump_recurse(val.to_blknum(), &child_path, depth + 1)?;
}
idx += 1;
key_off += node.suffix_len as usize;
}
Ok(())
}
@@ -685,7 +724,6 @@ impl<const L: usize> BuildNode<L> {
#[cfg(test)]
mod tests {
use super::*;
use crate::tenant::block_io::BlockLease;
use rand::Rng;
use std::collections::BTreeMap;
use std::sync::atomic::{AtomicUsize, Ordering};
@@ -700,10 +738,12 @@ mod tests {
}
}
impl BlockReader for TestDisk {
fn read_blk(&self, blknum: u32) -> io::Result<BlockLease> {
type BlockLease = std::rc::Rc<[u8; PAGE_SZ]>;
fn read_blk(&self, blknum: u32) -> io::Result<Self::BlockLease> {
let mut buf = [0u8; PAGE_SZ];
buf.copy_from_slice(&self.blocks[blknum as usize]);
Ok(std::rc::Rc::new(buf).into())
Ok(std::rc::Rc::new(buf))
}
}
impl BlockWriter for &mut TestDisk {
@@ -714,8 +754,8 @@ mod tests {
}
}
#[tokio::test]
async fn basic() -> Result<()> {
#[test]
fn basic() -> Result<()> {
let mut disk = TestDisk::new();
let mut writer = DiskBtreeBuilder::<_, 6>::new(&mut disk);
@@ -735,16 +775,16 @@ mod tests {
let reader = DiskBtreeReader::new(0, root_offset, disk);
reader.dump().await?;
reader.dump()?;
// Test the `get` function on all the keys.
for (key, val) in all_data.iter() {
assert_eq!(reader.get(key).await?, Some(*val));
assert_eq!(reader.get(key)?, Some(*val));
}
// And on some keys that don't exist
assert_eq!(reader.get(b"aaaaaa").await?, None);
assert_eq!(reader.get(b"zzzzzz").await?, None);
assert_eq!(reader.get(b"xaaabx").await?, None);
assert_eq!(reader.get(b"aaaaaa")?, None);
assert_eq!(reader.get(b"zzzzzz")?, None);
assert_eq!(reader.get(b"xaaabx")?, None);
// Test search with `visit` function
let search_key = b"xabaaa";
@@ -755,12 +795,10 @@ mod tests {
.collect();
let mut data = Vec::new();
reader
.visit(search_key, VisitDirection::Forwards, |key, value| {
data.push((key.to_vec(), value));
true
})
.await?;
reader.visit(search_key, VisitDirection::Forwards, |key, value| {
data.push((key.to_vec(), value));
true
})?;
assert_eq!(data, expected);
// Test a backwards scan
@@ -771,20 +809,16 @@ mod tests {
.collect();
expected.reverse();
let mut data = Vec::new();
reader
.visit(search_key, VisitDirection::Backwards, |key, value| {
data.push((key.to_vec(), value));
true
})
.await?;
reader.visit(search_key, VisitDirection::Backwards, |key, value| {
data.push((key.to_vec(), value));
true
})?;
assert_eq!(data, expected);
// Backward scan where nothing matches
reader
.visit(b"aaaaaa", VisitDirection::Backwards, |key, value| {
panic!("found unexpected key {}: {}", hex::encode(key), value);
})
.await?;
reader.visit(b"aaaaaa", VisitDirection::Backwards, |key, value| {
panic!("found unexpected key {}: {}", hex::encode(key), value);
})?;
// Full scan
let expected: Vec<(Vec<u8>, u64)> = all_data
@@ -792,19 +826,17 @@ mod tests {
.map(|(key, value)| (key.to_vec(), *value))
.collect();
let mut data = Vec::new();
reader
.visit(&[0u8; 6], VisitDirection::Forwards, |key, value| {
data.push((key.to_vec(), value));
true
})
.await?;
reader.visit(&[0u8; 6], VisitDirection::Forwards, |key, value| {
data.push((key.to_vec(), value));
true
})?;
assert_eq!(data, expected);
Ok(())
}
#[tokio::test]
async fn lots_of_keys() -> Result<()> {
#[test]
fn lots_of_keys() -> Result<()> {
let mut disk = TestDisk::new();
let mut writer = DiskBtreeBuilder::<_, 8>::new(&mut disk);
@@ -824,7 +856,7 @@ mod tests {
let reader = DiskBtreeReader::new(0, root_offset, disk);
reader.dump().await?;
reader.dump()?;
use std::sync::Mutex;
@@ -845,15 +877,13 @@ mod tests {
for search_key_int in 0..(NUM_KEYS * 2 + 10) {
let search_key = u64::to_be_bytes(search_key_int);
assert_eq!(
reader.get(&search_key).await?,
reader.get(&search_key)?,
all_data.get(&search_key_int).cloned()
);
// Test a forward scan starting with this key
result.lock().unwrap().clear();
reader
.visit(&search_key, VisitDirection::Forwards, take_ten)
.await?;
reader.visit(&search_key, VisitDirection::Forwards, take_ten)?;
let expected = all_data
.range(search_key_int..)
.take(10)
@@ -863,9 +893,7 @@ mod tests {
// And a backwards scan
result.lock().unwrap().clear();
reader
.visit(&search_key, VisitDirection::Backwards, take_ten)
.await?;
reader.visit(&search_key, VisitDirection::Backwards, take_ten)?;
let expected = all_data
.range(..=search_key_int)
.rev()
@@ -879,9 +907,7 @@ mod tests {
let search_key = u64::to_be_bytes(0);
limit.store(usize::MAX, Ordering::Relaxed);
result.lock().unwrap().clear();
reader
.visit(&search_key, VisitDirection::Forwards, take_ten)
.await?;
reader.visit(&search_key, VisitDirection::Forwards, take_ten)?;
let expected = all_data
.iter()
.map(|(&key, &val)| (key, val))
@@ -892,9 +918,7 @@ mod tests {
let search_key = u64::to_be_bytes(u64::MAX);
limit.store(usize::MAX, Ordering::Relaxed);
result.lock().unwrap().clear();
reader
.visit(&search_key, VisitDirection::Backwards, take_ten)
.await?;
reader.visit(&search_key, VisitDirection::Backwards, take_ten)?;
let expected = all_data
.iter()
.rev()
@@ -905,8 +929,8 @@ mod tests {
Ok(())
}
#[tokio::test]
async fn random_data() -> Result<()> {
#[test]
fn random_data() -> Result<()> {
// Generate random keys with exponential distribution, to
// exercise the prefix compression
const NUM_KEYS: usize = 100000;
@@ -933,23 +957,19 @@ mod tests {
// Test get() operation on all the keys
for (&key, &val) in all_data.iter() {
let search_key = u128::to_be_bytes(key);
assert_eq!(reader.get(&search_key).await?, Some(val));
assert_eq!(reader.get(&search_key)?, Some(val));
}
// Test get() operations on random keys, most of which will not exist
for _ in 0..100000 {
let key_int = rand::thread_rng().gen::<u128>();
let search_key = u128::to_be_bytes(key_int);
assert!(reader.get(&search_key).await? == all_data.get(&key_int).cloned());
assert!(reader.get(&search_key)? == all_data.get(&key_int).cloned());
}
// Test boundary cases
assert!(
reader.get(&u128::to_be_bytes(u128::MIN)).await? == all_data.get(&u128::MIN).cloned()
);
assert!(
reader.get(&u128::to_be_bytes(u128::MAX)).await? == all_data.get(&u128::MAX).cloned()
);
assert!(reader.get(&u128::to_be_bytes(u128::MIN))? == all_data.get(&u128::MIN).cloned());
assert!(reader.get(&u128::to_be_bytes(u128::MAX))? == all_data.get(&u128::MAX).cloned());
Ok(())
}
@@ -974,8 +994,8 @@ mod tests {
///
/// This test contains a particular data set, see disk_btree_test_data.rs
///
#[tokio::test]
async fn particular_data() -> Result<()> {
#[test]
fn particular_data() -> Result<()> {
// Build a tree from it
let mut disk = TestDisk::new();
let mut writer = DiskBtreeBuilder::<_, 26>::new(&mut disk);
@@ -991,20 +1011,18 @@ mod tests {
// Test get() operation on all the keys
for (key, val) in disk_btree_test_data::TEST_DATA {
assert_eq!(reader.get(&key).await?, Some(val));
assert_eq!(reader.get(&key)?, Some(val));
}
// Test full scan
let mut count = 0;
reader
.visit(&[0u8; 26], VisitDirection::Forwards, |_key, _value| {
count += 1;
true
})
.await?;
reader.visit(&[0u8; 26], VisitDirection::Forwards, |_key, _value| {
count += 1;
true
})?;
assert_eq!(count, disk_btree_test_data::TEST_DATA.len());
reader.dump().await?;
reader.dump()?;
Ok(())
}

View File

@@ -2,31 +2,46 @@
//! used to keep in-memory layers spilled on disk.
use crate::config::PageServerConf;
use crate::page_cache::{self, PAGE_SZ};
use crate::page_cache::{self, ReadBufResult, WriteBufResult, PAGE_SZ};
use crate::tenant::blob_io::BlobWriter;
use crate::tenant::block_io::{BlockLease, BlockReader};
use crate::tenant::block_io::BlockReader;
use crate::virtual_file::VirtualFile;
use once_cell::sync::Lazy;
use std::cmp::min;
use std::collections::HashMap;
use std::fs::OpenOptions;
use std::io::{self, ErrorKind};
use std::ops::DerefMut;
use std::os::unix::prelude::FileExt;
use std::path::PathBuf;
use std::sync::atomic::AtomicU64;
use std::sync::{Arc, RwLock};
use tracing::*;
use utils::id::{TenantId, TimelineId};
pub struct EphemeralFile {
page_cache_file_id: page_cache::FileId,
use std::os::unix::fs::FileExt;
///
/// This is the global cache of file descriptors (File objects).
///
static EPHEMERAL_FILES: Lazy<RwLock<EphemeralFiles>> = Lazy::new(|| {
RwLock::new(EphemeralFiles {
next_file_id: 1,
files: HashMap::new(),
})
});
pub struct EphemeralFiles {
next_file_id: u64,
files: HashMap<u64, Arc<VirtualFile>>,
}
pub struct EphemeralFile {
file_id: u64,
_tenant_id: TenantId,
_timeline_id: TimelineId,
file: VirtualFile,
size: u64,
/// An ephemeral file is append-only.
/// We keep the last page, which can still be modified, in [`Self::mutable_tail`].
/// The other pages, which can no longer be modified, are accessed through the page cache.
mutable_tail: [u8; PAGE_SZ],
file: Arc<VirtualFile>,
pub size: u64,
}
impl EphemeralFile {
@@ -35,31 +50,71 @@ impl EphemeralFile {
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<EphemeralFile, io::Error> {
static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1);
let filename_disambiguator =
NEXT_FILENAME.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
let mut l = EPHEMERAL_FILES.write().unwrap();
let file_id = l.next_file_id;
l.next_file_id += 1;
let filename = conf
.timeline_path(&tenant_id, &timeline_id)
.join(PathBuf::from(format!("ephemeral-{filename_disambiguator}")));
.join(PathBuf::from(format!("ephemeral-{}", file_id)));
let file = VirtualFile::open_with_options(
&filename,
OpenOptions::new().read(true).write(true).create(true),
)?;
let file_rc = Arc::new(file);
l.files.insert(file_id, file_rc.clone());
Ok(EphemeralFile {
page_cache_file_id: page_cache::next_file_id(),
file_id,
_tenant_id: tenant_id,
_timeline_id: timeline_id,
file,
file: file_rc,
size: 0,
mutable_tail: [0u8; PAGE_SZ],
})
}
pub(crate) fn size(&self) -> u64 {
self.size
fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), io::Error> {
let mut off = 0;
while off < PAGE_SZ {
let n = self
.file
.read_at(&mut buf[off..], blkno as u64 * PAGE_SZ as u64 + off as u64)?;
if n == 0 {
// Reached EOF. Fill the rest of the buffer with zeros.
const ZERO_BUF: [u8; PAGE_SZ] = [0u8; PAGE_SZ];
buf[off..].copy_from_slice(&ZERO_BUF[off..]);
break;
}
off += n;
}
Ok(())
}
/// Return a write guard on the page cache buffer that holds block `blkno` of
/// this file, ready to be modified.
///
/// If the page cache does not have the block yet, the buffer is first loaded
/// from the file (zero-padded past EOF by `fill_buffer`) and marked valid.
/// In both cases the buffer is marked dirty before it is handed out.
fn get_buf_for_write(&self, blkno: u32) -> Result<page_cache::PageWriteGuard, io::Error> {
    let cache = page_cache::get();
    let lookup = cache
        .write_ephemeral_buf(self.file_id, blkno)
        .map_err(|e| to_io_error(e, "Failed to write ephemeral buf"))?;

    let mut page = match lookup {
        // Already cached: use it as it is.
        WriteBufResult::Found(cached) => cached,
        // Not cached: bring the current on-disk contents in before the caller
        // modifies part of the page.
        // TODO: if we're overwriting the whole page, no need to read it in first
        WriteBufResult::NotFound(mut fresh) => {
            self.fill_buffer(fresh.deref_mut(), blkno)?;
            fresh.mark_valid();
            fresh
        }
    };

    page.mark_dirty();
    Ok(page)
}
}
@@ -72,104 +127,121 @@ pub fn is_ephemeral_file(filename: &str) -> bool {
}
}
/// Positioned reads and writes that go through the page cache.
///
/// Both methods touch exactly one page per call: the one containing `offset`.
/// They therefore transfer at most the number of bytes left in that page and
/// return that (possibly short) count.
impl FileExt for EphemeralFile {
/// Copy bytes starting at `offset` into `dstbuf`, stopping at the end of the
/// page that contains `offset`.
///
/// Returns the number of bytes copied, `min(PAGE_SZ - offset % PAGE_SZ, dstbuf.len())`.
/// A page missing from the cache is loaded with `fill_buffer`, which pads with
/// zeros past the end of the file, so reading beyond EOF yields zero bytes
/// rather than a zero-length read.
fn read_at(&self, dstbuf: &mut [u8], offset: u64) -> Result<usize, io::Error> {
// Look up the right page
let blkno = (offset / PAGE_SZ as u64) as u32;
// Position of `offset` inside that page.
let off = offset as usize % PAGE_SZ;
// Never read across the page boundary.
let len = min(PAGE_SZ - off, dstbuf.len());
// Declared up front so that whichever guard we end up holding outlives `buf`,
// which borrows from it.
let read_guard;
let mut write_guard;
let cache = page_cache::get();
let buf = match cache
.read_ephemeral_buf(self.file_id, blkno)
.map_err(|e| to_io_error(e, "Failed to read ephemeral buf"))?
{
ReadBufResult::Found(guard) => {
read_guard = guard;
read_guard.as_ref()
}
ReadBufResult::NotFound(guard) => {
// Read the page from disk into the buffer
write_guard = guard;
self.fill_buffer(write_guard.deref_mut(), blkno)?;
write_guard.mark_valid();
// And then fall through to read the requested slice from the
// buffer.
write_guard.as_ref()
}
};
dstbuf[0..len].copy_from_slice(&buf[off..(off + len)]);
Ok(len)
}
/// Copy bytes from `srcbuf` into the cached page that contains `offset`,
/// stopping at the end of that page.
///
/// Returns the number of bytes copied, `min(PAGE_SZ - offset % PAGE_SZ, srcbuf.len())`.
/// This function only modifies the page cache buffer and marks it dirty; it
/// does not write to the file itself.
/// NOTE(review): the dirty page is presumably written out later via `writeback` — confirm.
fn write_at(&self, srcbuf: &[u8], offset: u64) -> Result<usize, io::Error> {
// Look up the right page
let blkno = (offset / PAGE_SZ as u64) as u32;
// Position of `offset` inside that page.
let off = offset as usize % PAGE_SZ;
// Never write across the page boundary.
let len = min(PAGE_SZ - off, srcbuf.len());
// Declared before `buf` so the guard outlives the slice borrowed from it.
let mut write_guard;
let cache = page_cache::get();
let buf = match cache
.write_ephemeral_buf(self.file_id, blkno)
.map_err(|e| to_io_error(e, "Failed to write ephemeral buf"))?
{
WriteBufResult::Found(guard) => {
write_guard = guard;
write_guard.deref_mut()
}
WriteBufResult::NotFound(guard) => {
// Read the page from disk into the buffer
// TODO: if we're overwriting the whole page, no need to read it in first
write_guard = guard;
self.fill_buffer(write_guard.deref_mut(), blkno)?;
write_guard.mark_valid();
// And then fall through to modify it.
write_guard.deref_mut()
}
};
buf[off..(off + len)].copy_from_slice(&srcbuf[0..len]);
write_guard.mark_dirty();
Ok(len)
}
}
impl BlobWriter for EphemeralFile {
fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, io::Error> {
struct Writer<'a> {
ephemeral_file: &'a mut EphemeralFile,
/// The block to which the next [`push_bytes`] will write.
blknum: u32,
/// The offset inside the block identified by [`blknum`] to which [`push_bytes`] will write.
off: usize,
}
impl<'a> Writer<'a> {
fn new(ephemeral_file: &'a mut EphemeralFile) -> io::Result<Writer<'a>> {
Ok(Writer {
blknum: (ephemeral_file.size / PAGE_SZ as u64) as u32,
off: (ephemeral_file.size % PAGE_SZ as u64) as usize,
ephemeral_file,
})
}
#[inline(always)]
fn push_bytes(&mut self, src: &[u8]) -> Result<(), io::Error> {
let mut src_remaining = src;
while !src_remaining.is_empty() {
let dst_remaining = &mut self.ephemeral_file.mutable_tail[self.off..];
let n = min(dst_remaining.len(), src_remaining.len());
dst_remaining[..n].copy_from_slice(&src_remaining[..n]);
self.off += n;
src_remaining = &src_remaining[n..];
if self.off == PAGE_SZ {
match self.ephemeral_file.file.write_all_at(
&self.ephemeral_file.mutable_tail,
self.blknum as u64 * PAGE_SZ as u64,
) {
Ok(_) => {
// Pre-warm the page cache with what we just wrote.
// This isn't necessary for coherency/correctness, but it's how we've always done it.
let cache = page_cache::get();
match cache.read_immutable_buf(
self.ephemeral_file.page_cache_file_id,
self.blknum,
) {
Ok(page_cache::ReadBufResult::Found(_guard)) => {
// This function takes &mut self, so, it shouldn't be possible to reach this point.
unreachable!("we just wrote blknum {} and this function takes &mut self, so, no concurrent read_blk is possible", self.blknum);
}
Ok(page_cache::ReadBufResult::NotFound(mut write_guard)) => {
let buf: &mut [u8] = write_guard.deref_mut();
debug_assert_eq!(buf.len(), PAGE_SZ);
buf.copy_from_slice(&self.ephemeral_file.mutable_tail);
write_guard.mark_valid();
// pre-warm successful
}
Err(e) => {
error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
// fail gracefully, it's not the end of the world if we can't pre-warm the cache here
}
}
// Zero the buffer for re-use.
// Zeroing is critical for correctness because the write_blob code below
// and similarly read_blk expect zeroed pages.
self.ephemeral_file.mutable_tail.fill(0);
// This block is done, move to next one.
self.blknum += 1;
self.off = 0;
}
Err(e) => {
return Err(std::io::Error::new(
ErrorKind::Other,
// order error before path because path is long and error is short
format!(
"ephemeral_file: write_blob: write-back full tail blk #{}: {:#}: {}",
self.blknum,
e,
self.ephemeral_file.file.path.display(),
),
));
}
}
}
}
Ok(())
}
}
let pos = self.size;
let mut writer = Writer::new(self)?;
let mut blknum = (self.size / PAGE_SZ as u64) as u32;
let mut off = (pos % PAGE_SZ as u64) as usize;
let mut buf = self.get_buf_for_write(blknum)?;
// Write the length field
if srcbuf.len() < 0x80 {
// short one-byte length header
let len_buf = [srcbuf.len() as u8];
writer.push_bytes(&len_buf)?;
buf[off] = srcbuf.len() as u8;
off += 1;
} else {
let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
len_buf[0] |= 0x80;
writer.push_bytes(&len_buf)?;
let thislen = PAGE_SZ - off;
if thislen < 4 {
// it needs to be split across pages
buf[off..(off + thislen)].copy_from_slice(&len_buf[..thislen]);
blknum += 1;
buf = self.get_buf_for_write(blknum)?;
buf[0..4 - thislen].copy_from_slice(&len_buf[thislen..]);
off = 4 - thislen;
} else {
buf[off..off + 4].copy_from_slice(&len_buf);
off += 4;
}
}
// Write the payload
writer.push_bytes(srcbuf)?;
let mut buf_remain = srcbuf;
while !buf_remain.is_empty() {
let mut page_remain = PAGE_SZ - off;
if page_remain == 0 {
blknum += 1;
buf = self.get_buf_for_write(blknum)?;
off = 0;
page_remain = PAGE_SZ;
}
let this_blk_len = min(page_remain, buf_remain.len());
buf[off..(off + this_blk_len)].copy_from_slice(&buf_remain[..this_blk_len]);
off += this_blk_len;
buf_remain = &buf_remain[this_blk_len..];
}
drop(buf);
if srcbuf.len() < 0x80 {
self.size += 1;
@@ -186,74 +258,79 @@ impl Drop for EphemeralFile {
fn drop(&mut self) {
// drop all pages from page cache
let cache = page_cache::get();
cache.drop_buffers_for_immutable(self.page_cache_file_id);
cache.drop_buffers_for_ephemeral(self.file_id);
// remove entry from the hash map
EPHEMERAL_FILES.write().unwrap().files.remove(&self.file_id);
// unlink the file
let res = std::fs::remove_file(&self.file.path);
if let Err(e) = res {
if e.kind() != std::io::ErrorKind::NotFound {
// just never log the not found errors, we cannot do anything for them; on detach
// the tenant directory is already gone.
//
// not found files might also be related to https://github.com/neondatabase/neon/issues/2442
error!(
"could not remove ephemeral file '{}': {}",
self.file.path.display(),
e
);
}
warn!(
"could not remove ephemeral file '{}': {}",
self.file.path.display(),
e
);
}
}
}
impl BlockReader for EphemeralFile {
fn read_blk(&self, blknum: u32) -> Result<BlockLease, io::Error> {
let flushed_blknums = 0..self.size / PAGE_SZ as u64;
if flushed_blknums.contains(&(blknum as u64)) {
let cache = page_cache::get();
loop {
match cache
.read_immutable_buf(self.page_cache_file_id, blknum)
.map_err(|e| {
std::io::Error::new(
std::io::ErrorKind::Other,
// order path before error because error is anyhow::Error => might have many contexts
format!(
"ephemeral file: read immutable page #{}: {}: {:#}",
blknum,
self.file.path.display(),
e,
),
)
})? {
page_cache::ReadBufResult::Found(guard) => {
return Ok(BlockLease::PageReadGuard(guard))
}
page_cache::ReadBufResult::NotFound(mut write_guard) => {
let buf: &mut [u8] = write_guard.deref_mut();
debug_assert_eq!(buf.len(), PAGE_SZ);
self.file
.read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)?;
write_guard.mark_valid();
/// Write the page `buf` to block `blkno` of the ephemeral file registered
/// under `file_id`.
///
/// The file is looked up in `EPHEMERAL_FILES`; the read lock on the registry
/// stays held for the duration of the write.
///
/// # Errors
///
/// Returns an `ErrorKind::Other` error if no file with that id is registered,
/// or if the write itself fails (the message then includes the file path and
/// the underlying error).
pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), io::Error> {
    let registry = EPHEMERAL_FILES.read().unwrap();

    let file = match registry.files.get(&file_id) {
        Some(found) => found,
        None => {
            return Err(io::Error::new(
                ErrorKind::Other,
                "could not write back page, not found in ephemeral files hash",
            ))
        }
    };

    let file_offset = blkno as u64 * PAGE_SZ as u64;
    file.write_all_at(buf, file_offset).map_err(|e| {
        io::Error::new(
            ErrorKind::Other,
            format!(
                "failed to write back to ephemeral file at {} error: {}",
                file.path.display(),
                e
            ),
        )
    })
}
// Swap for read lock
continue;
}
};
}
} else {
debug_assert_eq!(blknum as u64, self.size / PAGE_SZ as u64);
Ok(BlockLease::EphemeralFileMutableTail(&self.mutable_tail))
/// Block-level read access to an ephemeral file, served from the page cache.
impl BlockReader for EphemeralFile {
    type BlockLease = page_cache::PageReadGuard<'static>;

    /// Return a read guard on the cached page holding block `blknum`.
    ///
    /// When the page is not cached yet, it is loaded from the file into the
    /// buffer the cache hands out, marked valid, and the lookup is repeated
    /// to obtain a read guard.
    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, io::Error> {
        let cache = page_cache::get();
        loop {
            let lookup = cache
                .read_ephemeral_buf(self.file_id, blknum)
                .map_err(|e| to_io_error(e, "Failed to read ephemeral buf"))?;

            let mut unfilled = match lookup {
                ReadBufResult::Found(ready) => return Ok(ready),
                ReadBufResult::NotFound(write_guard) => write_guard,
            };

            // Read the page from disk into the buffer
            self.fill_buffer(unfilled.deref_mut(), blknum)?;
            unfilled.mark_valid();
            // The write guard is dropped at the end of this iteration; the
            // next iteration swaps it for a read lock.
        }
    }
}
/// Convert an `anyhow::Error` into an `io::Error` of kind `Other`.
///
/// The message is `context`, a colon, and the error rendered with `{:#}`.
fn to_io_error(e: anyhow::Error, context: &str) -> io::Error {
    let message = format!("{context}: {e:#}");
    io::Error::new(ErrorKind::Other, message)
}
#[cfg(test)]
mod tests {
use super::*;
use crate::tenant::blob_io::BlobWriter;
use crate::tenant::blob_io::{BlobCursor, BlobWriter};
use crate::tenant::block_io::BlockCursor;
use rand::{thread_rng, RngCore};
use rand::{seq::SliceRandom, thread_rng, RngCore};
use std::fs;
use std::str::FromStr;
@@ -274,26 +351,61 @@ mod tests {
Ok((conf, tenant_id, timeline_id))
}
#[tokio::test]
async fn test_ephemeral_blobs() -> Result<(), io::Error> {
// Helper function to read exactly `len` bytes of a file, starting at byte
// `offset`, and return them as a string. Invalid UTF-8 is replaced lossily and
// trailing NUL bytes are trimmed, so callers may ask for more bytes than were
// ever written.
fn read_string(efile: &EphemeralFile, offset: u64, len: usize) -> Result<String, io::Error> {
    let mut buf = vec![0u8; len];
    efile.read_exact_at(&mut buf, offset)?;
    Ok(String::from_utf8_lossy(&buf)
        .trim_end_matches('\0')
        .to_string())
}
/// Writes to and reads from ephemeral files through the `FileExt` interface,
/// first on a single file and then across many files at once.
#[test]
fn test_ephemeral_files() -> Result<(), io::Error> {
    let (conf, tenant_id, timeline_id) = harness("ephemeral_files")?;

    // Two consecutive writes to one file must read back concatenated.
    let file_a = EphemeralFile::create(conf, tenant_id, timeline_id)?;
    file_a.write_all_at(b"foo", 0)?;
    assert_eq!("foo", read_string(&file_a, 0, 20)?);
    file_a.write_all_at(b"bar", 3)?;
    assert_eq!("foobar", read_string(&file_a, 0, 20)?);

    // Open a lot of files, enough to cause some page evictions.
    let mut efiles = Vec::new();
    for fileno in 0..100 {
        let contents = format!("file {}", fileno);
        let efile = EphemeralFile::create(conf, tenant_id, timeline_id)?;
        efile.write_all_at(contents.as_bytes(), 0)?;
        assert_eq!(contents, read_string(&efile, 0, 10)?);
        efiles.push((fileno, efile));
    }

    // Check that all the files can still be read from. Use them in random order for
    // good measure.
    efiles.shuffle(&mut thread_rng());
    for (fileno, efile) in &efiles {
        assert_eq!(format!("file {}", fileno), read_string(efile, 0, 10)?);
    }

    Ok(())
}
#[test]
fn test_ephemeral_blobs() -> Result<(), io::Error> {
let (conf, tenant_id, timeline_id) = harness("ephemeral_blobs")?;
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id)?;
let pos_foo = file.write_blob(b"foo")?;
assert_eq!(
b"foo",
file.block_cursor().read_blob(pos_foo).await?.as_slice()
);
assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice());
let pos_bar = file.write_blob(b"bar")?;
assert_eq!(
b"foo",
file.block_cursor().read_blob(pos_foo).await?.as_slice()
);
assert_eq!(
b"bar",
file.block_cursor().read_blob(pos_bar).await?.as_slice()
);
assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice());
assert_eq!(b"bar", file.block_cursor().read_blob(pos_bar)?.as_slice());
let mut blobs = Vec::new();
for i in 0..10000 {
@@ -308,9 +420,9 @@ mod tests {
blobs.push((pos, data));
}
let cursor = BlockCursor::new(&file);
let mut cursor = BlockCursor::new(&file);
for (pos, expected) in blobs {
let actual = cursor.read_blob(pos).await?;
let actual = cursor.read_blob(pos)?;
assert_eq!(actual, expected);
}
@@ -319,7 +431,7 @@ mod tests {
large_data.resize(20000, 0);
thread_rng().fill_bytes(&mut large_data);
let pos_large = file.write_blob(&large_data)?;
let result = file.block_cursor().read_blob(pos_large).await?;
let result = file.block_cursor().read_blob(pos_large)?;
assert_eq!(result, large_data);
Ok(())

View File

@@ -121,7 +121,7 @@ impl BatchedUpdates<'_> {
///
/// This should be called when the corresponding file on disk has been deleted.
///
pub fn remove_historic(&mut self, layer_desc: &PersistentLayerDesc) {
pub fn remove_historic(&mut self, layer_desc: PersistentLayerDesc) {
self.layer_map.remove_historic_noflush(layer_desc)
}
@@ -253,11 +253,11 @@ impl LayerMap {
///
/// Helper function for BatchedUpdates::remove_historic
///
pub fn remove_historic_noflush(&mut self, layer_desc: &PersistentLayerDesc) {
pub fn remove_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
self.historic
.remove(historic_layer_coverage::LayerKey::from(layer_desc));
.remove(historic_layer_coverage::LayerKey::from(&layer_desc));
let layer_key = layer_desc.key();
if Self::is_l0(layer_desc) {
if Self::is_l0(&layer_desc) {
let len_before = self.l0_delta_layers.len();
let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
l0_delta_layers.retain(|other| other.key() != layer_key);
@@ -626,17 +626,17 @@ impl LayerMap {
/// debugging function to print out the contents of the layer map
#[allow(unused)]
pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
pub fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
println!("Begin dump LayerMap");
println!("open_layer:");
if let Some(open_layer) = &self.open_layer {
open_layer.dump(verbose, ctx).await?;
open_layer.dump(verbose, ctx)?;
}
println!("frozen_layers:");
for frozen_layer in self.frozen_layers.iter() {
frozen_layer.dump(verbose, ctx).await?;
frozen_layer.dump(verbose, ctx)?;
}
println!("historic_layers:");
@@ -766,7 +766,8 @@ mod tests {
expected_in_counts
);
map.batch_update().remove_historic(downloaded.layer_desc());
map.batch_update()
.remove_historic(downloaded.layer_desc().clone());
assert_eq!(count_layer_in(&map, downloaded.layer_desc()), (0, 0));
}

View File

@@ -9,11 +9,10 @@
//! [`remote_timeline_client`]: super::remote_timeline_client
use std::fs::{File, OpenOptions};
use std::io::{self, Write};
use std::io::Write;
use anyhow::{bail, ensure, Context};
use serde::{Deserialize, Serialize};
use thiserror::Error;
use tracing::info_span;
use utils::bin_ser::SerializeError;
use utils::{
@@ -268,24 +267,24 @@ pub fn save_metadata(
Ok(())
}
/// Error type for loading a timeline's metadata file.
///
/// Both variants are `#[error(transparent)]` wrappers with `#[from]`
/// conversions, so `?` can be applied directly to the underlying results.
#[derive(Error, Debug)]
pub enum LoadMetadataError {
/// An I/O error, e.g. from reading the metadata file.
#[error(transparent)]
Read(#[from] io::Error),
/// Any other failure reported as `anyhow::Error`, e.g. from decoding the bytes.
#[error(transparent)]
Decode(#[from] anyhow::Error),
}
pub fn load_metadata(
conf: &'static PageServerConf,
tenant_id: &TenantId,
timeline_id: &TimelineId,
) -> Result<TimelineMetadata, LoadMetadataError> {
) -> anyhow::Result<TimelineMetadata> {
let metadata_path = conf.metadata_path(tenant_id, timeline_id);
let metadata_bytes = std::fs::read(metadata_path)?;
Ok(TimelineMetadata::from_bytes(&metadata_bytes)?)
let metadata_bytes = std::fs::read(&metadata_path).with_context(|| {
format!(
"Failed to read metadata bytes from path {}",
metadata_path.display()
)
})?;
TimelineMetadata::from_bytes(&metadata_bytes).with_context(|| {
format!(
"Failed to parse metadata bytes from path {}",
metadata_path.display()
)
})
}
#[cfg(test)]

View File

@@ -20,19 +20,15 @@ use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::{self, TaskKind};
use crate::tenant::config::TenantConfOpt;
use crate::tenant::delete::DeleteTenantFlow;
use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState};
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};
use utils::fs_ext::PathExt;
use utils::id::{TenantId, TimelineId};
use super::delete::DeleteTenantError;
use super::timeline::delete::DeleteTimelineFlow;
/// The tenants known to the pageserver.
/// The enum variants are used to distinguish the different states that the pageserver can be in.
pub(crate) enum TenantsMap {
enum TenantsMap {
/// [`init_tenant_mgr`] is not done yet.
Initializing,
/// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded.
@@ -44,13 +40,13 @@ pub(crate) enum TenantsMap {
}
impl TenantsMap {
pub(crate) fn get(&self, tenant_id: &TenantId) -> Option<&Arc<Tenant>> {
fn get(&self, tenant_id: &TenantId) -> Option<&Arc<Tenant>> {
match self {
TenantsMap::Initializing => None,
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.get(tenant_id),
}
}
pub(crate) fn remove(&mut self, tenant_id: &TenantId) -> Option<Arc<Tenant>> {
fn remove(&mut self, tenant_id: &TenantId) -> Option<Arc<Tenant>> {
match self {
TenantsMap::Initializing => None,
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.remove(tenant_id),
@@ -99,9 +95,7 @@ pub async fn init_tenant_mgr(
);
}
} else {
// This case happens if we:
// * crash during attach before creating the attach marker file
// * crash during tenant delete before removing tenant directory
// This case happens if we crash during attach before creating the attach marker file
let is_empty = tenant_dir_path.is_empty_dir().with_context(|| {
format!("Failed to check whether {tenant_dir_path:?} is an empty dir")
})?;
@@ -128,7 +122,6 @@ pub async fn init_tenant_mgr(
broker_client.clone(),
remote_storage.clone(),
Some(init_order.clone()),
&TENANTS,
&ctx,
) {
Ok(tenant) => {
@@ -159,13 +152,12 @@ pub async fn init_tenant_mgr(
Ok(())
}
pub(crate) fn schedule_local_tenant_processing(
pub fn schedule_local_tenant_processing(
conf: &'static PageServerConf,
tenant_path: &Path,
broker_client: storage_broker::BrokerClientChannel,
remote_storage: Option<GenericRemoteStorage>,
init_order: Option<InitializationOrder>,
tenants: &'static tokio::sync::RwLock<TenantsMap>,
ctx: &RequestContext,
) -> anyhow::Result<Arc<Tenant>> {
anyhow::ensure!(
@@ -201,8 +193,7 @@ pub(crate) fn schedule_local_tenant_processing(
let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
if let Some(remote_storage) = remote_storage {
match Tenant::spawn_attach(conf, tenant_id, broker_client, tenants, remote_storage, ctx)
{
match Tenant::spawn_attach(conf, tenant_id, broker_client, remote_storage, ctx) {
Ok(tenant) => tenant,
Err(e) => {
error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
@@ -226,7 +217,6 @@ pub(crate) fn schedule_local_tenant_processing(
broker_client,
remote_storage,
init_order,
tenants,
ctx,
)
};
@@ -274,77 +264,71 @@ async fn shutdown_all_tenants0(tenants: &tokio::sync::RwLock<TenantsMap>) {
}
};
let started_at = std::time::Instant::now();
let mut join_set = JoinSet::new();
for (tenant_id, tenant) in tenants_to_shut_down {
join_set.spawn(
async move {
let freeze_and_flush = true;
// ordering shouldn't matter for this, either we store true right away or never
let ordering = std::sync::atomic::Ordering::Relaxed;
let joined_other = std::sync::atomic::AtomicBool::new(false);
let res = {
let (_guard, shutdown_progress) = completion::channel();
tenant.shutdown(shutdown_progress, freeze_and_flush).await
let mut shutdown = std::pin::pin!(async {
let freeze_and_flush = true;
let res = {
let (_guard, shutdown_progress) = completion::channel();
tenant.shutdown(shutdown_progress, freeze_and_flush).await
};
if let Err(other_progress) = res {
// join the other shutdown that is already in progress
joined_other.store(true, ordering);
other_progress.wait().await;
}
});
// in practice we might not have a lot time to go, since systemd is going to
// SIGKILL us at 10s, but we can try. delete tenant might take a while, so put out
// a warning.
let warning = std::time::Duration::from_secs(5);
let mut warning = std::pin::pin!(tokio::time::sleep(warning));
tokio::select! {
_ = &mut shutdown => {},
_ = &mut warning => {
let joined_other = joined_other.load(ordering);
warn!(%joined_other, "waiting for the shutdown to complete");
shutdown.await;
}
};
if let Err(other_progress) = res {
// join the other shutdown that is already in progress
other_progress.wait().await;
}
// we cannot afford per tenant logging here, because if s3 is degraded, we are
// going to log too many lines
debug!("tenant successfully stopped");
}
.instrument(info_span!("shutdown", %tenant_id)),
);
}
let total = join_set.len();
let mut panicked = 0;
let mut buffering = true;
const BUFFER_FOR: std::time::Duration = std::time::Duration::from_millis(500);
let mut buffered = std::pin::pin!(tokio::time::sleep(BUFFER_FOR));
while !join_set.is_empty() {
tokio::select! {
Some(joined) = join_set.join_next() => {
match joined {
Ok(()) => {}
Err(join_error) if join_error.is_cancelled() => {
unreachable!("we are not cancelling any of the futures");
}
Err(join_error) if join_error.is_panic() => {
// cannot really do anything, as this panic is likely a bug
panicked += 1;
}
Err(join_error) => {
warn!("unknown kind of JoinError: {join_error}");
}
}
if !buffering {
// buffer so that every 500ms since the first update (or starting) we'll log
// how far away we are; this is because we will get SIGKILL'd at 10s, and we
// are not able to log *then*.
buffering = true;
buffered.as_mut().reset(tokio::time::Instant::now() + BUFFER_FOR);
}
},
_ = &mut buffered, if buffering => {
buffering = false;
info!(remaining = join_set.len(), total, elapsed_ms = started_at.elapsed().as_millis(), "waiting for tenants to shutdown");
while let Some(res) = join_set.join_next().await {
match res {
Ok(()) => {}
Err(join_error) if join_error.is_cancelled() => {
unreachable!("we are not cancelling any of the futures");
}
Err(join_error) if join_error.is_panic() => {
// cannot really do anything, as this panic is likely a bug
panicked += 1;
}
Err(join_error) => {
warn!("unknown kind of JoinError: {join_error}");
}
}
}
if panicked > 0 {
warn!(
panicked,
total, "observed panicks while shutting down tenants"
);
warn!(panicked, "observed panicks while shutting down tenants");
}
// caller will log how long we took
}
pub async fn create_tenant(
@@ -364,7 +348,7 @@ pub async fn create_tenant(
// See https://github.com/neondatabase/neon/issues/4233
let created_tenant =
schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, None, &TENANTS, ctx)?;
schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, None, ctx)?;
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
// See https://github.com/neondatabase/neon/issues/4233
@@ -425,14 +409,6 @@ pub async fn get_tenant(
}
}
pub async fn delete_tenant(
conf: &'static PageServerConf,
remote_storage: Option<GenericRemoteStorage>,
tenant_id: TenantId,
) -> Result<(), DeleteTenantError> {
DeleteTenantFlow::run(conf, remote_storage, &TENANTS, tenant_id).await
}
#[derive(Debug, thiserror::Error)]
pub enum DeleteTimelineError {
#[error("Tenant {0}")]
@@ -445,10 +421,12 @@ pub enum DeleteTimelineError {
pub async fn delete_timeline(
tenant_id: TenantId,
timeline_id: TimelineId,
_ctx: &RequestContext,
ctx: &RequestContext,
) -> Result<(), DeleteTimelineError> {
let tenant = get_tenant(tenant_id, true).await?;
DeleteTimelineFlow::run(&tenant, timeline_id, false).await?;
tenant
.prepare_and_schedule_delete_timeline(timeline_id, ctx)
.await?;
Ok(())
}
@@ -523,7 +501,7 @@ pub async fn load_tenant(
.with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?;
}
let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, None, &TENANTS, ctx)
let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, None, ctx)
.with_context(|| {
format!("Failed to schedule tenant processing in path {tenant_path:?}")
})?;
@@ -604,7 +582,7 @@ pub async fn attach_tenant(
.context("check for attach marker file existence")?;
anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");
let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), None, &TENANTS, ctx)?;
let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), None, ctx)?;
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
// See https://github.com/neondatabase/neon/issues/4233
@@ -790,6 +768,55 @@ pub async fn immediate_gc(
Ok(wait_task_done)
}
pub async fn immediate_compact(
tenant_id: TenantId,
timeline_id: TimelineId,
ctx: &RequestContext,
) -> Result<tokio::sync::oneshot::Receiver<anyhow::Result<()>>, ApiError> {
let guard = TENANTS.read().await;
let tenant = guard
.get(&tenant_id)
.map(Arc::clone)
.with_context(|| format!("tenant {tenant_id}"))
.map_err(|e| ApiError::NotFound(e.into()))?;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(|e| ApiError::NotFound(e.into()))?;
// Run in task_mgr to avoid race with tenant_detach operation
let ctx = ctx.detached_child(TaskKind::Compaction, DownloadBehavior::Download);
let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
task_mgr::spawn(
&tokio::runtime::Handle::current(),
TaskKind::Compaction,
Some(tenant_id),
Some(timeline_id),
&format!(
"timeline_compact_handler compaction run for tenant {tenant_id} timeline {timeline_id}"
),
false,
async move {
let result = timeline
.compact(&ctx)
.instrument(info_span!("manual_compact", %tenant_id, %timeline_id))
.await;
match task_done.send(result) {
Ok(_) => (),
Err(result) => error!("failed to send compaction result: {result:?}"),
}
Ok(())
},
);
// drop the guard until after we've spawned the task so that timeline shutdown will wait for the task
drop(guard);
Ok(wait_task_done)
}
#[cfg(test)]
mod tests {
use std::collections::HashMap;

View File

@@ -211,9 +211,6 @@ use chrono::{NaiveDateTime, Utc};
// re-export these
pub use download::{is_temp_download_file, list_remote_timelines};
use scopeguard::ScopeGuard;
use utils::backoff::{
self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
};
use std::collections::{HashMap, VecDeque};
use std::path::Path;
@@ -222,6 +219,7 @@ use std::sync::{Arc, Mutex};
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
use std::ops::DerefMut;
use tokio::runtime::Runtime;
use tracing::{debug, error, info, instrument, warn};
use tracing::{info_span, Instrument};
use utils::lsn::Lsn;
@@ -243,6 +241,7 @@ use crate::{
tenant::upload_queue::{
UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask,
},
{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS},
};
use utils::id::{TenantId, TimelineId};
@@ -257,12 +256,12 @@ use super::upload_queue::SetDeletedFlagProgress;
// But after FAILED_DOWNLOAD_WARN_THRESHOLD retries, we start to log it at WARN
// level instead, as repeated failures can mean a more serious problem. If it
// fails more than FAILED_DOWNLOAD_RETRIES times, we give up
pub(crate) const FAILED_DOWNLOAD_WARN_THRESHOLD: u32 = 3;
pub(crate) const FAILED_REMOTE_OP_RETRIES: u32 = 10;
const FAILED_DOWNLOAD_WARN_THRESHOLD: u32 = 3;
const FAILED_DOWNLOAD_RETRIES: u32 = 10;
// Similarly log failed uploads and deletions at WARN level, after this many
// retries. Uploads and deletions are retried forever, though.
pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
pub enum MaybeDeletedIndexPart {
IndexPart(IndexPart),
@@ -310,7 +309,7 @@ pub enum PersistIndexPartWithDeletedFlagError {
pub struct RemoteTimelineClient {
conf: &'static PageServerConf,
runtime: tokio::runtime::Handle,
runtime: &'static Runtime,
tenant_id: TenantId,
timeline_id: TimelineId,
@@ -337,7 +336,7 @@ impl RemoteTimelineClient {
) -> RemoteTimelineClient {
RemoteTimelineClient {
conf,
runtime: BACKGROUND_RUNTIME.handle().to_owned(),
runtime: &BACKGROUND_RUNTIME,
tenant_id,
timeline_id,
storage_impl: remote_storage,
@@ -515,7 +514,7 @@ impl RemoteTimelineClient {
/// updated metadata.
///
/// The upload will be added to the queue immediately, but it
/// won't be performed until all previously scheduled layer file
/// won't be performed until all previously scheduled layer file
/// upload operations have completed successfully. This is to
/// ensure that when the index file claims that layers X, Y and Z
/// exist in remote storage, they really do. To wait for the upload
@@ -626,7 +625,7 @@ impl RemoteTimelineClient {
/// Note: This schedules an index file upload before the deletions. The
/// deletion won't actually be performed, until any previously scheduled
/// upload operations, and the index file upload, have completed
/// successfully.
/// successfully.
pub fn schedule_layer_file_deletion(
self: &Arc<Self>,
names: &[LayerFileName],
@@ -753,24 +752,12 @@ impl RemoteTimelineClient {
pausable_failpoint!("persist_deleted_index_part");
backoff::retry(
|| async {
upload::upload_index_part(
self.conf,
&self.storage_impl,
&self.tenant_id,
&self.timeline_id,
&index_part_with_deleted_at,
)
.await
},
|_e| false,
1,
// have just a couple of attempts
// when executed as part of timeline deletion this happens in context of api call
// when executed as part of tenant deletion this happens in the background
2,
"persist_index_part_with_deleted_flag",
upload::upload_index_part(
self.conf,
&self.storage_impl,
&self.tenant_id,
&self.timeline_id,
&index_part_with_deleted_at,
)
.await?;
@@ -840,26 +827,17 @@ impl RemoteTimelineClient {
)
};
receiver.changed().await.context("upload queue shut down")?;
receiver.changed().await?;
// Do not delete index part yet, it is needed for possible retry. If we remove it first
// and retry will arrive to different pageserver there wont be any traces of it on remote storage
let timeline_path = self.conf.timeline_path(&self.tenant_id, &self.timeline_id);
let timeline_storage_path = self.conf.remote_path(&timeline_path)?;
let remaining = backoff::retry(
|| async {
self.storage_impl
.list_files(Some(&timeline_storage_path))
.await
},
|_e| false,
FAILED_DOWNLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"list_prefixes",
)
.await
.context("list prefixes")?;
let remaining = self
.storage_impl
.list_prefixes(Some(&timeline_storage_path))
.await?;
let remaining: Vec<RemotePath> = remaining
.into_iter()
@@ -874,42 +852,13 @@ impl RemoteTimelineClient {
.collect();
if !remaining.is_empty() {
backoff::retry(
|| async { self.storage_impl.delete_objects(&remaining).await },
|_e| false,
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"delete_objects",
)
.await
.context("delete_objects")?;
self.storage_impl.delete_objects(&remaining).await?;
}
fail::fail_point!("timeline-delete-before-index-delete", |_| {
Err(anyhow::anyhow!(
"failpoint: timeline-delete-before-index-delete"
))?
});
let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));
debug!("deleting index part");
backoff::retry(
|| async { self.storage_impl.delete(&index_file_path).await },
|_e| false,
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"delete_index",
)
.await
.context("delete_index")?;
fail::fail_point!("timeline-delete-after-index-delete", |_| {
Err(anyhow::anyhow!(
"failpoint: timeline-delete-after-index-delete"
))?
});
self.storage_impl.delete(&index_file_path).await?;
info!(prefix=%timeline_storage_path, referenced=deletions_queued, not_referenced=%remaining.len(), "done deleting in timeline prefix, including index_part.json");
@@ -993,7 +942,7 @@ impl RemoteTimelineClient {
let tenant_id = self.tenant_id;
let timeline_id = self.timeline_id;
task_mgr::spawn(
&self.runtime,
self.runtime.handle(),
TaskKind::RemoteUploadTask,
Some(self.tenant_id),
Some(self.timeline_id),
@@ -1156,7 +1105,7 @@ impl RemoteTimelineClient {
debug!("remote task {} completed successfully", task.op);
}
// The task has completed successfully. Remove it from the in-progress list.
// The task has completed succesfully. Remove it from the in-progress list.
{
let mut upload_queue_guard = self.upload_queue.lock().unwrap();
let upload_queue = match upload_queue_guard.deref_mut() {
@@ -1346,7 +1295,7 @@ mod tests {
context::RequestContext,
tenant::{
harness::{TenantHarness, TIMELINE_ID},
Tenant, Timeline,
Tenant,
},
DEFAULT_PG_VERSION,
};
@@ -1355,6 +1304,7 @@ mod tests {
collections::HashSet,
path::{Path, PathBuf},
};
use tokio::runtime::EnterGuard;
use utils::lsn::Lsn;
pub(super) fn dummy_contents(name: &str) -> Vec<u8> {
@@ -1404,25 +1354,35 @@ mod tests {
}
struct TestSetup {
runtime: &'static tokio::runtime::Runtime,
entered_runtime: EnterGuard<'static>,
harness: TenantHarness,
tenant: Arc<Tenant>,
timeline: Arc<Timeline>,
tenant_ctx: RequestContext,
remote_fs_dir: PathBuf,
client: Arc<RemoteTimelineClient>,
}
impl TestSetup {
async fn new(test_name: &str) -> anyhow::Result<Self> {
fn new(test_name: &str) -> anyhow::Result<Self> {
// Use a current-thread runtime in the test
let runtime = Box::leak(Box::new(
tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()?,
));
let entered_runtime = runtime.enter();
let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}")));
let harness = TenantHarness::create(test_name)?;
let (tenant, ctx) = harness.load().await;
let (tenant, ctx) = runtime.block_on(harness.load());
// create an empty timeline directory
let timeline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
.await?;
let _ = runtime.block_on(tenant.create_test_timeline(
TIMELINE_ID,
Lsn(8),
DEFAULT_PG_VERSION,
&ctx,
))?;
let remote_fs_dir = harness.conf.workdir.join("remote_fs");
std::fs::create_dir_all(remote_fs_dir)?;
@@ -1444,7 +1404,7 @@ mod tests {
let client = Arc::new(RemoteTimelineClient {
conf: harness.conf,
runtime: tokio::runtime::Handle::current(),
runtime,
tenant_id: harness.tenant_id,
timeline_id: TIMELINE_ID,
storage_impl: storage,
@@ -1456,9 +1416,10 @@ mod tests {
});
Ok(Self {
runtime,
entered_runtime,
harness,
tenant,
timeline,
tenant_ctx: ctx,
remote_fs_dir,
client,
@@ -1467,8 +1428,8 @@ mod tests {
}
// Test scheduling
#[tokio::test]
async fn upload_scheduling() {
#[test]
fn upload_scheduling() -> anyhow::Result<()> {
// Test outline:
//
// Schedule upload of a bunch of layers. Check that they are started immediately, not queued
@@ -1484,26 +1445,25 @@ mod tests {
// Schedule index upload. Check that it's queued
let TestSetup {
runtime,
entered_runtime: _entered_runtime,
harness,
tenant: _tenant,
timeline: _timeline,
tenant_ctx: _tenant_ctx,
remote_fs_dir,
client,
} = TestSetup::new("upload_scheduling").await.unwrap();
} = TestSetup::new("upload_scheduling").unwrap();
let timeline_path = harness.timeline_path(&TIMELINE_ID);
println!("workdir: {}", harness.conf.workdir.display());
let remote_timeline_dir =
remote_fs_dir.join(timeline_path.strip_prefix(&harness.conf.workdir).unwrap());
remote_fs_dir.join(timeline_path.strip_prefix(&harness.conf.workdir)?);
println!("remote_timeline_dir: {}", remote_timeline_dir.display());
let metadata = dummy_metadata(Lsn(0x10));
client
.init_upload_queue_for_empty_remote(&metadata)
.unwrap();
client.init_upload_queue_for_empty_remote(&metadata)?;
// Create a couple of dummy files, schedule upload for them
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
@@ -1512,32 +1472,26 @@ mod tests {
let content_1 = dummy_contents("foo");
let content_2 = dummy_contents("bar");
let content_3 = dummy_contents("baz");
std::fs::write(
timeline_path.join(layer_file_name_1.file_name()),
&content_1,
)?;
std::fs::write(
timeline_path.join(layer_file_name_2.file_name()),
&content_2,
)?;
std::fs::write(timeline_path.join(layer_file_name_3.file_name()), content_3)?;
for (filename, content) in [
(&layer_file_name_1, &content_1),
(&layer_file_name_2, &content_2),
(&layer_file_name_3, &content_3),
] {
std::fs::write(timeline_path.join(filename.file_name()), content).unwrap();
}
client
.schedule_layer_file_upload(
&layer_file_name_1,
&LayerFileMetadata::new(content_1.len() as u64),
)
.unwrap();
client
.schedule_layer_file_upload(
&layer_file_name_2,
&LayerFileMetadata::new(content_2.len() as u64),
)
.unwrap();
client.schedule_layer_file_upload(
&layer_file_name_1,
&LayerFileMetadata::new(content_1.len() as u64),
)?;
client.schedule_layer_file_upload(
&layer_file_name_2,
&LayerFileMetadata::new(content_2.len() as u64),
)?;
// Check that they are started immediately, not queued
//
// this works because we are running within block_on, so any futures are queued up until
// our next await point.
{
let mut guard = client.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut().unwrap();
@@ -1551,9 +1505,7 @@ mod tests {
// Schedule upload of index. Check that it is queued
let metadata = dummy_metadata(Lsn(0x20));
client
.schedule_index_upload_for_metadata_update(&metadata)
.unwrap();
client.schedule_index_upload_for_metadata_update(&metadata)?;
{
let mut guard = client.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut().unwrap();
@@ -1562,7 +1514,7 @@ mod tests {
}
// Wait for the uploads to finish
client.wait_completion().await.unwrap();
runtime.block_on(client.wait_completion())?;
{
let mut guard = client.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut().unwrap();
@@ -1572,7 +1524,7 @@ mod tests {
}
// Download back the index.json, and check that the list of files is correct
let index_part = match client.download_index_file().await.unwrap() {
let index_part = match runtime.block_on(client.download_index_file())? {
MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
};
@@ -1584,19 +1536,17 @@ mod tests {
&layer_file_name_2.file_name(),
],
);
let downloaded_metadata = index_part.parse_metadata().unwrap();
let downloaded_metadata = index_part.parse_metadata()?;
assert_eq!(downloaded_metadata, metadata);
// Schedule upload and then a deletion. Check that the deletion is queued
client
.schedule_layer_file_upload(
&layer_file_name_3,
&LayerFileMetadata::new(content_3.len() as u64),
)
.unwrap();
client
.schedule_layer_file_deletion(&[layer_file_name_1.clone()])
.unwrap();
let content_baz = dummy_contents("baz");
std::fs::write(timeline_path.join("baz"), &content_baz)?;
client.schedule_layer_file_upload(
&layer_file_name_3,
&LayerFileMetadata::new(content_baz.len() as u64),
)?;
client.schedule_layer_file_deletion(&[layer_file_name_1.clone()])?;
{
let mut guard = client.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut().unwrap();
@@ -1618,7 +1568,7 @@ mod tests {
);
// Finish them
client.wait_completion().await.unwrap();
runtime.block_on(client.wait_completion())?;
assert_remote_files(
&[
@@ -1628,24 +1578,23 @@ mod tests {
],
&remote_timeline_dir,
);
Ok(())
}
#[tokio::test]
async fn bytes_unfinished_gauge_for_layer_file_uploads() {
#[test]
fn bytes_unfinished_gauge_for_layer_file_uploads() -> anyhow::Result<()> {
// Setup
let TestSetup {
runtime,
harness,
tenant: _tenant,
timeline: _timeline,
client,
..
} = TestSetup::new("metrics").await.unwrap();
} = TestSetup::new("metrics")?;
let metadata = dummy_metadata(Lsn(0x10));
client
.init_upload_queue_for_empty_remote(&metadata)
.unwrap();
client.init_upload_queue_for_empty_remote(&metadata)?;
let timeline_path = harness.timeline_path(&TIMELINE_ID);
@@ -1654,8 +1603,7 @@ mod tests {
std::fs::write(
timeline_path.join(layer_file_name_1.file_name()),
&content_1,
)
.unwrap();
)?;
#[derive(Debug, PartialEq)]
struct BytesStartedFinished {
@@ -1681,16 +1629,14 @@ mod tests {
let init = get_bytes_started_stopped();
client
.schedule_layer_file_upload(
&layer_file_name_1,
&LayerFileMetadata::new(content_1.len() as u64),
)
.unwrap();
client.schedule_layer_file_upload(
&layer_file_name_1,
&LayerFileMetadata::new(content_1.len() as u64),
)?;
let pre = get_bytes_started_stopped();
client.wait_completion().await.unwrap();
runtime.block_on(client.wait_completion())?;
let post = get_bytes_started_stopped();
@@ -1718,5 +1664,7 @@ mod tests {
finished: Some(content_1.len())
}
);
Ok(())
}
}

View File

@@ -11,17 +11,23 @@ use std::time::Duration;
use anyhow::{anyhow, Context};
use tokio::fs;
use tokio::io::AsyncWriteExt;
use utils::{backoff, crashsafe};
use tracing::{info, warn};
use crate::config::PageServerConf;
use crate::tenant::storage_layer::LayerFileName;
use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
use remote_storage::{DownloadError, GenericRemoteStorage};
use utils::crashsafe::path_with_suffix_extension;
use utils::id::{TenantId, TimelineId};
use super::index::{IndexPart, LayerFileMetadata};
use super::{FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES};
use super::{FAILED_DOWNLOAD_RETRIES, FAILED_DOWNLOAD_WARN_THRESHOLD};
/// Opens the file at `path` and calls `sync_all` on the resulting handle.
///
/// Returns the first I/O error encountered, either from opening the file or
/// from the sync itself.
async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
    let file = fs::File::open(path).await?;
    file.sync_all().await
}
static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);
@@ -146,7 +152,7 @@ pub async fn download_layer_file<'a>(
})
.map_err(DownloadError::Other)?;
crashsafe::fsync_async(&local_path)
fsync_path(&local_path)
.await
.with_context(|| format!("Could not fsync layer file {}", local_path.display(),))
.map_err(DownloadError::Other)?;
@@ -262,6 +268,7 @@ pub(super) async fn download_index_part(
Ok(index_part)
}
///
/// Helper function to handle retries for a download operation.
///
/// Remote operations can fail due to rate limits (IAM, S3), spurious network
@@ -269,17 +276,47 @@ pub(super) async fn download_index_part(
/// with backoff.
///
/// (See similar logic for uploads in `perform_upload_task`)
async fn download_retry<T, O, F>(op: O, description: &str) -> Result<T, DownloadError>
async fn download_retry<T, O, F>(mut op: O, description: &str) -> Result<T, DownloadError>
where
O: FnMut() -> F,
F: Future<Output = Result<T, DownloadError>>,
{
backoff::retry(
op,
|e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound),
FAILED_DOWNLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
description,
)
.await
let mut attempts = 0;
loop {
let result = op().await;
match result {
Ok(_) => {
if attempts > 0 {
info!("{description} succeeded after {attempts} retries");
}
return result;
}
// These are "permanent" errors that should not be retried.
Err(DownloadError::BadInput(_)) | Err(DownloadError::NotFound) => {
return result;
}
// Assume that any other failure might be transient, and the operation might
// succeed if we just keep trying.
Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_WARN_THRESHOLD => {
info!("{description} failed, will retry (attempt {attempts}): {err:#}");
}
Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_RETRIES => {
warn!("{description} failed, will retry (attempt {attempts}): {err:#}");
}
Err(DownloadError::Other(ref err)) => {
// Operation failed FAILED_DOWNLOAD_RETRIES times. Time to give up.
warn!("{description} still failed after {attempts} retries, giving up: {err:?}");
return result;
}
}
// sleep and retry
exponential_backoff(
attempts,
DEFAULT_BASE_BACKOFF_SECONDS,
DEFAULT_MAX_BACKOFF_SECONDS,
)
.await;
attempts += 1;
}
}

View File

@@ -223,45 +223,6 @@ mod tests {
assert_eq!(part, expected);
}
#[test]
fn v2_indexpart_is_parsed_with_deleted_at() {
let example = r#"{
"version":2,
"timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
"missing_layers":["This shouldn't fail deserialization"],
"layer_metadata":{
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
},
"disk_consistent_lsn":"0/16960E8",
"metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
"deleted_at": "2023-07-31T09:00:00.123"
}"#;
let expected = IndexPart {
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
version: 2,
timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
file_size: 25600000,
}),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
// serde_json should always parse this but this might be a double with jq for
// example.
file_size: 9007199254741001,
})
]),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
};
let part = serde_json::from_str::<IndexPart>(example).unwrap();
assert_eq!(part, expected);
}
#[test]
fn empty_layers_are_parsed() {
let empty_layers_json = r#"{

View File

@@ -8,8 +8,8 @@ mod layer_desc;
mod remote_layer;
use crate::config::PageServerConf;
use crate::context::{AccessStatsBehavior, RequestContext};
use crate::repository::Key;
use crate::context::RequestContext;
use crate::repository::{Key, Value};
use crate::task_mgr::TaskKind;
use crate::walrecord::NeonWalRecord;
use anyhow::Result;
@@ -34,7 +34,7 @@ use utils::{
lsn::Lsn,
};
pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef};
pub use delta_layer::{DeltaLayer, DeltaLayerWriter};
pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
pub use image_layer::{ImageLayer, ImageLayerWriter};
pub use inmemory_layer::InMemoryLayer;
@@ -241,14 +241,10 @@ impl LayerAccessStats {
});
}
fn record_access(&self, access_kind: LayerAccessKind, ctx: &RequestContext) {
if ctx.access_stats_behavior() == AccessStatsBehavior::Skip {
return;
}
fn record_access(&self, access_kind: LayerAccessKind, task_kind: TaskKind) {
let this_access = LayerAccessStatFullDetails {
when: SystemTime::now(),
task_kind: ctx.task_kind(),
task_kind,
access_kind,
};
@@ -256,7 +252,7 @@ impl LayerAccessStats {
locked.iter_mut().for_each(|inner| {
inner.first_access.get_or_insert(this_access);
inner.count_by_access_kind[access_kind] += 1;
inner.task_kind_flag |= ctx.task_kind();
inner.task_kind_flag |= task_kind;
inner.last_accesses.write(this_access);
})
}
@@ -342,8 +338,7 @@ impl LayerAccessStats {
/// All layers should implement a minimal `std::fmt::Debug` without tenant or
/// timeline names, because those are known from the context in which the layers
/// are used (the timeline).
#[async_trait::async_trait]
pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync {
/// Range of keys that this layer covers
fn get_key_range(&self) -> Range<Key>;
@@ -373,7 +368,7 @@ pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
/// is available. If this returns ValueReconstructResult::Continue, look up
/// the predecessor layer and call again with the same 'reconstruct_data' to
/// collect more data.
async fn get_value_reconstruct_data(
fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
@@ -382,9 +377,15 @@ pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
) -> Result<ValueReconstructResult>;
/// Dump summary of the contents of the layer to stdout
async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
}
/// Returned by [`PersistentLayer::iter`]
pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i + Send>;
/// Returned by [`PersistentLayer::key_iter`]
pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i + Send>;
/// Get a layer descriptor from a layer.
pub trait AsLayerDesc {
/// Get the layer descriptor.
@@ -405,6 +406,16 @@ pub trait AsLayerDesc {
/// An image layer is a snapshot of all the data in a key-range, at a single
/// LSN.
pub trait PersistentLayer: Layer + AsLayerDesc {
/// Identify the tenant this layer belongs to
fn get_tenant_id(&self) -> TenantId {
self.layer_desc().tenant_id
}
/// Identify the timeline this layer belongs to
fn get_timeline_id(&self) -> TimelineId {
self.layer_desc().timeline_id
}
/// File name used for this layer, both in the pageserver's local filesystem
/// state as well as in the remote storage.
fn filename(&self) -> LayerFileName {
@@ -415,6 +426,15 @@ pub trait PersistentLayer: Layer + AsLayerDesc {
// `None` for `RemoteLayer`.
fn local_path(&self) -> Option<PathBuf>;
/// Iterate through all keys and values stored in the layer
fn iter(&self, ctx: &RequestContext) -> Result<LayerIter<'_>>;
/// Iterate through all keys stored in the layer. Returns key, lsn and value size.
/// It is used only for compaction and so is currently implemented only for DeltaLayer.
///
/// # Panics
///
/// This default implementation unconditionally panics with "Not implemented";
/// implementors that support key iteration must override it.
fn key_iter(&self, _ctx: &RequestContext) -> Result<LayerKeyIter<'_>> {
panic!("Not implemented")
}
/// Permanently remove this layer from disk.
fn delete_resident_layer_file(&self) -> Result<()>;
@@ -430,6 +450,14 @@ pub trait PersistentLayer: Layer + AsLayerDesc {
false
}
/// Returns the file size recorded in this layer's descriptor.
///
/// The return type is a plain `u64`, so a size is always reported.
///
/// Should not change over the lifetime of the layer object because
/// current_physical_size is computed as the sum of this value.
fn file_size(&self) -> u64 {
self.layer_desc().file_size
}
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo;
fn access_stats(&self) -> &LayerAccessStats;

View File

@@ -29,10 +29,10 @@
//!
use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::page_cache::PAGE_SZ;
use crate::page_cache::{PageReadGuard, PAGE_SZ};
use crate::repository::{Key, Value, KEY_SIZE};
use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader};
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
use crate::tenant::storage_layer::{
PersistentLayer, ValueReconstructResult, ValueReconstructState,
@@ -41,6 +41,7 @@ use crate::virtual_file::VirtualFile;
use crate::{walrecord, TEMP_FILE_SUFFIX};
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
use anyhow::{bail, ensure, Context, Result};
use once_cell::sync::OnceCell;
use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
use rand::{distributions::Alphanumeric, Rng};
use serde::{Deserialize, Serialize};
@@ -51,7 +52,6 @@ use std::ops::Range;
use std::os::unix::fs::FileExt;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use tokio::sync::OnceCell;
use tracing::*;
use utils::{
@@ -61,8 +61,8 @@ use utils::{
};
use super::{
AsLayerDesc, DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, PathOrConf,
PersistentLayerDesc,
AsLayerDesc, DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerIter,
LayerKeyIter, PathOrConf, PersistentLayerDesc,
};
///
@@ -90,30 +90,14 @@ pub struct Summary {
impl From<&DeltaLayer> for Summary {
fn from(layer: &DeltaLayer) -> Self {
Self::expected(
layer.desc.tenant_id,
layer.desc.timeline_id,
layer.desc.key_range.clone(),
layer.desc.lsn_range.clone(),
)
}
}
impl Summary {
pub(super) fn expected(
tenant_id: TenantId,
timeline_id: TimelineId,
keys: Range<Key>,
lsns: Range<Lsn>,
) -> Self {
Self {
magic: DELTA_FILE_MAGIC,
format_version: STORAGE_FORMAT_VERSION,
tenant_id,
timeline_id,
key_range: keys,
lsn_range: lsns,
tenant_id: layer.desc.tenant_id,
timeline_id: layer.desc.timeline_id,
key_range: layer.desc.key_range.clone(),
lsn_range: layer.desc.lsn_range.clone(),
index_start_blk: 0,
index_root_blk: 0,
@@ -124,10 +108,12 @@ impl Summary {
// Flag indicating that this version initialize the page
const WILL_INIT: u64 = 1;
///
/// Struct representing reference to BLOB in layers. Reference contains BLOB
/// offset, and for WAL records it also contains `will_init` flag. The flag
/// helps to determine the range of records that needs to be applied, without
/// reading/deserializing records themselves.
///
#[derive(Debug, Serialize, Deserialize, Copy, Clone)]
pub struct BlobRef(pub u64);
@@ -152,8 +138,10 @@ impl BlobRef {
pub const DELTA_KEY_SIZE: usize = KEY_SIZE + 8;
struct DeltaKey([u8; DELTA_KEY_SIZE]);
///
/// This is the key of the B-tree index stored in the delta layer. It consists
/// of the serialized representation of a Key and LSN.
///
impl DeltaKey {
fn from_slice(buf: &[u8]) -> Self {
let mut bytes: [u8; DELTA_KEY_SIZE] = [0u8; DELTA_KEY_SIZE];
@@ -176,6 +164,10 @@ impl DeltaKey {
Lsn(u64::from_be_bytes(self.0[KEY_SIZE..].try_into().unwrap()))
}
/// Builds a `Key` from the first `KEY_SIZE` bytes of `buf` via `Key::from_slice`.
///
/// # Panics
///
/// Panics if `buf` is shorter than `KEY_SIZE`, because the slice is taken by
/// direct indexing.
fn extract_key_from_buf(buf: &[u8]) -> Key {
Key::from_slice(&buf[..KEY_SIZE])
}
fn extract_lsn_from_buf(buf: &[u8]) -> Lsn {
let mut lsn_buf = [0u8; 8];
lsn_buf.copy_from_slice(&buf[KEY_SIZE..]);
@@ -197,7 +189,7 @@ pub struct DeltaLayer {
access_stats: LayerAccessStats,
inner: OnceCell<Arc<DeltaLayerInner>>,
inner: OnceCell<DeltaLayerInner>,
}
impl std::fmt::Debug for DeltaLayer {
@@ -222,12 +214,6 @@ pub struct DeltaLayerInner {
file: FileBlockReader<VirtualFile>,
}
impl AsRef<DeltaLayerInner> for DeltaLayerInner {
fn as_ref(&self) -> &DeltaLayerInner {
self
}
}
impl std::fmt::Debug for DeltaLayerInner {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("DeltaLayerInner")
@@ -237,10 +223,9 @@ impl std::fmt::Debug for DeltaLayerInner {
}
}
#[async_trait::async_trait]
impl Layer for DeltaLayer {
/// debugging function to print out the contents of the layer
async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
println!(
"----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} size {} ----",
self.desc.tenant_id,
@@ -256,7 +241,7 @@ impl Layer for DeltaLayer {
return Ok(());
}
let inner = self.load(LayerAccessKind::Dump, ctx).await?;
let inner = self.load(LayerAccessKind::Dump, ctx)?;
println!(
"index_start_blk: {}, root {}",
@@ -270,49 +255,52 @@ impl Layer for DeltaLayer {
file,
);
tree_reader.dump().await?;
tree_reader.dump()?;
let keys = DeltaLayerInner::load_keys(&Ref(&**inner)).await?;
let mut cursor = file.block_cursor();
// A subroutine to dump a single blob
let dump_blob = |val: ValueRef<_>| -> _ {
async move {
let buf = val.reader.read_blob(val.blob_ref.pos()).await?;
let val = Value::des(&buf)?;
let desc = match val {
Value::Image(img) => {
format!(" img {} bytes", img.len())
}
Value::WalRecord(rec) => {
let wal_desc = walrecord::describe_wal_record(&rec)?;
format!(
" rec {} bytes will_init: {} {}",
buf.len(),
rec.will_init(),
wal_desc
)
}
};
Ok(desc)
}
};
for entry in keys {
let DeltaEntry { key, lsn, val, .. } = entry;
let desc = match dump_blob(val).await {
Ok(desc) => desc,
Err(err) => {
let err: anyhow::Error = err;
format!("ERROR: {err}")
let mut dump_blob = |blob_ref: BlobRef| -> anyhow::Result<String> {
let buf = cursor.read_blob(blob_ref.pos())?;
let val = Value::des(&buf)?;
let desc = match val {
Value::Image(img) => {
format!(" img {} bytes", img.len())
}
Value::WalRecord(rec) => {
let wal_desc = walrecord::describe_wal_record(&rec)?;
format!(
" rec {} bytes will_init: {} {}",
buf.len(),
rec.will_init(),
wal_desc
)
}
};
println!(" key {key} at {lsn}: {desc}");
}
Ok(desc)
};
tree_reader.visit(
&[0u8; DELTA_KEY_SIZE],
VisitDirection::Forwards,
|delta_key, val| {
let blob_ref = BlobRef(val);
let key = DeltaKey::extract_key_from_buf(delta_key);
let lsn = DeltaKey::extract_lsn_from_buf(delta_key);
let desc = match dump_blob(blob_ref) {
Ok(desc) => desc,
Err(err) => format!("ERROR: {}", err),
};
println!(" key {} at {}: {}", key, lsn, desc);
true
},
)?;
Ok(())
}
async fn get_value_reconstruct_data(
fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
@@ -320,15 +308,82 @@ impl Layer for DeltaLayer {
ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
ensure!(lsn_range.start >= self.desc.lsn_range.start);
let mut need_image = true;
ensure!(self.desc.key_range.contains(&key));
let inner = self
.load(LayerAccessKind::GetValueReconstructData, ctx)
.await?;
inner
.get_value_reconstruct_data(key, lsn_range, reconstruct_state)
.await
{
// Open the file and lock the metadata in memory
let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;
// Scan the page versions backwards, starting from `lsn`.
let file = &inner.file;
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
inner.index_start_blk,
inner.index_root_blk,
file,
);
let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1));
let mut offsets: Vec<(Lsn, u64)> = Vec::new();
tree_reader.visit(&search_key.0, VisitDirection::Backwards, |key, value| {
let blob_ref = BlobRef(value);
if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
return false;
}
let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
if entry_lsn < lsn_range.start {
return false;
}
offsets.push((entry_lsn, blob_ref.pos()));
!blob_ref.will_init()
})?;
// Ok, 'offsets' now contains the offsets of all the entries we need to read
let mut cursor = file.block_cursor();
let mut buf = Vec::new();
for (entry_lsn, pos) in offsets {
cursor.read_blob_into_buf(pos, &mut buf).with_context(|| {
format!(
"Failed to read blob from virtual file {}",
file.file.path.display()
)
})?;
let val = Value::des(&buf).with_context(|| {
format!(
"Failed to deserialize file blob from virtual file {}",
file.file.path.display()
)
})?;
match val {
Value::Image(img) => {
reconstruct_state.img = Some((entry_lsn, img));
need_image = false;
break;
}
Value::WalRecord(rec) => {
let will_init = rec.will_init();
reconstruct_state.records.push((entry_lsn, rec));
if will_init {
// This WAL record initializes the page, so no need to go further back
need_image = false;
break;
}
}
}
}
// release metadata lock and close the file
}
// If an older page image is needed to reconstruct the page, let the
// caller know.
if need_image {
Ok(ValueReconstructResult::Continue)
} else {
Ok(ValueReconstructResult::Complete)
}
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
@@ -368,6 +423,23 @@ impl PersistentLayer for DeltaLayer {
Some(self.path())
}
/// Returns a boxed iterator over the layer, built by `DeltaValueIter::new`.
///
/// The layer is loaded first via `self.load` with `LayerAccessKind::KeyIter`.
/// The two failure modes are reported differently:
/// - if loading fails, the error is returned from this call with the
///   context "load delta layer";
/// - if `DeltaValueIter::new` fails, this call still returns `Ok`, and the
///   returned iterator yields that error as its single item.
fn iter(&self, ctx: &RequestContext) -> Result<LayerIter<'_>> {
let inner = self
.load(LayerAccessKind::KeyIter, ctx)
.context("load delta layer")?;
Ok(match DeltaValueIter::new(inner) {
Ok(iter) => Box::new(iter),
// Defer the construction error to the first `next()` call instead of failing here.
Err(err) => Box::new(std::iter::once(Err(err))),
})
}
/// Returns a boxed iterator built by `DeltaKeyIter::new` over the loaded layer.
///
/// The layer is loaded via `self.load` with `LayerAccessKind::KeyIter`; a load
/// failure is propagated as-is. A failure in `DeltaKeyIter::new` is returned
/// from this call with the context "Layer index is corrupted".
fn key_iter(&self, ctx: &RequestContext) -> Result<LayerKeyIter<'_>> {
let inner = self.load(LayerAccessKind::KeyIter, ctx)?;
Ok(Box::new(
DeltaKeyIter::new(inner).context("Layer index is corrupted")?,
))
}
fn delete_resident_layer_file(&self) -> Result<()> {
// delete underlying file
fs::remove_file(self.path())?;
@@ -437,43 +509,55 @@ impl DeltaLayer {
/// Open the underlying file and read the metadata into memory, if it's
/// not loaded already.
///
async fn load(
&self,
access_kind: LayerAccessKind,
ctx: &RequestContext,
) -> Result<&Arc<DeltaLayerInner>> {
self.access_stats.record_access(access_kind, ctx);
fn load(&self, access_kind: LayerAccessKind, ctx: &RequestContext) -> Result<&DeltaLayerInner> {
self.access_stats
.record_access(access_kind, ctx.task_kind());
// Quick exit if already loaded
self.inner
.get_or_try_init(|| self.load_inner())
.await
.with_context(|| format!("Failed to load delta layer {}", self.path().display()))
}
async fn load_inner(&self) -> Result<Arc<DeltaLayerInner>> {
fn load_inner(&self) -> Result<DeltaLayerInner> {
let path = self.path();
let summary = match &self.path_or_conf {
PathOrConf::Conf(_) => Some(Summary::from(self)),
PathOrConf::Path(_) => None,
};
let file = VirtualFile::open(&path)
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
let file = FileBlockReader::new(file);
let loaded = DeltaLayerInner::load(&path, summary)?;
let summary_blk = file.read_blk(0)?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
if let PathOrConf::Path(ref path) = self.path_or_conf {
// not production code
match &self.path_or_conf {
PathOrConf::Conf(_) => {
let mut expected_summary = Summary::from(self);
expected_summary.index_start_blk = actual_summary.index_start_blk;
expected_summary.index_root_blk = actual_summary.index_root_blk;
if actual_summary != expected_summary {
bail!("in-file summary does not match expected summary. actual = {:?} expected = {:?}", actual_summary, expected_summary);
}
}
PathOrConf::Path(path) => {
let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
let expected_filename = self.filename().file_name();
let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
let expected_filename = self.filename().file_name();
if actual_filename != expected_filename {
println!("warning: filename does not match what is expected from in-file summary");
println!("actual: {:?}", actual_filename);
println!("expected: {:?}", expected_filename);
if actual_filename != expected_filename {
println!(
"warning: filename does not match what is expected from in-file summary"
);
println!("actual: {:?}", actual_filename);
println!("expected: {:?}", expected_filename);
}
}
}
Ok(Arc::new(loaded))
debug!("loaded from {}", &path.display());
Ok(DeltaLayerInner {
file,
index_start_blk: actual_summary.index_start_blk,
index_root_blk: actual_summary.index_root_blk,
})
}
/// Create a DeltaLayer struct representing an existing file on disk.
@@ -495,7 +579,7 @@ impl DeltaLayer {
file_size,
),
access_stats,
inner: OnceCell::new(),
inner: once_cell::sync::OnceCell::new(),
}
}
@@ -522,7 +606,7 @@ impl DeltaLayer {
metadata.len(),
),
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
inner: OnceCell::new(),
inner: once_cell::sync::OnceCell::new(),
})
}
@@ -538,23 +622,6 @@ impl DeltaLayer {
&self.layer_name(),
)
}
/// Loads all keys stored in the layer. Returns key, lsn, value size and value reference.
///
/// The value can be obtained via the [`ValueRef::load`] function.
pub(crate) async fn load_keys(
&self,
ctx: &RequestContext,
) -> Result<Vec<DeltaEntry<Ref<&'_ DeltaLayerInner>>>> {
let inner = self
.load(LayerAccessKind::KeyIter, ctx)
.await
.context("load delta layer keys")?;
let inner = Ref(&**inner);
DeltaLayerInner::load_keys(&inner)
.await
.context("Layer index is corrupted")
}
}
/// A builder object for constructing a new delta layer.
@@ -690,17 +757,6 @@ impl DeltaLayerWriterInner {
.metadata()
.context("get file metadata to determine size")?;
// 5GB limit for objects without multipart upload (which we don't want to use)
// Make it a little bit below to account for differing GB units
// https://docs.aws.amazon.com/AmazonS3/latest/userguide/upload-objects.html
const S3_UPLOAD_LIMIT: u64 = 4_500_000_000;
ensure!(
metadata.len() <= S3_UPLOAD_LIMIT,
"Created delta layer file at {} of size {} above limit {S3_UPLOAD_LIMIT}!",
file.path.display(),
metadata.len()
);
// Note: Because we opened the file in write-only mode, we cannot
// reuse the same VirtualFile for reading later. That's why we don't
// set inner.file here. The first read will have to re-open it.
@@ -714,7 +770,7 @@ impl DeltaLayerWriterInner {
metadata.len(),
),
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
inner: OnceCell::new(),
inner: once_cell::sync::OnceCell::new(),
};
// fsync the file
@@ -836,213 +892,168 @@ impl Drop for DeltaLayerWriter {
}
}
impl DeltaLayerInner {
pub(super) fn load(path: &std::path::Path, summary: Option<Summary>) -> anyhow::Result<Self> {
let file = VirtualFile::open(path)
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
let file = FileBlockReader::new(file);
///
/// Iterator over all key-value pairs stored in a delta layer.
///
/// Yields `(Key, Lsn, Value)` tuples in the order the layer's index is
/// visited (forwards, starting from the all-zeros key).
///
/// FIXME: This creates a Vector to hold the offsets of all key value pairs.
/// That takes up quite a lot of memory. Should do this in a more streaming
/// fashion.
///
struct DeltaValueIter<'a> {
// Every (delta key, blob reference) pair found in the layer's index, in visit order.
all_offsets: Vec<(DeltaKey, BlobRef)>,
// Position in `all_offsets` of the entry the next call will yield.
next_idx: usize,
// Cursor used to read each value blob at the position given by its `BlobRef`.
reader: BlockCursor<Adapter<'a>>,
}
let summary_blk = file.read_blk(0)?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
struct Adapter<'a>(&'a DeltaLayerInner);
if let Some(mut expected_summary) = summary {
// production code path
expected_summary.index_start_blk = actual_summary.index_start_blk;
expected_summary.index_root_blk = actual_summary.index_root_blk;
if actual_summary != expected_summary {
bail!(
"in-file summary does not match expected summary. actual = {:?} expected = {:?}",
actual_summary,
expected_summary
);
}
}
impl<'a> BlockReader for Adapter<'a> {
type BlockLease = PageReadGuard<'static>;
Ok(DeltaLayerInner {
file,
index_start_blk: actual_summary.index_start_blk,
index_root_blk: actual_summary.index_root_blk,
})
fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
self.0.file.read_blk(blknum)
}
}
pub(super) async fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
reconstruct_state: &mut ValueReconstructState,
) -> anyhow::Result<ValueReconstructResult> {
let mut need_image = true;
// Scan the page versions backwards, starting from `lsn`.
let file = &self.file;
impl<'a> Iterator for DeltaValueIter<'a> {
    type Item = Result<(Key, Lsn, Value)>;

    /// Adapts the fallible `next_res` to the `Iterator` contract:
    /// a read or deserialization error is yielded as an `Err` item, and
    /// exhaustion of the entries ends the iteration.
    fn next(&mut self) -> Option<Self::Item> {
        match self.next_res() {
            Ok(Some(entry)) => Some(Ok(entry)),
            Ok(None) => None,
            Err(err) => Some(Err(err)),
        }
    }
}
impl<'a> DeltaValueIter<'a> {
fn new(inner: &'a DeltaLayerInner) -> Result<Self> {
let file = &inner.file;
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
self.index_start_blk,
self.index_root_blk,
inner.index_start_blk,
inner.index_root_blk,
file,
);
let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1));
let mut offsets: Vec<(Lsn, u64)> = Vec::new();
let mut all_offsets: Vec<(DeltaKey, BlobRef)> = Vec::new();
tree_reader.visit(
&[0u8; DELTA_KEY_SIZE],
VisitDirection::Forwards,
|key, value| {
all_offsets.push((DeltaKey::from_slice(key), BlobRef(value)));
true
},
)?;
tree_reader
.visit(&search_key.0, VisitDirection::Backwards, |key, value| {
let blob_ref = BlobRef(value);
if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
return false;
}
let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
if entry_lsn < lsn_range.start {
return false;
}
offsets.push((entry_lsn, blob_ref.pos()));
let iter = DeltaValueIter {
all_offsets,
next_idx: 0,
reader: BlockCursor::new(Adapter(inner)),
};
!blob_ref.will_init()
})
.await?;
Ok(iter)
}
// Ok, 'offsets' now contains the offsets of all the entries we need to read
let cursor = file.block_cursor();
let mut buf = Vec::new();
for (entry_lsn, pos) in offsets {
cursor
.read_blob_into_buf(pos, &mut buf)
.await
.with_context(|| {
format!(
"Failed to read blob from virtual file {}",
file.file.path.display()
)
})?;
let val = Value::des(&buf).with_context(|| {
format!(
"Failed to deserialize file blob from virtual file {}",
file.file.path.display()
)
})?;
match val {
Value::Image(img) => {
reconstruct_state.img = Some((entry_lsn, img));
need_image = false;
break;
}
Value::WalRecord(rec) => {
let will_init = rec.will_init();
reconstruct_state.records.push((entry_lsn, rec));
if will_init {
// This WAL record initializes the page, so no need to go further back
need_image = false;
break;
}
}
}
}
fn next_res(&mut self) -> Result<Option<(Key, Lsn, Value)>> {
if self.next_idx < self.all_offsets.len() {
let (delta_key, blob_ref) = &self.all_offsets[self.next_idx];
// If an older page image is needed to reconstruct the page, let the
// caller know.
if need_image {
Ok(ValueReconstructResult::Continue)
let key = delta_key.key();
let lsn = delta_key.lsn();
let buf = self.reader.read_blob(blob_ref.pos())?;
let val = Value::des(&buf)?;
self.next_idx += 1;
Ok(Some((key, lsn, val)))
} else {
Ok(ValueReconstructResult::Complete)
Ok(None)
}
}
pub(super) async fn load_keys<T: AsRef<DeltaLayerInner> + Clone>(
this: &T,
) -> Result<Vec<DeltaEntry<T>>> {
let dl = this.as_ref();
let file = &dl.file;
let tree_reader =
DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(dl.index_start_blk, dl.index_root_blk, file);
let mut all_keys: Vec<DeltaEntry<T>> = Vec::new();
tree_reader
.visit(
&[0u8; DELTA_KEY_SIZE],
VisitDirection::Forwards,
|key, value| {
let delta_key = DeltaKey::from_slice(key);
let val_ref = ValueRef {
blob_ref: BlobRef(value),
reader: BlockCursor::new(Adapter(this.clone())),
};
let pos = BlobRef(value).pos();
if let Some(last) = all_keys.last_mut() {
// subtract offset of the current and last entries to get the size
// of the value associated with this (key, lsn) tuple
let first_pos = last.size;
last.size = pos - first_pos;
}
let entry = DeltaEntry {
key: delta_key.key(),
lsn: delta_key.lsn(),
size: pos,
val: val_ref,
};
all_keys.push(entry);
true
},
)
.await?;
if let Some(last) = all_keys.last_mut() {
// Last key occupies all space till end of value storage,
// which corresponds to beginning of the index
last.size = dl.index_start_blk as u64 * PAGE_SZ as u64 - last.size;
}
Ok(all_keys)
}
}
/// Cloneable borrow wrapper to make borrows behave like smart pointers.
///
/// Shared references are trivially copyable. This wrapper avoids (confusion) to otherwise attempt
/// cloning DeltaLayerInner.
pub(crate) struct Ref<T>(T);
/// Iterator over all keys stored in a delta layer
///
/// Yields one `(Key, Lsn, u64)` item per distinct key; the `u64` is the
/// number of bytes attributed to the values of that key.
///
/// FIXME: This creates a Vector to hold all keys.
/// That takes up quite a lot of memory. Should do this in a more streaming
/// fashion.
///
struct DeltaKeyIter {
// One entry per distinct key: the first delta key seen for it, and the size
// in bytes attributed to that key's values.
all_keys: Vec<(DeltaKey, u64)>,
// Position in `all_keys` of the entry the next call will yield.
next_idx: usize,
}
impl<'a, T> AsRef<T> for Ref<&'a T> {
fn as_ref(&self) -> &T {
self.0
impl Iterator for DeltaKeyIter {
    type Item = (Key, Lsn, u64);

    /// Yields the next collected key with its LSN and size, or `None` once
    /// all collected entries have been returned.
    fn next(&mut self) -> Option<Self::Item> {
        // `get` returns `None` past the end, which ends the iteration.
        let (delta_key, size) = self.all_keys.get(self.next_idx)?;
        let item = (delta_key.key(), delta_key.lsn(), *size);
        self.next_idx += 1;
        Some(item)
    }
}
impl<'a, T> Clone for Ref<&'a T> {
fn clone(&self) -> Self {
*self
impl<'a> DeltaKeyIter {
fn new(inner: &'a DeltaLayerInner) -> Result<Self> {
let file = &inner.file;
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
inner.index_start_blk,
inner.index_root_blk,
file,
);
let mut all_keys: Vec<(DeltaKey, u64)> = Vec::new();
tree_reader.visit(
&[0u8; DELTA_KEY_SIZE],
VisitDirection::Forwards,
|key, value| {
let delta_key = DeltaKey::from_slice(key);
let pos = BlobRef(value).pos();
if let Some(last) = all_keys.last_mut() {
if last.0.key() == delta_key.key() {
return true;
} else {
// subtract offset of new key BLOB and first blob of this key
// to get total size if values associated with this key
let first_pos = last.1;
last.1 = pos - first_pos;
}
}
all_keys.push((delta_key, pos));
true
},
)?;
if let Some(last) = all_keys.last_mut() {
// Last key occupies all space till end of layer
last.1 = std::fs::metadata(&file.file.path)?.len() - last.1;
}
let iter = DeltaKeyIter {
all_keys,
next_idx: 0,
};
Ok(iter)
}
}
impl<'a, T> Copy for Ref<&'a T> {}
#[cfg(test)]
mod test {
use super::DeltaKeyIter;
use super::DeltaLayer;
use super::DeltaValueIter;
/// A set of data associated with a delta layer key and its value
pub struct DeltaEntry<T: AsRef<DeltaLayerInner>> {
pub key: Key,
pub lsn: Lsn,
/// Size of the stored value
pub size: u64,
/// Reference to the on-disk value
pub val: ValueRef<T>,
}
/// Reference to an on-disk value
pub struct ValueRef<T: AsRef<DeltaLayerInner>> {
blob_ref: BlobRef,
reader: BlockCursor<Adapter<T>>,
}
impl<T: AsRef<DeltaLayerInner>> ValueRef<T> {
/// Loads the value from disk
pub async fn load(&self) -> Result<Value> {
// theoretically we *could* record an access time for each, but it does not really matter
let buf = self.reader.read_blob(self.blob_ref.pos()).await?;
let val = Value::des(&buf)?;
Ok(val)
}
}
struct Adapter<T: AsRef<DeltaLayerInner>>(T);
impl<T: AsRef<DeltaLayerInner>> BlockReader for Adapter<T> {
fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
self.0.as_ref().file.read_blk(blknum)
// We will soon need the iters to be send in the compaction code.
// Cf https://github.com/neondatabase/neon/pull/4462#issuecomment-1587398883
// Cf https://github.com/neondatabase/neon/issues/4471
#[test]
fn is_send() {
fn assert_send<T: Send>() {}
assert_send::<DeltaLayer>();
assert_send::<DeltaValueIter>();
assert_send::<DeltaKeyIter>();
}
}

View File

@@ -27,7 +27,7 @@ use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::page_cache::PAGE_SZ;
use crate::repository::{Key, KEY_SIZE};
use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
use crate::tenant::storage_layer::{
@@ -47,7 +47,7 @@ use std::io::{Seek, SeekFrom};
use std::ops::Range;
use std::os::unix::prelude::FileExt;
use std::path::{Path, PathBuf};
use tokio::sync::OnceCell;
use std::sync::{RwLock, RwLockReadGuard};
use tracing::*;
use utils::{
@@ -57,7 +57,9 @@ use utils::{
};
use super::filename::ImageFileName;
use super::{AsLayerDesc, Layer, LayerAccessStatsReset, PathOrConf, PersistentLayerDesc};
use super::{
AsLayerDesc, Layer, LayerAccessStatsReset, LayerIter, PathOrConf, PersistentLayerDesc,
};
///
/// Header stored in the beginning of the file
@@ -66,7 +68,7 @@ use super::{AsLayerDesc, Layer, LayerAccessStatsReset, PathOrConf, PersistentLay
/// the 'index' starts at the block indicated by 'index_start_blk'
///
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
pub(super) struct Summary {
struct Summary {
/// Magic value to identify this as a neon image file. Always IMAGE_FILE_MAGIC.
magic: u16,
format_version: u16,
@@ -85,29 +87,13 @@ pub(super) struct Summary {
impl From<&ImageLayer> for Summary {
fn from(layer: &ImageLayer) -> Self {
Self::expected(
layer.desc.tenant_id,
layer.desc.timeline_id,
layer.desc.key_range.clone(),
layer.lsn,
)
}
}
impl Summary {
pub(super) fn expected(
tenant_id: TenantId,
timeline_id: TimelineId,
key_range: Range<Key>,
lsn: Lsn,
) -> Self {
Self {
magic: IMAGE_FILE_MAGIC,
format_version: STORAGE_FORMAT_VERSION,
tenant_id,
timeline_id,
key_range,
lsn,
tenant_id: layer.desc.tenant_id,
timeline_id: layer.desc.timeline_id,
key_range: layer.desc.key_range.clone(),
lsn: layer.lsn,
index_start_blk: 0,
index_root_blk: 0,
@@ -131,7 +117,7 @@ pub struct ImageLayer {
access_stats: LayerAccessStats,
inner: OnceCell<ImageLayerInner>,
inner: RwLock<ImageLayerInner>,
}
impl std::fmt::Debug for ImageLayer {
@@ -148,29 +134,30 @@ impl std::fmt::Debug for ImageLayer {
}
pub struct ImageLayerInner {
/// If false, the 'index' has not been loaded into memory yet.
loaded: bool,
// values copied from summary
index_start_blk: u32,
index_root_blk: u32,
lsn: Lsn,
/// Reader object for reading blocks from the file.
file: FileBlockReader<VirtualFile>,
/// Reader object for reading blocks from the file. (None if not loaded yet)
file: Option<FileBlockReader<VirtualFile>>,
}
// Debug output for `ImageLayerInner`: prints the `loaded` flag and the two
// index block numbers. The `file` reader is not included in the output.
impl std::fmt::Debug for ImageLayerInner {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("ImageLayerInner")
.field("loaded", &self.loaded)
.field("index_start_blk", &self.index_start_blk)
.field("index_root_blk", &self.index_root_blk)
.finish()
}
}
#[async_trait::async_trait]
impl Layer for ImageLayer {
/// debugging function to print out the contents of the layer
async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
println!(
"----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
self.desc.tenant_id,
@@ -186,25 +173,23 @@ impl Layer for ImageLayer {
return Ok(());
}
let inner = self.load(LayerAccessKind::Dump, ctx).await?;
let file = &inner.file;
let inner = self.load(LayerAccessKind::Dump, ctx)?;
let file = inner.file.as_ref().unwrap();
let tree_reader =
DiskBtreeReader::<_, KEY_SIZE>::new(inner.index_start_blk, inner.index_root_blk, file);
tree_reader.dump().await?;
tree_reader.dump()?;
tree_reader
.visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| {
println!("key: {} offset {}", hex::encode(key), value);
true
})
.await?;
tree_reader.visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| {
println!("key: {} offset {}", hex::encode(key), value);
true
})?;
Ok(())
}
/// Look up given page in the file
async fn get_value_reconstruct_data(
fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
@@ -215,14 +200,28 @@ impl Layer for ImageLayer {
assert!(lsn_range.start >= self.lsn);
assert!(lsn_range.end >= self.lsn);
let inner = self
.load(LayerAccessKind::GetValueReconstructData, ctx)
.await?;
inner
.get_value_reconstruct_data(key, reconstruct_state)
.await
// FIXME: makes no sense to dump paths
.with_context(|| format!("read {}", self.path().display()))
let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;
let file = inner.file.as_ref().unwrap();
let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file);
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
key.write_to_byte_slice(&mut keybuf);
if let Some(offset) = tree_reader.get(&keybuf)? {
let blob = file.block_cursor().read_blob(offset).with_context(|| {
format!(
"failed to read value from data file {} at offset {}",
self.path().display(),
offset
)
})?;
let value = Bytes::from(blob);
reconstruct_state.img = Some((self.lsn, value));
Ok(ValueReconstructResult::Complete)
} else {
Ok(ValueReconstructResult::Missing)
}
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
@@ -259,6 +258,10 @@ impl PersistentLayer for ImageLayer {
Some(self.path())
}
/// Value iteration is not implemented for image layers: calling this
/// panics via `unimplemented!()`. The context argument is unused.
fn iter(&self, _ctx: &RequestContext) -> Result<LayerIter<'_>> {
unimplemented!();
}
fn delete_resident_layer_file(&self) -> Result<()> {
// delete underlying file
fs::remove_file(self.path())?;
@@ -318,41 +321,83 @@ impl ImageLayer {
/// Open the underlying file and read the metadata into memory, if it's
/// not loaded already.
///
async fn load(
fn load(
&self,
access_kind: LayerAccessKind,
ctx: &RequestContext,
) -> Result<&ImageLayerInner> {
self.access_stats.record_access(access_kind, ctx);
self.inner
.get_or_try_init(|| self.load_inner())
.await
.with_context(|| format!("Failed to load image layer {}", self.path().display()))
) -> Result<RwLockReadGuard<ImageLayerInner>> {
self.access_stats
.record_access(access_kind, ctx.task_kind());
loop {
// Quick exit if already loaded
let inner = self.inner.read().unwrap();
if inner.loaded {
return Ok(inner);
}
// Need to open the file and load the metadata. Upgrade our lock to
// a write lock. (Or rather, release and re-lock in write mode.)
drop(inner);
let mut inner = self.inner.write().unwrap();
if !inner.loaded {
self.load_inner(&mut inner).with_context(|| {
format!("Failed to load image layer {}", self.path().display())
})?
} else {
// Another thread loaded it while we were not holding the lock.
}
// We now have the file open and loaded. There's no function to do
// that in the std library RwLock, so we have to release and re-lock
// in read mode. (To be precise, the lock guard was moved in the
// above call to `load_inner`, so it's already been released). And
// while we do that, another thread could unload again, so we have
// to re-check and retry if that happens.
drop(inner);
}
}
async fn load_inner(&self) -> Result<ImageLayerInner> {
fn load_inner(&self, inner: &mut ImageLayerInner) -> Result<()> {
let path = self.path();
let expected_summary = match &self.path_or_conf {
PathOrConf::Conf(_) => Some(Summary::from(self)),
PathOrConf::Path(_) => None,
};
// Open the file if it's not open already.
if inner.file.is_none() {
let file = VirtualFile::open(&path)
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
inner.file = Some(FileBlockReader::new(file));
}
let file = inner.file.as_mut().unwrap();
let summary_blk = file.read_blk(0)?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), expected_summary)?;
match &self.path_or_conf {
PathOrConf::Conf(_) => {
let mut expected_summary = Summary::from(self);
expected_summary.index_start_blk = actual_summary.index_start_blk;
expected_summary.index_root_blk = actual_summary.index_root_blk;
if let PathOrConf::Path(ref path) = self.path_or_conf {
// not production code
let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
let expected_filename = self.filename().file_name();
if actual_summary != expected_summary {
bail!("in-file summary does not match expected summary. actual = {:?} expected = {:?}", actual_summary, expected_summary);
}
}
PathOrConf::Path(path) => {
let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
let expected_filename = self.filename().file_name();
if actual_filename != expected_filename {
println!("warning: filename does not match what is expected from in-file summary");
println!("actual: {:?}", actual_filename);
println!("expected: {:?}", expected_filename);
if actual_filename != expected_filename {
println!(
"warning: filename does not match what is expected from in-file summary"
);
println!("actual: {:?}", actual_filename);
println!("expected: {:?}", expected_filename);
}
}
}
Ok(loaded)
inner.index_start_blk = actual_summary.index_start_blk;
inner.index_root_blk = actual_summary.index_root_blk;
inner.loaded = true;
Ok(())
}
/// Create an ImageLayer struct representing an existing file on disk
@@ -376,7 +421,12 @@ impl ImageLayer {
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
lsn: filename.lsn,
access_stats,
inner: OnceCell::new(),
inner: RwLock::new(ImageLayerInner {
loaded: false,
file: None,
index_start_blk: 0,
index_root_blk: 0,
}),
}
}
@@ -403,7 +453,12 @@ impl ImageLayer {
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
lsn: summary.lsn,
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
inner: OnceCell::new(),
inner: RwLock::new(ImageLayerInner {
file: None,
loaded: false,
index_start_blk: 0,
index_root_blk: 0,
}),
})
}
@@ -422,66 +477,6 @@ impl ImageLayer {
}
}
impl ImageLayerInner {
pub(super) fn load(
path: &std::path::Path,
lsn: Lsn,
summary: Option<Summary>,
) -> anyhow::Result<Self> {
let file = VirtualFile::open(path)
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
let file = FileBlockReader::new(file);
let summary_blk = file.read_blk(0)?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
if let Some(mut expected_summary) = summary {
// production code path
expected_summary.index_start_blk = actual_summary.index_start_blk;
expected_summary.index_root_blk = actual_summary.index_root_blk;
if actual_summary != expected_summary {
bail!(
"in-file summary does not match expected summary. actual = {:?} expected = {:?}",
actual_summary,
expected_summary
);
}
}
Ok(ImageLayerInner {
index_start_blk: actual_summary.index_start_blk,
index_root_blk: actual_summary.index_root_blk,
lsn,
file,
})
}
pub(super) async fn get_value_reconstruct_data(
&self,
key: Key,
reconstruct_state: &mut ValueReconstructState,
) -> anyhow::Result<ValueReconstructResult> {
let file = &self.file;
let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file);
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
key.write_to_byte_slice(&mut keybuf);
if let Some(offset) = tree_reader.get(&keybuf).await? {
let blob = file
.block_cursor()
.read_blob(offset)
.await
.with_context(|| format!("failed to read value from offset {}", offset))?;
let value = Bytes::from(blob);
reconstruct_state.img = Some((self.lsn, value));
Ok(ValueReconstructResult::Complete)
} else {
Ok(ValueReconstructResult::Missing)
}
}
}
/// A builder object for constructing a new image layer.
///
/// Usage:
@@ -624,7 +619,12 @@ impl ImageLayerWriterInner {
desc,
lsn: self.lsn,
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
inner: OnceCell::new(),
inner: RwLock::new(ImageLayerInner {
loaded: false,
file: None,
index_start_blk,
index_root_blk,
}),
};
// fsync the file

View File

@@ -7,7 +7,7 @@
use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::repository::{Key, Value};
use crate::tenant::blob_io::BlobWriter;
use crate::tenant::blob_io::{BlobCursor, BlobWriter};
use crate::tenant::block_io::BlockReader;
use crate::tenant::ephemeral_file::EphemeralFile;
use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
@@ -16,7 +16,6 @@ use anyhow::{ensure, Result};
use pageserver_api::models::InMemoryLayerInfo;
use std::cell::RefCell;
use std::collections::HashMap;
use std::sync::OnceLock;
use tracing::*;
use utils::{
bin_ser::BeSer,
@@ -28,7 +27,7 @@ use utils::{
// while being able to use std::fmt::Write's methods
use std::fmt::Write as _;
use std::ops::Range;
use tokio::sync::RwLock;
use std::sync::RwLock;
use super::{DeltaLayer, DeltaLayerWriter, Layer};
@@ -43,16 +42,14 @@ pub struct InMemoryLayer {
tenant_id: TenantId,
timeline_id: TimelineId,
///
/// This layer contains all the changes from 'start_lsn'. The
/// start is inclusive.
///
start_lsn: Lsn,
/// Frozen layers have an exclusive end LSN.
/// Writes are only allowed when this is `None`.
end_lsn: OnceLock<Lsn>,
/// The above fields never change, except for `end_lsn`, which is only set once.
/// All other changing parts are in `inner`, and protected by a mutex.
/// The above fields never change. The parts that do change are in 'inner',
/// and are protected by an `RwLock`.
inner: RwLock<InMemoryLayerInner>,
}
@@ -60,16 +57,21 @@ impl std::fmt::Debug for InMemoryLayer {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("InMemoryLayer")
.field("start_lsn", &self.start_lsn)
.field("end_lsn", &self.end_lsn)
.field("inner", &self.inner)
.finish()
}
}
pub struct InMemoryLayerInner {
/// Frozen layers have an exclusive end LSN.
/// Writes are only allowed when this is None
end_lsn: Option<Lsn>,
///
/// All versions of all pages in the layer are kept here. Indexed
/// by block number and LSN. The value is an offset into the
/// ephemeral file where the page version is stored.
///
index: HashMap<Key, VecMap<Lsn, u64>>,
/// The values are stored in a serialized format in this file.
@@ -80,7 +82,15 @@ pub struct InMemoryLayerInner {
impl std::fmt::Debug for InMemoryLayerInner {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("InMemoryLayerInner").finish()
f.debug_struct("InMemoryLayerInner")
.field("end_lsn", &self.end_lsn)
.finish()
}
}
impl InMemoryLayerInner {
/// Panics if `end_lsn` has already been set, i.e. the layer has been
/// frozen and is no longer open for writes.
fn assert_writeable(&self) {
assert!(self.end_lsn.is_none());
}
}
@@ -91,31 +101,29 @@ impl InMemoryLayer {
pub fn info(&self) -> InMemoryLayerInfo {
let lsn_start = self.start_lsn;
let lsn_end = self.inner.read().unwrap().end_lsn;
if let Some(&lsn_end) = self.end_lsn.get() {
InMemoryLayerInfo::Frozen { lsn_start, lsn_end }
} else {
InMemoryLayerInfo::Open { lsn_start }
match lsn_end {
Some(lsn_end) => InMemoryLayerInfo::Frozen { lsn_start, lsn_end },
None => InMemoryLayerInfo::Open { lsn_start },
}
}
fn assert_writable(&self) {
assert!(self.end_lsn.get().is_none());
}
fn end_lsn_or_max(&self) -> Lsn {
self.end_lsn.get().copied().unwrap_or(Lsn::MAX)
}
}
#[async_trait::async_trait]
impl Layer for InMemoryLayer {
fn get_key_range(&self) -> Range<Key> {
Key::MIN..Key::MAX
}
fn get_lsn_range(&self) -> Range<Lsn> {
self.start_lsn..self.end_lsn_or_max()
let inner = self.inner.read().unwrap();
let end_lsn = if let Some(end_lsn) = inner.end_lsn {
end_lsn
} else {
Lsn(u64::MAX)
};
self.start_lsn..end_lsn
}
fn is_incremental(&self) -> bool {
@@ -124,10 +132,14 @@ impl Layer for InMemoryLayer {
}
/// debugging function to print out the contents of the layer
async fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
let inner = self.inner.read().await;
fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
let inner = self.inner.read().unwrap();
let end_str = self.end_lsn_or_max();
let end_str = inner
.end_lsn
.as_ref()
.map(Lsn::to_string)
.unwrap_or_default();
println!(
"----- in-memory layer for tli {} LSNs {}-{} ----",
@@ -138,12 +150,12 @@ impl Layer for InMemoryLayer {
return Ok(());
}
let cursor = inner.file.block_cursor();
let mut cursor = inner.file.block_cursor();
let mut buf = Vec::new();
for (key, vec_map) in inner.index.iter() {
for (lsn, pos) in vec_map.as_slice() {
let mut desc = String::new();
cursor.read_blob_into_buf(*pos, &mut buf).await?;
cursor.read_blob_into_buf(*pos, &mut buf)?;
let val = Value::des(&buf);
match val {
Ok(Value::Image(img)) => {
@@ -171,7 +183,7 @@ impl Layer for InMemoryLayer {
}
/// Look up given value in the layer.
async fn get_value_reconstruct_data(
fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
@@ -181,15 +193,15 @@ impl Layer for InMemoryLayer {
ensure!(lsn_range.start >= self.start_lsn);
let mut need_image = true;
let inner = self.inner.read().await;
let inner = self.inner.read().unwrap();
let reader = inner.file.block_cursor();
let mut reader = inner.file.block_cursor();
// Scan the page versions backwards, starting from `lsn`.
if let Some(vec_map) = inner.index.get(&key) {
let slice = vec_map.slice_range(lsn_range);
for (entry_lsn, pos) in slice.iter().rev() {
let buf = reader.read_blob(*pos).await?;
let buf = reader.read_blob(*pos)?;
let value = Value::des(&buf)?;
match value {
Value::Image(img) => {
@@ -223,18 +235,20 @@ impl Layer for InMemoryLayer {
impl std::fmt::Display for InMemoryLayer {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let end_lsn = self.end_lsn_or_max();
let inner = self.inner.read().unwrap();
let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX));
write!(f, "inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0)
}
}
impl InMemoryLayer {
///
/// Get layer size.
/// Get layer size on the disk
///
pub async fn size(&self) -> Result<u64> {
let inner = self.inner.read().await;
Ok(inner.file.size())
pub fn size(&self) -> Result<u64> {
let inner = self.inner.read().unwrap();
Ok(inner.file.size)
}
///
@@ -255,8 +269,8 @@ impl InMemoryLayer {
timeline_id,
tenant_id,
start_lsn,
end_lsn: OnceLock::new(),
inner: RwLock::new(InMemoryLayerInner {
end_lsn: None,
index: HashMap::new(),
file,
}),
@@ -267,10 +281,10 @@ impl InMemoryLayer {
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
/// Adds the page version to the in-memory tree
pub async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
pub fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
let mut inner = self.inner.write().await;
self.assert_writable();
let mut inner = self.inner.write().unwrap();
inner.assert_writeable();
let off = {
SER_BUFFER.with(|x| -> Result<_> {
@@ -301,11 +315,11 @@ impl InMemoryLayer {
/// Make the layer non-writeable. Only call once.
/// Records the end_lsn for non-dropped layers.
/// `end_lsn` is exclusive
pub async fn freeze(&self, end_lsn: Lsn) {
let inner = self.inner.write().await;
pub fn freeze(&self, end_lsn: Lsn) {
let mut inner = self.inner.write().unwrap();
assert!(self.start_lsn < end_lsn);
self.end_lsn.set(end_lsn).expect("end_lsn set only once");
inner.end_lsn = Some(end_lsn);
for vec_map in inner.index.values() {
for (lsn, _pos) in vec_map.as_slice() {
@@ -317,7 +331,7 @@ impl InMemoryLayer {
/// Write this frozen in-memory layer to disk.
///
/// Returns a new delta layer with all the same data as this in-memory layer
pub async fn write_to_disk(&self) -> Result<DeltaLayer> {
pub fn write_to_disk(&self) -> Result<DeltaLayer> {
// Grab the lock in read-mode. We hold it over the I/O, but because this
// layer is not writeable anymore, no one should be trying to acquire the
// write lock on it, so we shouldn't block anyone. There's one exception
@@ -327,21 +341,19 @@ impl InMemoryLayer {
// lock, it will see that it's not writeable anymore and retry, but it
// would have to wait until we release it. That race condition is very
// rare though, so we just accept the potential latency hit for now.
let inner = self.inner.read().await;
let end_lsn = *self.end_lsn.get().unwrap();
let inner = self.inner.read().unwrap();
let mut delta_layer_writer = DeltaLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_id,
Key::MIN,
self.start_lsn..end_lsn,
self.start_lsn..inner.end_lsn.unwrap(),
)?;
let mut buf = Vec::new();
let cursor = inner.file.block_cursor();
let mut cursor = inner.file.block_cursor();
let mut keys: Vec<(&Key, &VecMap<Lsn, u64>)> = inner.index.iter().collect();
keys.sort_by_key(|k| k.0);
@@ -350,7 +362,7 @@ impl InMemoryLayer {
let key = **key;
// Write all page versions
for (lsn, pos) in vec_map.as_slice() {
cursor.read_blob_into_buf(*pos, &mut buf).await?;
cursor.read_blob_into_buf(*pos, &mut buf)?;
let will_init = Value::des(&buf)?.will_init();
delta_layer_writer.put_value_bytes(key, *lsn, &buf, will_init)?;
}

View File

@@ -20,8 +20,8 @@ use utils::{
use super::filename::{DeltaFileName, ImageFileName};
use super::{
AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset,
LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter,
LayerKeyIter, LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
};
/// RemoteLayer is a not yet downloaded [`ImageLayer`] or
@@ -65,9 +65,8 @@ impl std::fmt::Debug for RemoteLayer {
}
}
#[async_trait::async_trait]
impl Layer for RemoteLayer {
async fn get_value_reconstruct_data(
fn get_value_reconstruct_data(
&self,
_key: Key,
_lsn_range: Range<Lsn>,
@@ -78,7 +77,7 @@ impl Layer for RemoteLayer {
}
/// debugging function to print out the contents of the layer
async fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
println!(
"----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} is_delta {} is_incremental {} size {} ----",
self.desc.tenant_id,
@@ -129,6 +128,14 @@ impl PersistentLayer for RemoteLayer {
None
}
/// Value iteration is not supported for a remote layer: this always returns
/// an error, since the layer has not been downloaded yet. `_ctx` is unused.
fn iter(&self, _ctx: &RequestContext) -> Result<LayerIter<'_>> {
bail!("cannot iterate a remote layer");
}
/// Key iteration is not supported for a remote layer: this always returns
/// an error, since the layer has not been downloaded yet. `_ctx` is unused.
fn key_iter(&self, _ctx: &RequestContext) -> Result<LayerKeyIter<'_>> {
bail!("cannot iterate a remote layer");
}
/// Always fails for a remote layer: there is no resident layer file to delete.
fn delete_resident_layer_file(&self) -> Result<()> {
bail!("remote layer has no layer file");
}

View File

@@ -73,13 +73,17 @@ pub fn start_background_loops(
///
async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
let wait_duration = Duration::from_secs(2);
info!("starting");
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
async {
let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
let mut first = true;
loop {
trace!("waking up");
tokio::select! {
_ = cancel.cancelled() => {
info!("received cancellation request");
return;
},
tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
@@ -107,7 +111,7 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
Duration::from_secs(10)
} else {
// Run compaction
if let Err(e) = tenant.compaction_iteration(&cancel, &ctx).await {
if let Err(e) = tenant.compaction_iteration(&ctx).await {
error!("Compaction failed, retrying in {:?}: {e:?}", wait_duration);
wait_duration
} else {
@@ -122,12 +126,15 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
.await
.is_ok()
{
info!("received cancellation request during idling");
break;
}
}
}
.await;
TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
trace!("compaction loop stopped.");
}
///
@@ -135,6 +142,7 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
///
async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
let wait_duration = Duration::from_secs(2);
info!("starting");
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
async {
// GC might require downloading, to find the cutoff LSN that corresponds to the
@@ -143,8 +151,11 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
let mut first = true;
loop {
trace!("waking up");
tokio::select! {
_ = cancel.cancelled() => {
info!("received cancellation request");
return;
},
tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
@@ -189,12 +200,14 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
.await
.is_ok()
{
info!("received cancellation request during idling");
break;
}
}
}
.await;
TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
trace!("GC loop stopped.");
}
async fn wait_for_active_tenant(tenant: &Arc<Tenant>) -> ControlFlow<()> {
@@ -219,6 +232,7 @@ async fn wait_for_active_tenant(tenant: &Arc<Tenant>) -> ControlFlow<()> {
}
}
Err(_sender_dropped_error) => {
info!("Tenant dropped the state updates sender, quitting waiting for tenant and the task loop");
return ControlFlow::Break(());
}
}

View File

@@ -1,4 +1,3 @@
pub mod delete;
mod eviction_task;
pub mod layer_manager;
mod logical_size;
@@ -19,7 +18,6 @@ use pageserver_api::models::{
use remote_storage::GenericRemoteStorage;
use serde_with::serde_as;
use storage_broker::BrokerClientChannel;
use tokio::runtime::Handle;
use tokio::sync::{oneshot, watch, TryAcquireError};
use tokio_util::sync::CancellationToken;
use tracing::*;
@@ -35,11 +33,8 @@ use std::sync::atomic::Ordering as AtomicOrdering;
use std::sync::{Arc, Mutex, RwLock, Weak};
use std::time::{Duration, Instant, SystemTime};
use crate::context::{
AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
};
use crate::context::{DownloadBehavior, RequestContext};
use crate::tenant::remote_timeline_client::{self, index::LayerFileMetadata};
use crate::tenant::storage_layer::delta_layer::DeltaEntry;
use crate::tenant::storage_layer::{
DeltaFileName, DeltaLayerWriter, ImageFileName, ImageLayerWriter, InMemoryLayer,
LayerAccessStats, LayerFileName, RemoteLayer,
@@ -84,7 +79,6 @@ use crate::METADATA_FILE_NAME;
use crate::ZERO_PAGE;
use crate::{is_temporary, task_mgr};
use self::delete::DeleteTimelineFlow;
pub(super) use self::eviction_task::EvictionTaskTenantState;
use self::eviction_task::EvictionTaskTimelineState;
use self::layer_manager::LayerManager;
@@ -243,10 +237,11 @@ pub struct Timeline {
/// Layer removal lock.
/// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
/// This lock is acquired in [`Timeline::gc`] and [`Timeline::compact`].
/// This is an `Arc<Mutex>` lock because we need an owned
/// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`],
/// and [`Tenant::delete_timeline`]. This is an `Arc<Mutex>` lock because we need an owned
/// lock guard in functions that will be spawned to tokio I/O pool (which requires `'static`).
/// Note that [`DeleteTimelineFlow`] uses `delete_progress` field.
///
/// [`Tenant::delete_timeline`]: super::Tenant::delete_timeline
pub(super) layer_removal_cs: Arc<tokio::sync::Mutex<()>>,
// Needed to ensure that we can't create a branch at a point that was already garbage collected
@@ -288,7 +283,7 @@ pub struct Timeline {
/// Prevent two tasks from deleting the timeline at the same time. If held, the
/// timeline is being deleted. If 'true', the timeline has already been deleted.
pub delete_progress: Arc<tokio::sync::Mutex<DeleteTimelineFlow>>,
pub delete_lock: Arc<tokio::sync::Mutex<bool>>,
eviction_task_timeline_state: tokio::sync::Mutex<EvictionTaskTimelineState>,
@@ -298,10 +293,6 @@ pub struct Timeline {
/// Completion shared between all timelines loaded during startup; used to delay heavier
/// background tasks until some logical sizes have been calculated.
initial_logical_size_attempt: Mutex<Option<completion::Completion>>,
/// Load or creation time information about the disk_consistent_lsn and when the loading
/// happened. Used for consumption metrics.
pub(crate) loaded_at: (Lsn, SystemTime),
}
pub struct WalReceiverInfo {
@@ -343,7 +334,7 @@ pub struct GcInfo {
#[derive(thiserror::Error)]
pub enum PageReconstructError {
#[error(transparent)]
Other(#[from] anyhow::Error),
Other(#[from] anyhow::Error), // source and Display delegate to anyhow::Error
/// The operation would require downloading a layer that is missing locally.
NeedsDownload(TenantTimelineId, LayerFileName),
@@ -484,7 +475,7 @@ impl Timeline {
img: cached_page_img,
};
let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME.start_timer();
let timer = self.metrics.get_reconstruct_data_time_histo.start_timer();
self.get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx)
.await?;
timer.stop_and_record();
@@ -532,7 +523,7 @@ impl Timeline {
size
}
pub fn resident_physical_size(&self) -> u64 {
pub fn get_resident_physical_size(&self) -> u64 {
self.metrics.resident_physical_size_gauge.get()
}
@@ -564,7 +555,7 @@ impl Timeline {
"wait_lsn cannot be called in WAL receiver"
);
let _timer = crate::metrics::WAIT_LSN_TIME.start_timer();
let _timer = self.metrics.wait_lsn_time_histo.start_timer();
match self
.last_record_lsn
@@ -620,46 +611,9 @@ impl Timeline {
}
/// Outermost timeline compaction operation; downloads needed layers.
pub async fn compact(
self: &Arc<Self>,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> anyhow::Result<()> {
pub async fn compact(self: &Arc<Self>, ctx: &RequestContext) -> anyhow::Result<()> {
const ROUNDS: usize = 2;
static CONCURRENT_COMPACTIONS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
once_cell::sync::Lazy::new(|| {
let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS;
let permits = usize::max(
1,
// while a lot of the work is done on spawn_blocking, we still do
// repartitioning in the async context. this should leave us some workers
// unblocked to be blocked on other work, hopefully easing any outside visible
// effects of restarts.
//
// 6/8 is a guess; previously we ran with unlimited 8 and more from
// spawn_blocking.
(total_threads * 3).checked_div(4).unwrap_or(0),
);
assert_ne!(permits, 0, "we will not be adding in permits later");
assert!(
permits < total_threads,
"need threads avail for shorter work"
);
tokio::sync::Semaphore::new(permits)
});
// this wait probably never needs any "long time spent" logging, because we already nag if
// compaction task goes over its period (20s) which is quite often in production.
let _permit = tokio::select! {
permit = CONCURRENT_COMPACTIONS.acquire() => {
permit
},
_ = cancel.cancelled() => {
return Ok(());
}
};
let last_record_lsn = self.get_last_record_lsn();
// Last record Lsn could be zero in case the timeline was just created
@@ -701,9 +655,6 @@ impl Timeline {
Err(CompactionError::DownloadRequired(rls)) => {
anyhow::bail!("Compaction requires downloading multiple times (last was {} layers), possibly battling against eviction", rls.len())
}
Err(CompactionError::ShuttingDown) => {
return Ok(());
}
Err(CompactionError::Other(e)) => {
return Err(e);
}
@@ -720,9 +671,11 @@ impl Timeline {
let mut failed = 0;
let mut cancelled = pin!(task_mgr::shutdown_watcher());
loop {
tokio::select! {
_ = cancel.cancelled() => anyhow::bail!("Cancelled while downloading remote layers"),
_ = &mut cancelled => anyhow::bail!("Cancelled while downloading remote layers"),
res = downloads.next() => {
match res {
Some(Ok(())) => {},
@@ -785,8 +738,7 @@ impl Timeline {
let layer_removal_cs = Arc::new(self.layer_removal_cs.clone().lock_owned().await);
// Is the timeline being deleted?
if self.is_stopping() {
trace!("Dropping out of compaction on timeline shutdown");
return Err(CompactionError::ShuttingDown);
return Err(anyhow::anyhow!("timeline is Stopping").into());
}
let target_file_size = self.get_checkpoint_distance();
@@ -802,15 +754,10 @@ impl Timeline {
.await
{
Ok((partitioning, lsn)) => {
// Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them
let image_ctx = RequestContextBuilder::extend(ctx)
.access_stats_behavior(AccessStatsBehavior::Skip)
.build();
// 2. Create new image layers for partitions that have been modified
// "enough".
let layer_paths_to_upload = self
.create_image_layers(&partitioning, lsn, false, &image_ctx)
.create_image_layers(&partitioning, lsn, false, ctx)
.await
.map_err(anyhow::Error::from)?;
if let Some(remote_client) = &self.remote_client {
@@ -883,7 +830,7 @@ impl Timeline {
let Some(open_layer) = layers.open_layer.as_ref() else {
return Ok(());
};
open_layer.size().await?
open_layer.size()?
};
let last_freeze_at = self.last_freeze_at.load();
let last_freeze_ts = *(self.last_freeze_ts.read().unwrap());
@@ -927,7 +874,7 @@ impl Timeline {
pub fn set_state(&self, new_state: TimelineState) {
match (self.current_state(), new_state) {
(equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => {
info!("Ignoring new state, equal to the existing one: {equal_state_2:?}");
warn!("Ignoring new state, equal to the existing one: {equal_state_2:?}");
}
(st, TimelineState::Loading) => {
error!("ignoring transition from {st:?} into Loading state");
@@ -943,7 +890,7 @@ impl Timeline {
new_state,
TimelineState::Stopping | TimelineState::Broken { .. }
) {
// drop the completion guard, if any; it might be holding off the completion
// drop the copmletion guard, if any; it might be holding off the completion
// forever needlessly
self.initial_logical_size_attempt
.lock()
@@ -1168,7 +1115,7 @@ impl Timeline {
return Err(EvictionError::CannotEvictRemoteLayer);
}
let layer_file_size = local_layer.layer_desc().file_size;
let layer_file_size = local_layer.file_size();
let local_layer_mtime = local_layer
.local_path()
@@ -1378,10 +1325,9 @@ impl Timeline {
pg_version: u32,
initial_logical_size_can_start: Option<completion::Barrier>,
initial_logical_size_attempt: Option<completion::Completion>,
state: TimelineState,
) -> Arc<Self> {
let disk_consistent_lsn = metadata.disk_consistent_lsn();
let (state, _) = watch::channel(state);
let (state, _) = watch::channel(TimelineState::Loading);
let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0);
let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));
@@ -1421,8 +1367,6 @@ impl Timeline {
last_freeze_at: AtomicLsn::new(disk_consistent_lsn.0),
last_freeze_ts: RwLock::new(Instant::now()),
loaded_at: (disk_consistent_lsn, SystemTime::now()),
ancestor_timeline: ancestor,
ancestor_lsn: metadata.ancestor_lsn(),
@@ -1474,7 +1418,7 @@ impl Timeline {
eviction_task_timeline_state: tokio::sync::Mutex::new(
EvictionTaskTimelineState::default(),
),
delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTimelineFlow::default())),
delete_lock: Arc::new(tokio::sync::Mutex::new(false)),
initial_logical_size_can_start,
initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt),
@@ -1598,6 +1542,7 @@ impl Timeline {
///
pub(super) async fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> {
let mut guard = self.layers.write().await;
let mut num_layers = 0;
let timer = self.metrics.load_layer_map_histo.start_timer();
@@ -1615,12 +1560,12 @@ impl Timeline {
let fname = direntry.file_name();
let fname = fname.to_string_lossy();
if let Some(filename) = ImageFileName::parse_str(&fname) {
if let Some(imgfilename) = ImageFileName::parse_str(&fname) {
// create an ImageLayer struct for each image file.
if filename.lsn > disk_consistent_lsn {
info!(
if imgfilename.lsn > disk_consistent_lsn {
warn!(
"found future image layer {} on timeline {} disk_consistent_lsn is {}",
filename, self.timeline_id, disk_consistent_lsn
imgfilename, self.timeline_id, disk_consistent_lsn
);
rename_to_backup(&direntry_path)?;
@@ -1628,31 +1573,31 @@ impl Timeline {
}
let file_size = direntry_path.metadata()?.len();
let stats =
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Resident);
let layer = ImageLayer::new(
self.conf,
self.timeline_id,
self.tenant_id,
&filename,
&imgfilename,
file_size,
stats,
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Resident),
);
trace!("found layer {}", layer.path().display());
total_physical_size += file_size;
loaded_layers.push(Arc::new(layer));
} else if let Some(filename) = DeltaFileName::parse_str(&fname) {
num_layers += 1;
} else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) {
// Create a DeltaLayer struct for each delta file.
// The end-LSN is exclusive, while disk_consistent_lsn is
// inclusive. For example, if disk_consistent_lsn is 100, it is
// OK for a delta layer to have end LSN 101, but if the end LSN
// is 102, then it might not have been fully flushed to disk
// before crash.
if filename.lsn_range.end > disk_consistent_lsn + 1 {
info!(
if deltafilename.lsn_range.end > disk_consistent_lsn + 1 {
warn!(
"found future delta layer {} on timeline {} disk_consistent_lsn is {}",
filename, self.timeline_id, disk_consistent_lsn
deltafilename, self.timeline_id, disk_consistent_lsn
);
rename_to_backup(&direntry_path)?;
@@ -1660,20 +1605,20 @@ impl Timeline {
}
let file_size = direntry_path.metadata()?.len();
let stats =
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Resident);
let layer = DeltaLayer::new(
self.conf,
self.timeline_id,
self.tenant_id,
&filename,
&deltafilename,
file_size,
stats,
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Resident),
);
trace!("found layer {}", layer.path().display());
total_physical_size += file_size;
loaded_layers.push(Arc::new(layer));
num_layers += 1;
} else if fname == METADATA_FILE_NAME || fname.ends_with(".old") {
// ignore these
} else if remote_timeline_client::is_temp_download_file(&direntry_path) {
@@ -1698,7 +1643,6 @@ impl Timeline {
}
}
let num_layers = loaded_layers.len();
guard.initialize_local_layers(loaded_layers, Lsn(disk_consistent_lsn.0) + 1);
info!(
@@ -1793,21 +1737,19 @@ impl Timeline {
match remote_layer_name {
LayerFileName::Image(imgfilename) => {
if imgfilename.lsn > up_to_date_disk_consistent_lsn {
info!(
warn!(
"found future image layer {} on timeline {} remote_consistent_lsn is {}",
imgfilename, self.timeline_id, up_to_date_disk_consistent_lsn
);
continue;
}
let stats =
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Evicted);
let remote_layer = RemoteLayer::new_img(
self.tenant_id,
self.timeline_id,
imgfilename,
&remote_layer_metadata,
stats,
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Evicted),
);
let remote_layer = Arc::new(remote_layer);
added_remote_layers.push(remote_layer);
@@ -1820,21 +1762,18 @@ impl Timeline {
// is 102, then it might not have been fully flushed to disk
// before crash.
if deltafilename.lsn_range.end > up_to_date_disk_consistent_lsn + 1 {
info!(
warn!(
"found future delta layer {} on timeline {} remote_consistent_lsn is {}",
deltafilename, self.timeline_id, up_to_date_disk_consistent_lsn
);
continue;
}
let stats =
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Evicted);
let remote_layer = RemoteLayer::new_delta(
self.tenant_id,
self.timeline_id,
deltafilename,
&remote_layer_metadata,
stats,
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Evicted),
);
let remote_layer = Arc::new(remote_layer);
added_remote_layers.push(remote_layer);
@@ -1944,15 +1883,6 @@ impl Timeline {
}
fn try_spawn_size_init_task(self: &Arc<Self>, lsn: Lsn, ctx: &RequestContext) {
let state = self.current_state();
if matches!(
state,
TimelineState::Broken { .. } | TimelineState::Stopping
) {
// Can happen when timeline detail endpoint is used when deletion is ongoing (or it's broken).
return;
}
let permit = match Arc::clone(&self.current_logical_size.initial_size_computation)
.try_acquire_owned()
{
@@ -2282,16 +2212,15 @@ trait TraversalLayerExt {
impl TraversalLayerExt for Arc<dyn PersistentLayer> {
fn traversal_id(&self) -> TraversalId {
let timeline_id = self.layer_desc().timeline_id;
match self.local_path() {
Some(local_path) => {
debug_assert!(local_path.to_str().unwrap().contains(&format!("{}", timeline_id)),
debug_assert!(local_path.to_str().unwrap().contains(&format!("{}", self.get_timeline_id())),
"need timeline ID to uniquely identify the layer when traversal crosses ancestor boundary",
);
format!("{}", local_path.display())
}
None => {
format!("remote {}/{self}", timeline_id)
format!("remote {}/{self}", self.get_timeline_id())
}
}
}
@@ -2323,9 +2252,8 @@ impl Timeline {
let mut timeline_owned;
let mut timeline = self;
let mut read_count = scopeguard::guard(0, |cnt| {
crate::metrics::READ_NUM_FS_LAYERS.observe(cnt as f64)
});
let mut read_count =
scopeguard::guard(0, |cnt| self.metrics.read_num_fs_layers.observe(cnt as f64));
// For debugging purposes, collect the path of layers that we traversed
// through. It's included in the error message if we fail to find the key.
@@ -2459,15 +2387,12 @@ impl Timeline {
// Get all the data needed to reconstruct the page version from this layer.
// But if we have an older cached page image, no need to go past that.
let lsn_floor = max(cached_lsn + 1, start_lsn);
result = match open_layer
.get_value_reconstruct_data(
key,
lsn_floor..cont_lsn,
reconstruct_state,
ctx,
)
.await
{
result = match open_layer.get_value_reconstruct_data(
key,
lsn_floor..cont_lsn,
reconstruct_state,
ctx,
) {
Ok(result) => result,
Err(e) => return Err(PageReconstructError::from(e)),
};
@@ -2489,15 +2414,12 @@ impl Timeline {
if cont_lsn > start_lsn {
//info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display());
let lsn_floor = max(cached_lsn + 1, start_lsn);
result = match frozen_layer
.get_value_reconstruct_data(
key,
lsn_floor..cont_lsn,
reconstruct_state,
ctx,
)
.await
{
result = match frozen_layer.get_value_reconstruct_data(
key,
lsn_floor..cont_lsn,
reconstruct_state,
ctx,
) {
Ok(result) => result,
Err(e) => return Err(PageReconstructError::from(e)),
};
@@ -2528,15 +2450,12 @@ impl Timeline {
// Get all the data needed to reconstruct the page version from this layer.
// But if we have an older cached page image, no need to go past that.
let lsn_floor = max(cached_lsn + 1, lsn_floor);
result = match layer
.get_value_reconstruct_data(
key,
lsn_floor..cont_lsn,
reconstruct_state,
ctx,
)
.await
{
result = match layer.get_value_reconstruct_data(
key,
lsn_floor..cont_lsn,
reconstruct_state,
ctx,
) {
Ok(result) => result,
Err(e) => return Err(PageReconstructError::from(e)),
};
@@ -2655,7 +2574,7 @@ impl Timeline {
async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> anyhow::Result<()> {
//info!("PUT: key {} at {}", key, lsn);
let layer = self.get_layer_for_write(lsn).await?;
layer.put_value(key, lsn, val).await?;
layer.put_value(key, lsn, val)?;
Ok(())
}
@@ -2681,9 +2600,7 @@ impl Timeline {
Some(self.write_lock.lock().await)
};
let mut guard = self.layers.write().await;
guard
.try_freeze_in_memory_layer(self.get_last_record_lsn(), &self.last_freeze_at)
.await;
guard.try_freeze_in_memory_layer(self.get_last_record_lsn(), &self.last_freeze_at);
}
/// Layer flusher task's main loop.
@@ -2829,10 +2746,7 @@ impl Timeline {
// We will remove frozen layer and add delta layer in one atomic operation later.
let layer = self.create_delta_layer(&frozen_layer).await?;
(
HashMap::from([(
layer.filename(),
LayerFileMetadata::new(layer.layer_desc().file_size),
)]),
HashMap::from([(layer.filename(), LayerFileMetadata::new(layer.file_size()))]),
Some(layer),
)
};
@@ -2852,7 +2766,7 @@ impl Timeline {
);
// update metrics
let sz = l.layer_desc().file_size;
let sz = l.file_size();
self.metrics.resident_physical_size_gauge.add(sz);
self.metrics.num_persistent_files_created.inc_by(1);
self.metrics.persistent_bytes_written.inc_by(sz);
@@ -2965,11 +2879,7 @@ impl Timeline {
let frozen_layer = Arc::clone(frozen_layer);
move || {
// Write it out
// Keep this inside `spawn_blocking` and `Handle::current`
// as long as the write path is still sync and the read impl
// is still not fully async. Otherwise executor threads would
// be blocked.
let new_delta = Handle::current().block_on(frozen_layer.write_to_disk())?;
let new_delta = frozen_layer.write_to_disk()?;
let new_delta_path = new_delta.path();
// Sync it to disk.
@@ -3263,8 +3173,6 @@ enum CompactionError {
/// This should not happen repeatedly, but will be retried once by top-level
/// `Timeline::compact`.
DownloadRequired(Vec<Arc<RemoteLayer>>),
/// The timeline or pageserver is shutting down
ShuttingDown,
/// Compaction cannot be done right now; page reconstruction and so on.
Other(anyhow::Error),
}
@@ -3313,10 +3221,10 @@ struct CompactLevel0Phase1StatsBuilder {
timeline_id: Option<TimelineId>,
read_lock_acquisition_micros: DurationRecorder,
read_lock_held_spawn_blocking_startup_micros: DurationRecorder,
read_lock_held_key_sort_micros: DurationRecorder,
read_lock_held_prerequisites_micros: DurationRecorder,
read_lock_held_compute_holes_micros: DurationRecorder,
read_lock_drop_micros: DurationRecorder,
prepare_iterators_micros: DurationRecorder,
write_layer_files_micros: DurationRecorder,
level0_deltas_count: Option<usize>,
new_deltas_count: Option<usize>,
@@ -3333,10 +3241,10 @@ struct CompactLevel0Phase1Stats {
timeline_id: TimelineId,
read_lock_acquisition_micros: RecordedDuration,
read_lock_held_spawn_blocking_startup_micros: RecordedDuration,
read_lock_held_key_sort_micros: RecordedDuration,
read_lock_held_prerequisites_micros: RecordedDuration,
read_lock_held_compute_holes_micros: RecordedDuration,
read_lock_drop_micros: RecordedDuration,
prepare_iterators_micros: RecordedDuration,
write_layer_files_micros: RecordedDuration,
level0_deltas_count: usize,
new_deltas_count: usize,
@@ -3363,10 +3271,6 @@ impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
.read_lock_held_spawn_blocking_startup_micros
.into_recorded()
.ok_or_else(|| anyhow!("read_lock_held_spawn_blocking_startup_micros not set"))?,
read_lock_held_key_sort_micros: value
.read_lock_held_key_sort_micros
.into_recorded()
.ok_or_else(|| anyhow!("read_lock_held_key_sort_micros not set"))?,
read_lock_held_prerequisites_micros: value
.read_lock_held_prerequisites_micros
.into_recorded()
@@ -3379,6 +3283,10 @@ impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
.read_lock_drop_micros
.into_recorded()
.ok_or_else(|| anyhow!("read_lock_drop_micros not set"))?,
prepare_iterators_micros: value
.prepare_iterators_micros
.into_recorded()
.ok_or_else(|| anyhow!("prepare_iterators_micros not set"))?,
write_layer_files_micros: value
.write_layer_files_micros
.into_recorded()
@@ -3475,14 +3383,14 @@ impl Timeline {
// "gaps" in the sequence of level 0 files should only happen in case
// of a crash, partial download from cloud storage, or something like
// that, so it's not a big deal in practice.
level0_deltas.sort_by_key(|l| l.layer_desc().lsn_range.start);
level0_deltas.sort_by_key(|l| l.get_lsn_range().start);
let mut level0_deltas_iter = level0_deltas.iter();
let first_level0_delta = level0_deltas_iter.next().unwrap();
let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end;
let mut prev_lsn_end = first_level0_delta.get_lsn_range().end;
let mut deltas_to_compact = vec![Arc::clone(first_level0_delta)];
for l in level0_deltas_iter {
let lsn_range = &l.layer_desc().lsn_range;
let lsn_range = l.get_lsn_range();
if lsn_range.start != prev_lsn_end {
break;
@@ -3491,13 +3399,8 @@ impl Timeline {
prev_lsn_end = lsn_range.end;
}
let lsn_range = Range {
start: deltas_to_compact
.first()
.unwrap()
.layer_desc()
.lsn_range
.start,
end: deltas_to_compact.last().unwrap().layer_desc().lsn_range.end,
start: deltas_to_compact.first().unwrap().get_lsn_range().start,
end: deltas_to_compact.last().unwrap().get_lsn_range().end,
};
let remotes = deltas_to_compact
@@ -3547,26 +3450,10 @@ impl Timeline {
// min-heap (reserve space for one more element added before eviction)
let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
let mut prev: Option<Key> = None;
let mut all_keys = Vec::new();
let downcast_deltas: Vec<_> = deltas_to_compact
.iter()
.map(|l| l.clone().downcast_delta_layer().expect("delta layer"))
.collect();
for dl in downcast_deltas.iter() {
// TODO: replace this with an await once we fully go async
all_keys.extend(Handle::current().block_on(DeltaLayer::load_keys(dl, ctx))?);
}
// The current stdlib sorting implementation is designed in a way where it is
// particularly fast where the slice is made up of sorted sub-ranges.
all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn));
stats.read_lock_held_key_sort_micros = stats.read_lock_held_prerequisites_micros.till_now();
for DeltaEntry { key: next_key, .. } in all_keys.iter() {
let next_key = *next_key;
for (next_key, _next_lsn, _size) in itertools::process_results(
deltas_to_compact.iter().map(|l| l.key_iter(ctx)),
|iter_iter| iter_iter.kmerge_by(|a, b| a.0 < b.0),
)? {
if let Some(prev_key) = prev {
// just first fast filter
if next_key.to_i128() - prev_key.to_i128() >= min_hole_range {
@@ -3589,7 +3476,8 @@ impl Timeline {
}
prev = Some(next_key.next());
}
stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now();
stats.read_lock_held_compute_holes_micros =
stats.read_lock_held_prerequisites_micros.till_now();
drop_rlock(guard);
stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now();
let mut holes = heap.into_vec();
@@ -3598,26 +3486,36 @@ impl Timeline {
// This iterator walks through all key-value pairs from all the layers
// we're compacting, in key, LSN order.
let all_values_iter = all_keys.iter();
let all_values_iter = itertools::process_results(
deltas_to_compact.iter().map(|l| l.iter(ctx)),
|iter_iter| {
iter_iter.kmerge_by(|a, b| {
if let Ok((a_key, a_lsn, _)) = a {
if let Ok((b_key, b_lsn, _)) = b {
(a_key, a_lsn) < (b_key, b_lsn)
} else {
false
}
} else {
true
}
})
},
)?;
// This iterator walks through all keys and is needed to calculate size used by each key
let mut all_keys_iter = all_keys
.iter()
.map(|DeltaEntry { key, lsn, size, .. }| (*key, *lsn, *size))
.coalesce(|mut prev, cur| {
// Coalesce keys that belong to the same key pair.
// This ensures that compaction doesn't put them
// into different layer files.
// Still limit this by the target file size,
// so that we keep the size of the files in
// check.
if prev.0 == cur.0 && prev.2 < target_file_size {
prev.2 += cur.2;
Ok(prev)
} else {
Err((prev, cur))
}
});
let mut all_keys_iter = itertools::process_results(
deltas_to_compact.iter().map(|l| l.key_iter(ctx)),
|iter_iter| {
iter_iter.kmerge_by(|a, b| {
let (a_key, a_lsn, _) = a;
let (b_key, b_lsn, _) = b;
(a_key, a_lsn) < (b_key, b_lsn)
})
},
)?;
stats.prepare_iterators_micros = stats.read_lock_drop_micros.till_now();
// Merge the contents of all the input delta layers into a new set
// of delta layers, based on the current partitioning.
@@ -3669,127 +3567,104 @@ impl Timeline {
let mut key_values_total_size = 0u64;
let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key
let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key
// TODO remove this block_on wrapper once we fully go async
Handle::current().block_on(async {
for &DeltaEntry {
key, lsn, ref val, ..
} in all_values_iter
{
let value = val.load().await?;
let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
// We need to check key boundaries once we reach next key or end of layer with the same key
if !same_key || lsn == dup_end_lsn {
let mut next_key_size = 0u64;
let is_dup_layer = dup_end_lsn.is_valid();
dup_start_lsn = Lsn::INVALID;
if !same_key {
dup_end_lsn = Lsn::INVALID;
}
// Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size
for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() {
next_key_size = next_size;
if key != next_key {
if dup_end_lsn.is_valid() {
// We are writing a segment with duplicates:
// place all remaining values of this key in separate segment
dup_start_lsn = dup_end_lsn; // new segments starts where old stops
dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range
}
break;
}
key_values_total_size += next_size;
// Check if it is time to split segment: if total keys size is larger than target file size.
// We need to avoid generation of empty segments if next_size > target_file_size.
if key_values_total_size > target_file_size && lsn != next_lsn {
// Split key between multiple layers: such layer can contain only single key
dup_start_lsn = if dup_end_lsn.is_valid() {
dup_end_lsn // new segment with duplicates starts where old one stops
} else {
lsn // start with the first LSN for this key
};
dup_end_lsn = next_lsn; // upper LSN boundary is exclusive
break;
}
}
// handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set.
if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() {
dup_start_lsn = dup_end_lsn;
dup_end_lsn = lsn_range.end;
}
if writer.is_some() {
let written_size = writer.as_mut().unwrap().size();
let contains_hole =
next_hole < holes.len() && key >= holes[next_hole].key_range.end;
// check if key cause layer overflow or contains hole...
if is_dup_layer
|| dup_end_lsn.is_valid()
|| written_size + key_values_total_size > target_file_size
|| contains_hole
{
// ... if so, flush previous layer and prepare to write new one
new_layers.push(Arc::new(
writer.take().unwrap().finish(prev_key.unwrap().next())?,
));
writer = None;
if contains_hole {
// skip hole
next_hole += 1;
}
}
}
// Remember size of key value because at next iteration we will access next item
key_values_total_size = next_key_size;
for x in all_values_iter {
let (key, lsn, value) = x?;
let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
// We need to check key boundaries once we reach next key or end of layer with the same key
if !same_key || lsn == dup_end_lsn {
let mut next_key_size = 0u64;
let is_dup_layer = dup_end_lsn.is_valid();
dup_start_lsn = Lsn::INVALID;
if !same_key {
dup_end_lsn = Lsn::INVALID;
}
if writer.is_none() {
// Create writer if not initiaized yet
writer = Some(DeltaLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_id,
key,
// Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size
for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() {
next_key_size = next_size;
if key != next_key {
if dup_end_lsn.is_valid() {
// this is a layer containing slice of values of the same key
debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
dup_start_lsn..dup_end_lsn
// We are writting segment with duplicates:
// place all remaining values of this key in separate segment
dup_start_lsn = dup_end_lsn; // new segments starts where old stops
dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range
}
break;
}
key_values_total_size += next_size;
// Check if it is time to split segment: if total keys size is larger than target file size.
// We need to avoid generation of empty segments if next_size > target_file_size.
if key_values_total_size > target_file_size && lsn != next_lsn {
// Split key between multiple layers: such layer can contain only single key
dup_start_lsn = if dup_end_lsn.is_valid() {
dup_end_lsn // new segment with duplicates starts where old one stops
} else {
debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
lsn_range.clone()
},
)?);
lsn // start with the first LSN for this key
};
dup_end_lsn = next_lsn; // upper LSN boundary is exclusive
break;
}
}
// handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set.
if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() {
dup_start_lsn = dup_end_lsn;
dup_end_lsn = lsn_range.end;
}
if writer.is_some() {
let written_size = writer.as_mut().unwrap().size();
let contains_hole =
next_hole < holes.len() && key >= holes[next_hole].key_range.end;
// check if key cause layer overflow or contains hole...
if is_dup_layer
|| dup_end_lsn.is_valid()
|| written_size + key_values_total_size > target_file_size
|| contains_hole
{
// ... if so, flush previous layer and prepare to write new one
new_layers.push(Arc::new(
writer.take().unwrap().finish(prev_key.unwrap().next())?,
));
writer = None;
fail_point!("delta-layer-writer-fail-before-finish", |_| {
Result::<_>::Err(anyhow::anyhow!(
"failpoint delta-layer-writer-fail-before-finish"
))
});
writer.as_mut().unwrap().put_value(key, lsn, value)?;
prev_key = Some(key);
if contains_hole {
// skip hole
next_hole += 1;
}
}
}
// Remember size of key value because at next iteration we will access next item
key_values_total_size = next_key_size;
}
Ok(())
})?;
if writer.is_none() {
// Create writer if not initiaized yet
writer = Some(DeltaLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_id,
key,
if dup_end_lsn.is_valid() {
// this is a layer containing slice of values of the same key
debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
dup_start_lsn..dup_end_lsn
} else {
debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
lsn_range.clone()
},
)?);
}
fail_point!("delta-layer-writer-fail-before-finish", |_| {
Err(anyhow::anyhow!("failpoint delta-layer-writer-fail-before-finish").into())
});
writer.as_mut().unwrap().put_value(key, lsn, value)?;
prev_key = Some(key);
}
if let Some(writer) = writer {
new_layers.push(Arc::new(writer.finish(prev_key.unwrap().next())?));
}
// Sync layers
if !new_layers.is_empty() {
// Print a warning if the created layer is larger than double the target size
// Add two pages for potential overhead. This should in theory be already
// accounted for in the target calculation, but for very small targets,
// we still might easily hit the limit otherwise.
let warn_limit = target_file_size * 2 + page_cache::PAGE_SZ as u64 * 2;
for layer in new_layers.iter() {
if layer.desc.file_size > warn_limit {
warn!(
%layer,
"created delta file of size {} larger than double of target of {target_file_size}", layer.desc.file_size
);
}
}
let mut layer_paths: Vec<PathBuf> = new_layers.iter().map(|l| l.path()).collect();
// Fsync all the layer files and directory using multiple threads to
@@ -3802,10 +3677,12 @@ impl Timeline {
layer_paths.pop().unwrap();
}
stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now();
stats.write_layer_files_micros = stats.prepare_iterators_micros.till_now();
stats.new_deltas_count = Some(new_layers.len());
stats.new_deltas_size = Some(new_layers.iter().map(|l| l.desc.file_size).sum());
drop(all_keys_iter); // So that deltas_to_compact is no longer borrowed
match TryInto::<CompactLevel0Phase1Stats>::try_into(stats)
.and_then(|stats| serde_json::to_string(&stats).context("serde_json::to_string"))
{
@@ -4703,7 +4580,7 @@ impl std::fmt::Debug for LocalLayerInfoForDiskUsageEviction {
impl LocalLayerInfoForDiskUsageEviction {
pub fn file_size(&self) -> u64 {
self.layer.layer_desc().file_size
self.layer.file_size()
}
}

View File

@@ -1,594 +0,0 @@
use std::{
ops::{Deref, DerefMut},
sync::Arc,
};
use anyhow::Context;
use pageserver_api::models::TimelineState;
use tokio::sync::OwnedMutexGuard;
use tracing::{debug, error, info, instrument, warn, Instrument, Span};
use utils::{
crashsafe, fs_ext,
id::{TenantId, TimelineId},
};
use crate::{
config::PageServerConf,
task_mgr::{self, TaskKind},
tenant::{
metadata::TimelineMetadata,
remote_timeline_client::{
self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient,
},
CreateTimelineCause, DeleteTimelineError, Tenant,
},
InitializationOrder,
};
use super::Timeline;
/// Shuts down everything that still works on behalf of `timeline`.
///
/// The caller is expected to have moved the timeline into the Stopping state already.
/// Order of operations: stop the walreceiver, stop the remote client from accepting
/// new uploads, then wait for the remaining tasks registered for this timeline.
///
/// Returns `DeleteTimelineError::Other` when the remote client reports that its
/// upload queue was never initialized.
async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
    // The walreceiver goes first.
    debug!("waiting for wal receiver to shutdown");
    // The lock guard is a temporary of this statement, so it is gone before we await below.
    let taken_walreceiver = timeline.walreceiver.lock().unwrap().take();
    if let Some(handle) = taken_walreceiver {
        handle.stop().await;
    }
    debug!("wal receiver shutdown confirmed");

    // No new uploads may start from here on.
    if let Some(remote_client) = timeline.remote_client.as_ref() {
        match remote_client.stop() {
            Ok(()) => {}
            // Not expected to happen today: load and attach bail out if _any_ timeline
            // fails to fetch its IndexPart, i.e. before the Tenant is declared Active,
            // and delete_timeline is only allowed on Active tenants.
            Err(remote_timeline_client::StopError::QueueUninitialized) => {
                return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs")));
            }
        }
    }

    // Stop and wait for whatever timeline tasks remain, upload tasks included.
    // NB: delete_timeline callers do not run as task_mgr tasks themselves,
    // so this shutdown_tasks() call does not affect them.
    info!("waiting for timeline tasks to shutdown");
    task_mgr::shutdown_tasks(None, Some(timeline.tenant_id), Some(timeline.timeline_id)).await;

    fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
        Err(anyhow::anyhow!(
            "failpoint: timeline-delete-before-index-deleted-at"
        ))?
    });

    Ok(())
}
/// Records the deletion in the remote index (S3) so that the timeline is not picked up
/// again during attach or after a pageserver restart.
/// See the comment on `persist_index_part_with_deleted_flag`.
///
/// A timeline without a remote client has nothing to mark; that case is a no-op.
async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
    let remote_client = match timeline.remote_client.as_ref() {
        Some(client) => client,
        None => return Ok(()),
    };

    match remote_client.persist_index_part_with_deleted_flag().await {
        // Marked as deleted just now, or it already was: either way we may proceed.
        Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => Ok(()),
        // Everything else aborts the deletion.
        //
        // AlreadyInProgress is not expected: the 'delete_lock' keeps two tasks from
        // deleting concurrently, and the task that starts a deletion should finish it.
        Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
        | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
            Err(DeleteTimelineError::Other(anyhow::anyhow!(e)))
        }
    }
}
// Local files are deleted first, so if the pageserver restarts after the local deletion,
// the remote deletion is not continued. Inverting the two steps does not help either:
// once index_part.json is gone there is no way to tell "this timeline is good, we just
// didn't upload it to remote" apart from "this timeline is deleted, continue removing
// its local state". A mark file removes that ambiguity.
// After the index part is deleted, the presence of this mark file identifies that
// a deletion was intended, so we can just remove the mark file.
//
// Creates (or re-opens) the mark file and fsyncs it together with its parent directory.
async fn create_delete_mark(
    conf: &PageServerConf,
    tenant_id: TenantId,
    timeline_id: TimelineId,
) -> Result<(), DeleteTimelineError> {
    fail::fail_point!("timeline-delete-before-delete-mark", |_| {
        Err(anyhow::anyhow!(
            "failpoint: timeline-delete-before-delete-mark"
        ))?
    });

    let marker_path = conf.timeline_delete_mark_file_path(tenant_id, timeline_id);

    // An already existing mark file is fine: it is simply opened again.
    let mut open_options = std::fs::OpenOptions::new();
    open_options.write(true).create(true);
    let handle = open_options
        .open(&marker_path)
        .with_context(|| format!("could not create delete marker file {marker_path:?}"))?;
    // Only the file's existence matters; close the handle before syncing.
    drop(handle);

    crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?;

    Ok(())
}
/// Grab the layer_removal_cs lock, and actually perform the deletion.
///
/// This lock prevents GC or compaction from running at the same time.
/// The GC task doesn't register itself with the timeline it's operating on,
/// so it might still be running even though we called `shutdown_tasks`.
///
/// Note that there are still other race conditions between
/// GC, compaction and timeline deletion. See
/// <https://github.com/neondatabase/neon/issues/2671>
///
/// No timeout here, GC & Compaction should be responsive to the
/// `TimelineState::Stopping` change.
///
/// Everything under the local timeline directory is removed except the metadata
/// file and the directory itself. Returns `Ok(())` immediately if the directory
/// does not exist.
async fn delete_local_layer_files(
conf: &PageServerConf,
tenant_id: TenantId,
timeline: &Timeline,
) -> anyhow::Result<()> {
info!("waiting for layer_removal_cs.lock()");
let layer_removal_guard = timeline.layer_removal_cs.lock().await;
info!("got layer_removal_cs.lock(), deleting layer files");
// NB: storage_sync upload tasks that reference these layers have been cancelled
// by the caller.
let local_timeline_directory = conf.timeline_path(&tenant_id, &timeline.timeline_id);
fail::fail_point!("timeline-delete-before-rm", |_| {
Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
});
// NB: This need not be atomic because the deleted flag in the IndexPart
// will be observed during tenant/timeline load. The deletion will be resumed there.
//
// For configurations without remote storage, we guarantee crash-safety by persisting delete mark file.
//
// Note that here we do not bail out on std::io::ErrorKind::NotFound.
// This can happen if we're called a second time, e.g.,
// because of a previous failure/cancellation at/after
// failpoint timeline-delete-after-rm.
//
// It can also happen if we race with tenant detach, because,
// it doesn't grab the layer_removal_cs lock.
//
// For now, log and continue.
// warn! level is technically not appropriate for the
// first case because we should expect retries to happen.
// But the error is so rare, it seems better to get attention if it happens.
//
// Note that metadata removal is skipped, this is not technically needed,
// but allows to reuse timeline loading code during resumed deletion.
// (we always expect that metadata is in place when timeline is being loaded)
// Counts visited directory entries; only used to trigger the
// "timeline-delete-during-rm" failpoint at the second entry in testing builds.
#[cfg(feature = "testing")]
let mut counter = 0;
// Timeline directory may not exist if we failed to delete mark file and request was retried.
if !local_timeline_directory.exists() {
return Ok(());
}
let metadata_path = conf.metadata_path(&tenant_id, &timeline.timeline_id);
// contents_first(true): directory contents are yielded before the directory itself,
// so a nested directory is already empty when remove_dir is called on it.
for entry in walkdir::WalkDir::new(&local_timeline_directory).contents_first(true) {
#[cfg(feature = "testing")]
{
counter += 1;
if counter == 2 {
fail::fail_point!("timeline-delete-during-rm", |_| {
Err(anyhow::anyhow!("failpoint: timeline-delete-during-rm"))?
});
}
}
let entry = entry?;
if entry.path() == metadata_path {
debug!("found metadata, skipping");
continue;
}
if entry.path() == local_timeline_directory {
// Keeping directory because metadata file is still there
debug!("found timeline dir itself, skipping");
continue;
}
let metadata = match entry.metadata() {
Ok(metadata) => metadata,
Err(e) => {
// The entry vanished between listing and stat: log and move on (see above).
if crate::is_walkdir_io_not_found(&e) {
warn!(
timeline_dir=?local_timeline_directory,
path=?entry.path().display(),
"got not found err while removing timeline dir, proceeding anyway"
);
continue;
}
anyhow::bail!(e);
}
};
// The if/else below evaluates to the io::Result of the removal,
// to which the error context is then attached.
if metadata.is_dir() {
warn!(path=%entry.path().display(), "unexpected directory under timeline dir");
tokio::fs::remove_dir(entry.path()).await
} else {
tokio::fs::remove_file(entry.path()).await
}
.with_context(|| format!("Failed to remove: {}", entry.path().display()))?;
}
info!("finished deleting layer files, releasing layer_removal_cs.lock()");
drop(layer_removal_guard);
fail::fail_point!("timeline-delete-after-rm", |_| {
Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
});
Ok(())
}
/// Deletes the remote layers and, after them, the index file.
///
/// Does nothing when the timeline has no remote client.
async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<()> {
    match &timeline.remote_client {
        Some(remote_client) => remote_client.delete_all().await.context("delete_all"),
        None => Ok(()),
    }
}
// This function removes the remaining traces of a timeline on disk.
// Namely: metadata file, timeline directory, delete mark (in that order).
// Note: io::ErrorKind::NotFound is ignored for the metadata file and the timeline dir.
// The delete mark should be present because removing it is the last step of deletion
// (nothing can fail after its deletion), so a missing mark is reported as an error.
async fn cleanup_remaining_timeline_fs_traces(
    conf: &PageServerConf,
    tenant_id: TenantId,
    timeline_id: TimelineId,
) -> anyhow::Result<()> {
    // Local metadata file.
    let metadata_file = conf.metadata_path(&tenant_id, &timeline_id);
    tokio::fs::remove_file(metadata_file)
        .await
        .or_else(fs_ext::ignore_not_found)
        .context("remove metadata")?;

    fail::fail_point!("timeline-delete-after-rm-metadata", |_| {
        Err(anyhow::anyhow!(
            "failpoint: timeline-delete-after-rm-metadata"
        ))?
    });

    // The timeline directory itself.
    let timeline_dir = conf.timeline_path(&tenant_id, &timeline_id);
    tokio::fs::remove_dir(timeline_dir)
        .await
        .or_else(fs_ext::ignore_not_found)
        .context("timeline dir")?;

    fail::fail_point!("timeline-delete-after-rm-dir", |_| {
        Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm-dir"))?
    });

    // The removals above must be ordered before the removal of the mark.
    // Without the fsync there is no guarantee that they reach the disk before
    // the mark deletion does: the mark deletion could hit the disk first, and
    // the other deletions would then be missed if a crash occurs.
    // No sync is needed after the mark file is removed, because a mark file
    // that reappears on startup is tolerated.
    let timelines_dir = conf.timelines_path(&tenant_id);
    crashsafe::fsync_async(timelines_dir)
        .await
        .context("fsync_pre_mark_remove")?;

    // Finally, the delete mark.
    let mark_file = conf.timeline_delete_mark_file_path(tenant_id, timeline_id);
    tokio::fs::remove_file(mark_file)
        .await
        .context("remove delete mark")
}
/// Drops the timeline from the tenant's in-memory `timelines` map.
///
/// Must be called while the DeletionGuard is held; the guard parameter exists only
/// to prove that. For more context see comments in [`DeleteTimelineFlow::prepare`].
///
/// Panics if another timeline names this one as its ancestor, or if the timeline
/// is no longer in the map.
async fn remove_timeline_from_tenant(
    tenant: &Tenant,
    timeline_id: TimelineId,
    _: &DeletionGuard, // using it as a witness
) -> anyhow::Result<()> {
    {
        // The map stays locked for the whole check-and-remove sequence.
        let mut timelines = tenant.timelines.lock().unwrap();

        let children_exist = timelines
            .iter()
            .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
        // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
        // We already deleted the layer files, so it's probably best to panic.
        // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
        assert!(
            !children_exist,
            "Timeline grew children while we removed layer files"
        );

        timelines
            .remove(&timeline_id)
            .expect("timeline that we were deleting was concurrently removed from 'timelines' map");
    }

    Ok(())
}
/// Orchestrates timeline shut down of all timeline tasks, removes its in-memory structures,
/// and deletes its data from both disk and s3.
/// The sequence of steps:
/// 1. Set deleted_at in remote index part.
/// 2. Create local mark file.
/// 3. Delete local files except metadata (it is simpler this way, to be able to reuse timeline initialization code that expects metadata)
/// 4. Delete remote layers
/// 5. Delete index part
/// 6. Delete meta, timeline directory
/// 7. Delete mark file
/// It is resumable from any step in case a crash/restart occurs.
/// There are three entrypoints to the process:
/// 1. [`DeleteTimelineFlow::run`] this is the main one called by a management api handler.
/// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present
/// and we possibly need to continue deletion of remote files.
/// 3. [`DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`] is used when we deleted remote
/// index but still have local metadata, timeline directory and delete mark.
/// Note the only other place that messes around timeline delete mark is the logic that scans directory with timelines during tenant load.
#[derive(Default)]
pub enum DeleteTimelineFlow {
// Initial (default) state.
#[default]
NotStarted,
// Set by `mark_in_progress`, both on a fresh start and on a retry.
InProgress,
// Set at the very end of `background`, after the timeline was removed from the tenant.
Finished,
}
impl DeleteTimelineFlow {
/// Main entry point for deleting a timeline.
///
/// These steps are run in the context of management api request handler.
/// Long running steps are continued to run in the background.
/// NB: If this fails half-way through, and is retried, the retry will go through
/// all the same steps again. Make sure the code here is idempotent, and don't
/// error out if some of the shutdown tasks have already been completed!
///
/// When `inplace` is true the remaining steps (`Self::background`) are awaited
/// right here and their error is returned to the caller; otherwise they are
/// handed to `Self::schedule_background` and this function returns without
/// waiting for them.
#[instrument(skip(tenant), fields(tenant_id=%tenant.tenant_id))]
pub async fn run(
tenant: &Arc<Tenant>,
timeline_id: TimelineId,
inplace: bool,
) -> Result<(), DeleteTimelineError> {
// Takes the deletion guard and moves the timeline into the Stopping state.
let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?;
guard.mark_in_progress()?;
stop_tasks(&timeline).await?;
set_deleted_in_remote_index(&timeline).await?;
create_delete_mark(tenant.conf, timeline.tenant_id, timeline.timeline_id).await?;
fail::fail_point!("timeline-delete-before-schedule", |_| {
Err(anyhow::anyhow!(
"failpoint: timeline-delete-before-schedule"
))?
});
if inplace {
Self::background(guard, tenant.conf, tenant, &timeline).await?
} else {
Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
}
Ok(())
}
/// Moves the flow into `InProgress`.
///
/// Accepted from `NotStarted` (fresh start) and from `InProgress` (retry);
/// being called on a `Finished` flow is a bug and is reported as an error.
fn mark_in_progress(&mut self) -> anyhow::Result<()> {
    if matches!(self, Self::Finished) {
        anyhow::bail!("Bug. Is in finished state");
    }

    *self = Self::InProgress;
    Ok(())
}
/// Shortcut to create Timeline in stopping state and spawn deletion task.
/// See corresponding parts of [`crate::tenant::delete::DeleteTenantFlow`]
///
/// Builds a timeline struct from `local_metadata`, registers it in the tenant's
/// timelines map, (re)creates the delete mark and schedules the background part
/// of the deletion.
#[instrument(skip_all, fields(%timeline_id))]
pub async fn resume_deletion(
tenant: Arc<Tenant>,
timeline_id: TimelineId,
local_metadata: &TimelineMetadata,
remote_client: Option<RemoteTimelineClient>,
init_order: Option<&InitializationOrder>,
) -> anyhow::Result<()> {
// Note: here we even skip populating layer map. Timeline is essentially uninitialized.
// RemoteTimelineClient is the only functioning part.
let timeline = tenant
.create_timeline_struct(
timeline_id,
local_metadata,
None, // Ancestor is not needed for deletion.
remote_client,
init_order,
// Important. We don't pass ancestor above because it can be missing.
// Thus we need to skip the validation here.
CreateTimelineCause::Delete,
)
.context("create_timeline_struct")?;
let mut guard = DeletionGuard(
Arc::clone(&timeline.delete_progress)
.try_lock_owned()
.expect("cannot happen because we're the only owner"),
);
// We need to do this because when console retries delete request we shouldn't answer with 404
// because 404 means successful deletion.
{
let mut locked = tenant.timelines.lock().unwrap();
locked.insert(timeline_id, Arc::clone(&timeline));
}
guard.mark_in_progress()?;
// Note that delete mark can be missing on resume
// because we create delete mark after we set deleted_at in the index part.
create_delete_mark(tenant.conf, tenant.tenant_id, timeline_id).await?;
Self::schedule_background(guard, tenant.conf, tenant, timeline);
Ok(())
}
/// Instrumented wrapper around the free function of the same name: removes the
/// metadata file, the timeline directory and the delete mark of `timeline_id`,
/// logs "Done" regardless of the outcome, and hands the outcome back to the caller.
#[instrument(skip_all, fields(%timeline_id))]
pub async fn cleanup_remaining_timeline_fs_traces(
    tenant: &Tenant,
    timeline_id: TimelineId,
) -> anyhow::Result<()> {
    let conf = tenant.conf;
    let tenant_id = tenant.tenant_id;
    let outcome = cleanup_remaining_timeline_fs_traces(conf, tenant_id, timeline_id).await;
    info!("Done");
    outcome
}
fn prepare(
tenant: &Tenant,
timeline_id: TimelineId,
) -> Result<(Arc<Timeline>, DeletionGuard), DeleteTimelineError> {
// Note the interaction between this guard and deletion guard.
// Here we attempt to lock deletion guard when we're holding a lock on timelines.
// This is important because when you take into account `remove_timeline_from_tenant`
// we remove timeline from memory when we still hold the deletion guard.
// So here when timeline deletion is finished timeline wont be present in timelines map at all
// which makes the following sequence impossible:
// T1: get preempted right before the try_lock on `Timeline::delete_progress`
// T2: do a full deletion, acquire and drop `Timeline::delete_progress`
// T1: acquire deletion lock, do another `DeleteTimelineFlow::run`
// For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346`
let timelines = tenant.timelines.lock().unwrap();
let timeline = match timelines.get(&timeline_id) {
Some(t) => t,
None => return Err(DeleteTimelineError::NotFound),
};
// Ensure that there are no child timelines **attached to that pageserver**,
// because detach removes files, which will break child branches
let children: Vec<TimelineId> = timelines
.iter()
.filter_map(|(id, entry)| {
if entry.get_ancestor_timeline_id() == Some(timeline_id) {
Some(*id)
} else {
None
}
})
.collect();
if !children.is_empty() {
return Err(DeleteTimelineError::HasChildren(children));
}
// Note that using try_lock here is important to avoid a deadlock.
// Here we take lock on timelines and then the deletion guard.
// At the end of the operation we're holding the guard and need to lock timelines map
// to remove the timeline from it.
// Always if you have two locks that are taken in different order this can result in a deadlock.
let delete_progress = Arc::clone(&timeline.delete_progress);
let delete_lock_guard = match delete_progress.try_lock_owned() {
Ok(guard) => DeletionGuard(guard),
Err(_) => {
// Unfortunately if lock fails arc is consumed.
return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone(
&timeline.delete_progress,
)));
}
};
timeline.set_state(TimelineState::Stopping);
Ok((Arc::clone(timeline), delete_lock_guard))
}
/// Spawns `Self::background` as a `TaskKind::TimelineDeletionWorker` task named
/// "timeline_delete" on the background runtime.
///
/// If the background part fails, the error is logged and the timeline is marked
/// broken with the error text; the spawned task itself always returns `Ok(())`.
fn schedule_background(
guard: DeletionGuard,
conf: &'static PageServerConf,
tenant: Arc<Tenant>,
timeline: Arc<Timeline>,
) {
let tenant_id = timeline.tenant_id;
let timeline_id = timeline.timeline_id;
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::TimelineDeletionWorker,
Some(tenant_id),
Some(timeline_id),
"timeline_delete",
// NOTE(review): meaning of this flag is defined by task_mgr::spawn, not visible here — confirm.
false,
async move {
if let Err(err) = Self::background(guard, conf, &tenant, &timeline).await {
error!("Error: {err:#}");
timeline.set_broken(format!("{err:#}"))
};
Ok(())
}
.instrument({
// A fresh root span (parent: None) that is linked to the current span via follows_from.
let span =
tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_id, timeline_id=%timeline_id);
span.follows_from(Span::current());
span
}),
);
}
/// The long-running part of the deletion, executed while holding the deletion guard:
/// delete local layer files, delete remote layers and index, remove the remaining
/// local traces (metadata, timeline dir, delete mark), remove the timeline from the
/// tenant's map, and finally mark the flow as `Finished`.
///
/// Any failing step aborts the sequence and leaves the flow state unchanged.
async fn background(
mut guard: DeletionGuard,
conf: &PageServerConf,
tenant: &Tenant,
timeline: &Timeline,
) -> Result<(), DeleteTimelineError> {
delete_local_layer_files(conf, tenant.tenant_id, timeline).await?;
delete_remote_layers_and_index(timeline).await?;
pausable_failpoint!("in_progress_delete");
cleanup_remaining_timeline_fs_traces(conf, tenant.tenant_id, timeline.timeline_id).await?;
// The guard is passed as a witness that the deletion lock is held.
remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;
*guard = Self::Finished;
Ok(())
}
pub(crate) fn is_finished(&self) -> bool {
matches!(self, Self::Finished)
}
}
/// Owned lock guard over a timeline's [`DeleteTimelineFlow`] state.
///
/// Holding a value of this type shows that its holder has locked the timeline's
/// `delete_progress`; it dereferences to the guarded flow state.
struct DeletionGuard(OwnedMutexGuard<DeleteTimelineFlow>);

impl Deref for DeletionGuard {
    type Target = DeleteTimelineFlow;

    fn deref(&self) -> &Self::Target {
        let DeletionGuard(inner) = self;
        inner
    }
}

impl DerefMut for DeletionGuard {
    fn deref_mut(&mut self) -> &mut Self::Target {
        let DeletionGuard(inner) = self;
        inner
    }
}

View File

@@ -78,6 +78,9 @@ impl Timeline {
#[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))]
async fn eviction_task(self: Arc<Self>, cancel: CancellationToken) {
scopeguard::defer! {
info!("eviction task finishing");
}
use crate::tenant::tasks::random_init_delay;
{
let policy = self.get_eviction_policy();
@@ -305,13 +308,8 @@ impl Timeline {
ctx: &RequestContext,
) -> ControlFlow<()> {
let mut state = self.eviction_task_timeline_state.lock().await;
// Only do the imitate_layer accesses approximately as often as the threshold. A little
// more frequently, to avoid this period racing with the threshold/period-th eviction iteration.
let inter_imitate_period = p.threshold.checked_sub(p.period).unwrap_or(p.threshold);
match state.last_layer_access_imitation {
Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
Some(ts) if ts.elapsed() < p.threshold => { /* no need to run */ }
_ => {
self.imitate_timeline_cached_layer_accesses(cancel, ctx)
.await;
@@ -334,7 +332,7 @@ impl Timeline {
};
let mut state = tenant.eviction_task_tenant_state.lock().await;
match state.last_layer_access_imitation {
Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
Some(ts) if ts.elapsed() < p.threshold => { /* no need to run */ }
_ => {
self.imitate_synthetic_size_calculation_worker(&tenant, ctx, cancel)
.await;

View File

@@ -120,9 +120,10 @@ impl LayerManager {
ensure!(
lsn > last_record_lsn,
"cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})",
"cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})\n{}",
lsn,
last_record_lsn,
std::backtrace::Backtrace::force_capture(),
);
// Do we have a layer open for writing already?
@@ -163,7 +164,7 @@ impl LayerManager {
}
/// Called from `freeze_inmem_layer`, returns true if successfully frozen.
pub async fn try_freeze_in_memory_layer(
pub fn try_freeze_in_memory_layer(
&mut self,
Lsn(last_record_lsn): Lsn,
last_freeze_at: &AtomicLsn,
@@ -173,7 +174,7 @@ impl LayerManager {
if let Some(open_layer) = &self.layer_map.open_layer {
let open_layer_rc = Arc::clone(open_layer);
// Does this layer need freezing?
open_layer.freeze(end_lsn).await;
open_layer.freeze(end_lsn);
// The layer is no longer open, update the layer map to reflect this.
// We will replace it with on-disk historics below.
@@ -277,7 +278,7 @@ impl LayerManager {
updates: &mut BatchedUpdates<'_>,
mapping: &mut LayerFileManager,
) {
updates.remove_historic(layer.layer_desc());
updates.remove_historic(layer.layer_desc().clone());
mapping.remove(layer);
}
@@ -291,10 +292,10 @@ impl LayerManager {
metrics: &TimelineMetrics,
mapping: &mut LayerFileManager,
) -> anyhow::Result<()> {
let desc = layer.layer_desc();
if !layer.is_remote_layer() {
layer.delete_resident_layer_file()?;
metrics.resident_physical_size_gauge.sub(desc.file_size);
let layer_file_size = layer.file_size();
metrics.resident_physical_size_gauge.sub(layer_file_size);
}
// TODO Removing from the bottom of the layer map is expensive.
@@ -302,7 +303,7 @@ impl LayerManager {
// won't be needed for page reconstruction for this timeline,
// and mark what we can't delete yet as deleted from the layer
// map index without actually rebuilding the index.
updates.remove_historic(desc);
updates.remove_historic(layer.layer_desc().clone());
mapping.remove(layer);
Ok(())

View File

@@ -2,9 +2,13 @@ use std::{collections::hash_map::Entry, fs, path::PathBuf, sync::Arc};
use anyhow::Context;
use tracing::{error, info, info_span, warn};
use utils::{crashsafe, fs_ext, id::TimelineId, lsn::Lsn};
use utils::{crashsafe, id::TimelineId, lsn::Lsn};
use crate::{context::RequestContext, import_datadir, tenant::Tenant};
use crate::{
context::RequestContext,
import_datadir,
tenant::{ignore_absent_files, Tenant},
};
use super::Timeline;
@@ -137,7 +141,7 @@ impl Drop for UninitializedTimeline<'_> {
pub(crate) fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) {
let timeline_path = &uninit_mark.timeline_path;
match fs_ext::ignore_absent_files(|| fs::remove_dir_all(timeline_path)) {
match ignore_absent_files(|| fs::remove_dir_all(timeline_path)) {
Ok(()) => {
info!("Timeline dir {timeline_path:?} removed successfully, removing the uninit mark")
}
@@ -181,7 +185,7 @@ impl TimelineUninitMark {
let uninit_mark_parent = uninit_mark_file
.parent()
.with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?;
fs_ext::ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| {
ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| {
format!("Failed to remove uninit mark file at path {uninit_mark_file:?}")
})?;
crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?;

View File

@@ -31,19 +31,14 @@ use storage_broker::Streaming;
use tokio::select;
use tracing::*;
use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
use postgres_connection::{parse_host_port, PgConnectionConfig};
use utils::backoff::{
exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
};
use utils::{
id::{NodeId, TenantTimelineId},
lsn::Lsn,
};
use super::{
walreceiver_connection::WalConnectionStatus, walreceiver_connection::WalReceiverError,
TaskEvent, TaskHandle,
};
use super::{walreceiver_connection::WalConnectionStatus, TaskEvent, TaskHandle};
/// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker.
/// Based on the updates, decides whether to start, keep or stop a WAL receiver task.
@@ -424,19 +419,13 @@ impl ConnectionManagerState {
match res {
Ok(()) => Ok(()),
Err(e) => {
match e {
WalReceiverError::SuccessfulCompletion(msg) => {
info!("walreceiver connection handling ended with success: {msg}");
Ok(())
}
WalReceiverError::ExpectedSafekeeperError(e) => {
info!("walreceiver connection handling ended: {e}");
Ok(())
}
WalReceiverError::Other(e) => {
// give out an error to have task_mgr give it a really verbose logging
Err(e).context("walreceiver connection handling failure")
}
use super::walreceiver_connection::ExpectedError;
if e.is_expected() {
info!("walreceiver connection handling ended: {e:#}");
Ok(())
} else {
// give out an error to have task_mgr give it a really verbose logging
Err(e).context("walreceiver connection handling failure")
}
}
}
@@ -1134,7 +1123,7 @@ mod tests {
}
#[tokio::test]
async fn lsn_wal_over_threshold_current_candidate() -> anyhow::Result<()> {
async fn lsn_wal_over_threshhold_current_candidate() -> anyhow::Result<()> {
let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate")?;
let mut state = dummy_state(&harness).await;
let current_lsn = Lsn(100_000).align();
@@ -1200,8 +1189,8 @@ mod tests {
}
#[tokio::test]
async fn timeout_connection_threshold_current_candidate() -> anyhow::Result<()> {
let harness = TenantHarness::create("timeout_connection_threshold_current_candidate")?;
async fn timeout_connection_threshhold_current_candidate() -> anyhow::Result<()> {
let harness = TenantHarness::create("timeout_connection_threshhold_current_candidate")?;
let mut state = dummy_state(&harness).await;
let current_lsn = Lsn(100_000).align();
let now = Utc::now().naive_utc();
@@ -1263,8 +1252,8 @@ mod tests {
}
#[tokio::test]
async fn timeout_wal_over_threshold_current_candidate() -> anyhow::Result<()> {
let harness = TenantHarness::create("timeout_wal_over_threshold_current_candidate")?;
async fn timeout_wal_over_threshhold_current_candidate() -> anyhow::Result<()> {
let harness = TenantHarness::create("timeout_wal_over_threshhold_current_candidate")?;
let mut state = dummy_state(&harness).await;
let current_lsn = Lsn(100_000).align();
let new_lsn = Lsn(100_100).align();

View File

@@ -8,14 +8,14 @@ use std::{
time::{Duration, SystemTime},
};
use anyhow::{anyhow, Context};
use anyhow::{bail, ensure, Context};
use bytes::BytesMut;
use chrono::{NaiveDateTime, Utc};
use fail::fail_point;
use futures::StreamExt;
use postgres::{error::SqlState, SimpleQueryMessage, SimpleQueryRow};
use postgres_ffi::v14::xlog_utils::normalize_lsn;
use postgres_ffi::WAL_SEGMENT_SIZE;
use postgres_ffi::{v14::xlog_utils::normalize_lsn, waldecoder::WalDecodeError};
use postgres_protocol::message::backend::ReplicationMessage;
use postgres_types::PgLsn;
use tokio::{select, sync::watch, time};
@@ -60,50 +60,6 @@ pub(super) struct WalConnectionStatus {
pub node: NodeId,
}
/// Classification of the ways a walreceiver connection can end with an "error".
///
/// The variants separate conditions that are not real problems (a safekeeper
/// connection closing, or a "successful completion" notice) from genuine failures.
pub(super) enum WalReceiverError {
    /// An error of a type that does not indicate an issue, e.g. a connection closing
    ExpectedSafekeeperError(postgres::Error),
    /// An "error" message that carries a SUCCESSFUL_COMPLETION status code. Carries
    /// the message part of the original postgres error
    SuccessfulCompletion(String),
    /// Generic error
    Other(anyhow::Error),
}
impl From<tokio_postgres::Error> for WalReceiverError {
fn from(err: tokio_postgres::Error) -> Self {
if let Some(dberror) = err.as_db_error().filter(|db_error| {
db_error.code() == &SqlState::SUCCESSFUL_COMPLETION
&& db_error.message().contains("ending streaming")
}) {
// Strip the outer DbError, which carries a misleading "error" severity
Self::SuccessfulCompletion(dberror.message().to_string())
} else if err.is_closed()
|| err
.source()
.and_then(|source| source.downcast_ref::<std::io::Error>())
.map(is_expected_io_error)
.unwrap_or(false)
{
Self::ExpectedSafekeeperError(err)
} else {
Self::Other(anyhow::Error::new(err))
}
}
}
/// Any `anyhow::Error` is treated as a generic failure.
impl From<anyhow::Error> for WalReceiverError {
    /// Wraps `err` unchanged in the `Other` variant.
    fn from(err: anyhow::Error) -> Self {
        Self::Other(err)
    }
}
/// WAL decoding failures are treated as generic failures.
impl From<WalDecodeError> for WalReceiverError {
    /// Converts `err` into an `anyhow::Error` and wraps it in the `Other` variant.
    fn from(err: WalDecodeError) -> Self {
        Self::Other(anyhow::Error::new(err))
    }
}
/// Open a connection to the given safekeeper and receive WAL, sending back progress
/// messages as we go.
pub(super) async fn handle_walreceiver_connection(
@@ -114,7 +70,7 @@ pub(super) async fn handle_walreceiver_connection(
connect_timeout: Duration,
ctx: RequestContext,
node: NodeId,
) -> Result<(), WalReceiverError> {
) -> anyhow::Result<()> {
debug_assert_current_span_has_tenant_and_timeline_id();
WALRECEIVER_STARTED_CONNECTIONS.inc();
@@ -174,15 +130,11 @@ pub(super) async fn handle_walreceiver_connection(
connection_result = connection => match connection_result {
Ok(()) => debug!("Walreceiver db connection closed"),
Err(connection_error) => {
match WalReceiverError::from(connection_error) {
WalReceiverError::ExpectedSafekeeperError(_) => {
// silence, because most likely we've already exited the outer call
// with a similar error.
},
WalReceiverError::SuccessfulCompletion(_) => {}
WalReceiverError::Other(err) => {
warn!("Connection aborted: {err:#}")
}
if connection_error.is_expected() {
// silence, because most likely we've already exited the outer call
// with a similar error.
} else {
warn!("Connection aborted: {connection_error:#}")
}
}
},
@@ -228,7 +180,7 @@ pub(super) async fn handle_walreceiver_connection(
let mut startpoint = last_rec_lsn;
if startpoint == Lsn(0) {
return Err(WalReceiverError::Other(anyhow!("No previous WAL position")));
bail!("No previous WAL position");
}
// There might be some padding after the last full record, skip it.
@@ -310,9 +262,7 @@ pub(super) async fn handle_walreceiver_connection(
// It is important to deal with the aligned records as lsn in getPage@LSN is
// aligned and can be several bytes bigger. Without this alignment we are
// at risk of hitting a deadlock.
if !lsn.is_aligned() {
return Err(WalReceiverError::Other(anyhow!("LSN not aligned")));
}
ensure!(lsn.is_aligned());
walingest
.ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx)
@@ -469,3 +419,51 @@ async fn identify_system(client: &mut Client) -> anyhow::Result<IdentifySystem>
Err(IdentifyError.into())
}
}
/// Trait used to avoid reporting walreceiver-specific expected ("normal", "ok") errors.
pub(super) trait ExpectedError {
    /// Test if this error is an ok error.
    ///
    /// We don't want to report connectivity problems as real errors towards the connection
    /// manager because
    /// 1. they happen frequently enough to make server logs hard to read and
    /// 2. the connection manager can retry with another safekeeper.
    ///
    /// If this function returns `true`, it's such an error.
    /// The caller should log it at info level and then report to the connection manager that
    /// we're done handling this connection.
    /// The connection manager will then handle reconnections.
    ///
    /// If this function returns `false`, the error should be propagated and the connection
    /// manager will log the error at ERROR level.
    fn is_expected(&self) -> bool;
}
impl ExpectedError for postgres::Error {
    /// A postgres error is expected when any of the following holds:
    /// the connection is closed, the error's source is an I/O error accepted by
    /// `is_expected_io_error`, or it is a database error with the
    /// SUCCESSFUL_COMPLETION code whose message mentions "ending streaming".
    fn is_expected(&self) -> bool {
        if self.is_closed() {
            return true;
        }

        let source_is_expected_io = self
            .source()
            .and_then(|source| source.downcast_ref::<std::io::Error>())
            .map_or(false, is_expected_io_error);
        if source_is_expected_io {
            return true;
        }

        match self.as_db_error() {
            Some(db_error) => {
                db_error.code() == &SqlState::SUCCESSFUL_COMPLETION
                    && db_error.message().contains("ending streaming")
            }
            None => false,
        }
    }
}
impl ExpectedError for anyhow::Error {
fn is_expected(&self) -> bool {
let head = self.downcast_ref::<postgres::Error>();
let tail = self
.chain()
.filter_map(|e| e.downcast_ref::<postgres::Error>());
// check if self or any of the chained/sourced errors are expected
head.into_iter().chain(tail).any(|e| e.is_expected())
}
}

View File

@@ -53,9 +53,6 @@ pub struct VirtualFile {
pub path: PathBuf,
open_options: OpenOptions,
// These are strings becase we only use them for metrics, and those expect strings.
// It makes no sense for us to constantly turn the `TimelineId` and `TenantId` into
// strings.
tenant_id: String,
timeline_id: String,
}
@@ -152,10 +149,12 @@ impl OpenFiles {
// old file.
//
if let Some(old_file) = slot_guard.file.take() {
// the normal path of dropping VirtualFile uses "close", use "close-by-replace" here to
// distinguish the two.
// We do not have information about tenant_id/timeline_id of evicted file.
// It is possible to store path together with file or use filepath crate,
// but as far as close() is not expected to be fast, it is not so critical to gather
// precise per-tenant statistic here.
STORAGE_IO_TIME
.with_label_values(&["close-by-replace"])
.with_label_values(&["close", "-", "-"])
.observe_closure_duration(|| drop(old_file));
}
@@ -209,7 +208,7 @@ impl VirtualFile {
}
let (handle, mut slot_guard) = get_open_files().find_victim_slot();
let file = STORAGE_IO_TIME
.with_label_values(&["open"])
.with_label_values(&["open", &tenant_id, &timeline_id])
.observe_closure_duration(|| open_options.open(path))?;
// Strip all options other than read and write.
@@ -272,7 +271,7 @@ impl VirtualFile {
// Found a cached file descriptor.
slot.recently_used.store(true, Ordering::Relaxed);
return Ok(STORAGE_IO_TIME
.with_label_values(&[op])
.with_label_values(&[op, &self.tenant_id, &self.timeline_id])
.observe_closure_duration(|| func(file)));
}
}
@@ -299,12 +298,12 @@ impl VirtualFile {
// Open the physical file
let file = STORAGE_IO_TIME
.with_label_values(&["open"])
.with_label_values(&["open", &self.tenant_id, &self.timeline_id])
.observe_closure_duration(|| self.open_options.open(&self.path))?;
// Perform the requested operation on it
let result = STORAGE_IO_TIME
.with_label_values(&[op])
.with_label_values(&[op, &self.tenant_id, &self.timeline_id])
.observe_closure_duration(|| func(&file));
// Store the File in the slot and update the handle in the VirtualFile
@@ -334,11 +333,13 @@ impl Drop for VirtualFile {
let mut slot_guard = slot.inner.write().unwrap();
if slot_guard.tag == handle.tag {
slot.recently_used.store(false, Ordering::Relaxed);
// there is also operation "close-by-replace" for closes done on eviction for
// comparison.
// Unlike files evicted by replacement algorithm, here
// we group close time by tenant_id/timeline_id.
// At allows to compare number/time of "normal" file closes
// with file eviction.
STORAGE_IO_TIME
.with_label_values(&["close"])
.observe_closure_duration(|| drop(slot_guard.file.take()));
.with_label_values(&["close", &self.tenant_id, &self.timeline_id])
.observe_closure_duration(|| slot_guard.file.take());
}
}
}

View File

@@ -312,7 +312,7 @@ impl<'a> WalIngest<'a> {
// particular point in the WAL. For more fine-grained control,
// we could peek into the message and only pause if it contains
// a particular string, for example, but this is enough for now.
crate::failpoint_support::sleep_millis_async!("wal-ingest-logical-message-sleep");
utils::failpoint_sleep_millis_async!("wal-ingest-logical-message-sleep");
}
}

View File

@@ -4,7 +4,6 @@
MODULE_big = neon
OBJS = \
$(WIN32RES) \
extension_server.o \
file_cache.o \
libpagestore.o \
libpqwalproposer.o \

View File

@@ -1,103 +0,0 @@
/*-------------------------------------------------------------------------
*
* extension_server.c
* Request compute_ctl to download extension files.
*
* IDENTIFICATION
* contrib/neon/extension_server.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "tcop/pquery.h"
#include "tcop/utility.h"
#include "access/xact.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"
#include "commands/defrem.h"
#include "miscadmin.h"
#include "utils/acl.h"
#include "fmgr.h"
#include "utils/guc.h"
#include "port.h"
#include "fmgr.h"
#include <curl/curl.h>
static int extension_server_port = 0;
static download_extension_file_hook_type prev_download_extension_file_hook = NULL;
// to download all SQL (and data) files for an extension:
// curl -X POST http://localhost:8080/extension_server/postgis
// it covers two possible extension files layouts:
// 1. extension_name--version--platform.sql
// 2. extension_name/extension_name--version.sql
// extension_name/extra_files.csv
//
// to download specific library file:
// curl -X POST http://localhost:8080/extension_server/postgis-3.so?is_library=true
/*
 * Ask compute_ctl to download an extension file by sending an HTTP POST to
 * http://localhost:<neon.extension_server_port>/extension_server/<filename>.
 *
 * filename    name of the extension (or library file) to request
 * is_library  when true, "?is_library=true" is appended to the request URL
 *
 * Returns true when the request completed without a curl-level error and
 * false otherwise. Raises ERROR if a curl handle cannot be created.
 *
 * NOTE(review): CURLOPT_FAILONERROR is not set, so an HTTP error status from
 * compute_ctl presumably still counts as success here -- confirm that this
 * is intended.
 */
static bool
neon_download_extension_file_http(const char *filename, bool is_library)
{
    CURL       *curl;
    CURLcode    res;
    char       *compute_ctl_url;
    bool        ret = false;

    /* elog(ERROR) does not return, so curl is non-NULL past this point */
    if ((curl = curl_easy_init()) == NULL)
    {
        elog(ERROR, "Failed to initialize curl handle");
    }

    compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s",
                               extension_server_port, filename, is_library ? "?is_library=true" : "");

    elog(LOG, "Sending request to compute_ctl: %s", compute_ctl_url);

    curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "POST");
    curl_easy_setopt(curl, CURLOPT_URL, compute_ctl_url);
    curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3L /* seconds */);

    /* Perform the request, res will get the return code */
    res = curl_easy_perform(curl);

    /* Check for errors */
    if (res == CURLE_OK)
    {
        ret = true;
    }
    else
    {
        /*
         * Don't error here because postgres will try to find the file
         * and will fail with some proper error message if it's not found.
         */
        elog(WARNING, "neon_download_extension_file_http failed: %s", curl_easy_strerror(res));
    }

    /* always cleanup */
    curl_easy_cleanup(curl);

    return ret;
}
/*
 * Register the neon.extension_server_port GUC and install
 * neon_download_extension_file_http as the download_extension_file_hook.
 *
 * The previous hook value is saved in prev_download_extension_file_hook.
 */
void
pg_init_extension_server(void)
{
    /*
     * Port to connect to compute_ctl on localhost
     * to request extension files.
     */
    DefineCustomIntVariable("neon.extension_server_port",
                            "port of the compute_ctl server used to download extension files",
                            NULL,
                            &extension_server_port,
                            0, 0, INT_MAX,
                            PGC_POSTMASTER,
                            0,  /* no flags required */
                            NULL, NULL, NULL);

    /* set download_extension_file_hook */
    prev_download_extension_file_hook = download_extension_file_hook;
    download_extension_file_hook = neon_download_extension_file_http;
}

View File

@@ -172,7 +172,7 @@ lfc_change_limit_hook(int newval, void *extra)
{
lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT);
if (lfc_desc < 0) {
elog(WARNING, "Failed to open file cache %s: %m, disabling file cache", lfc_path);
elog(LOG, "Failed to open file cache %s: %m", lfc_path);
lfc_size_limit = 0; /* disable file cache */
return;
}
@@ -557,7 +557,7 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
Assert(victim->access_count == 0);
entry->offset = victim->offset; /* grab victim's chunk */
hash_search(lfc_hash, &victim->key, HASH_REMOVE, NULL);
elog(DEBUG2, "Swap file cache page");
elog(LOG, "Swap file cache page");
}
else
{
@@ -574,7 +574,7 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
{
lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT);
if (lfc_desc < 0) {
elog(WARNING, "Failed to open file cache %s: %m, disabling file cache", lfc_path);
elog(LOG, "Failed to open file cache %s: %m", lfc_path);
lfc_size_limit = 0; /* disable file cache */
}
}
@@ -583,7 +583,7 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t)entry->offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
if (rc != BLCKSZ)
{
elog(WARNING, "Failed to write file cache: %m, disabling file cache");
elog(INFO, "Failed to write file cache: %m");
lfc_size_limit = 0; /* disable file cache */
}
}

View File

@@ -74,7 +74,7 @@ walprop_connect_start(char *conninfo, char *password)
if (password)
{
keywords[n] = "password";
values[n] = password;
values[n] = neon_auth_token;
n++;
}
keywords[n] = "dbname";
@@ -292,7 +292,7 @@ walprop_async_read(WalProposerConn *conn, char **buf, int *amount)
/*
* The docs for PQgetCopyData list the return values as: 0 if the copy is
* still in progress, but no "complete row" is available -1 if the copy is
* done -2 if an error occurred (> 0) if it was successful; that value is
* done -2 if an error occured (> 0) if it was successful; that value is
* the amount transferred.
*
* The protocol we use between walproposer and safekeeper means that we
@@ -353,7 +353,7 @@ walprop_async_write(WalProposerConn *conn, void const *buf, size_t size)
/*
* The docs for PQputcopyData list the return values as: 1 if the data was
* queued, 0 if it was not queued because of full buffers, or -1 if an
* error occurred
* error occured
*/
result = PQputCopyData(conn->pg_conn, buf, size);

View File

@@ -35,11 +35,8 @@ _PG_init(void)
{
pg_init_libpagestore();
pg_init_walproposer();
InitControlPlaneConnector();
pg_init_extension_server();
// Important: This must happen after other parts of the extension
// are loaded, otherwise any settings to GUCs that were set before
// the extension was loaded will be removed.

View File

@@ -21,8 +21,6 @@ extern char *neon_tenant;
extern void pg_init_libpagestore(void);
extern void pg_init_walproposer(void);
extern void pg_init_extension_server(void);
/*
* Returns true if we shouldn't do REDO on that block in record indicated by
* block_id; false otherwise.

View File

@@ -163,7 +163,6 @@ static void nwp_register_gucs(void);
static void nwp_prepare_shmem(void);
static uint64 backpressure_lag_impl(void);
static bool backpressure_throttling_impl(void);
static uint64 measure_replication_lag(void);
static process_interrupts_callback_t PrevProcessInterruptsCallback;
static shmem_startup_hook_type prev_shmem_startup_hook_type;
@@ -172,8 +171,6 @@ static shmem_request_hook_type prev_shmem_request_hook = NULL;
static void walproposer_shmem_request(void);
#endif
static bool check_replication_lag;
void
pg_init_walproposer(void)
{
@@ -791,7 +788,7 @@ ReconnectSafekeepers(void)
/*
* Performs the logic for advancing the state machine of the specified safekeeper,
* given that a certain set of events has occurred.
* given that a certain set of events has occured.
*/
static void
AdvancePollState(Safekeeper *sk, uint32 events)
@@ -1396,22 +1393,8 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec
char *err;
WalReceiverConn *wrconn;
WalRcvStreamOptions options;
char conninfo[MAXCONNINFO];
if (!neon_auth_token)
{
memcpy(conninfo, safekeeper[donor].conninfo, MAXCONNINFO);
}
else
{
int written = 0;
written = snprintf((char *) conninfo, MAXCONNINFO, "password=%s %s", neon_auth_token, safekeeper[donor].conninfo);
if (written > MAXCONNINFO || written < 0)
elog(FATAL, "could not append password to the safekeeper connection string");
}
wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err);
wrconn = walrcv_connect(safekeeper[donor].conninfo, false, "wal_proposer_recovery", &err);
if (!wrconn)
{
ereport(WARNING,
@@ -2495,45 +2478,37 @@ backpressure_lag_impl(void)
{
if (max_replication_apply_lag > 0 || max_replication_flush_lag > 0 || max_replication_write_lag > 0)
{
check_replication_lag = true;
return measure_replication_lag();
}
return 0;
}
static uint64
measure_replication_lag(void)
{
XLogRecPtr writePtr;
XLogRecPtr flushPtr;
XLogRecPtr applyPtr;
XLogRecPtr writePtr;
XLogRecPtr flushPtr;
XLogRecPtr applyPtr;
#if PG_VERSION_NUM >= 150000
XLogRecPtr myFlushLsn = GetFlushRecPtr(NULL);
XLogRecPtr myFlushLsn = GetFlushRecPtr(NULL);
#else
XLogRecPtr myFlushLsn = GetFlushRecPtr();
XLogRecPtr myFlushLsn = GetFlushRecPtr();
#endif
replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr);
replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr);
#define MB ((XLogRecPtr)1024 * 1024)
elog(DEBUG2, "current flushLsn %X/%X PageserverFeedback: write %X/%X flush %X/%X apply %X/%X",
LSN_FORMAT_ARGS(myFlushLsn),
LSN_FORMAT_ARGS(writePtr),
LSN_FORMAT_ARGS(flushPtr),
LSN_FORMAT_ARGS(applyPtr));
elog(DEBUG2, "current flushLsn %X/%X PageserverFeedback: write %X/%X flush %X/%X apply %X/%X",
LSN_FORMAT_ARGS(myFlushLsn),
LSN_FORMAT_ARGS(writePtr),
LSN_FORMAT_ARGS(flushPtr),
LSN_FORMAT_ARGS(applyPtr));
if ((writePtr != InvalidXLogRecPtr && max_replication_write_lag > 0 && myFlushLsn > writePtr + max_replication_write_lag * MB))
{
return (myFlushLsn - writePtr - max_replication_write_lag * MB);
}
if ((writePtr != InvalidXLogRecPtr && max_replication_write_lag > 0 && myFlushLsn > writePtr + max_replication_write_lag * MB))
{
return (myFlushLsn - writePtr - max_replication_write_lag * MB);
}
if ((flushPtr != InvalidXLogRecPtr && max_replication_flush_lag > 0 && myFlushLsn > flushPtr + max_replication_flush_lag * MB))
{
return (myFlushLsn - flushPtr - max_replication_flush_lag * MB);
}
if ((flushPtr != InvalidXLogRecPtr && max_replication_flush_lag > 0 && myFlushLsn > flushPtr + max_replication_flush_lag * MB))
{
return (myFlushLsn - flushPtr - max_replication_flush_lag * MB);
}
if ((applyPtr != InvalidXLogRecPtr && max_replication_apply_lag > 0 && myFlushLsn > applyPtr + max_replication_apply_lag * MB))
{
return (myFlushLsn - applyPtr - max_replication_apply_lag * MB);
if ((applyPtr != InvalidXLogRecPtr && max_replication_apply_lag > 0 && myFlushLsn > applyPtr + max_replication_apply_lag * MB))
{
return (myFlushLsn - applyPtr - max_replication_apply_lag * MB);
}
}
return 0;
}
@@ -2550,18 +2525,14 @@ backpressure_throttling_impl(void)
? PrevProcessInterruptsCallback()
: false;
/* Throttle onlhy backends writing WAL. */
if (!check_replication_lag)
/* Don't throttle read only transactions and wal sender. */
if (am_walsender || !TransactionIdIsValid(GetCurrentTransactionIdIfAny()))
return retry;
/* Calculate replication lag */
lag = measure_replication_lag();
/* Calculate replicas lag */
lag = backpressure_lag_impl();
if (lag == 0)
{
/* Do not measure replication lag before we writting something to the WAL */
check_replication_lag = false;
return retry;
}
/* Suspend writers until replicas catch up */
set_ps_display("backpressure throttling");

View File

@@ -23,7 +23,7 @@
* message header */
/*
* In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occurred,
* In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occured,
* because all WL_* events are given flags equal to some (1 << i), starting from i = 0
*/
#define WL_NO_EVENTS 0
@@ -317,7 +317,7 @@ typedef struct AppendResponse
/* this is a criterion for walproposer --sync mode exit */
XLogRecPtr commitLsn;
HotStandbyFeedback hs;
/* Feedback received from pageserver includes standby_status_update fields */
/* Feedback recieved from pageserver includes standby_status_update fields */
/* and custom neon feedback. */
/* This part of the message is extensible. */
PageserverFeedback rf;

View File

@@ -37,14 +37,68 @@ static XLogSegNo walpropSegNo = 0;
/* START cloned file-local variables and functions from walsender.c */
/*
* xlogreader used for replication. Note that a WAL sender doing physical
* replication does not need xlogreader to read WAL, but it needs one to
* keep a state of its work.
*/
static XLogReaderState *xlogreader = NULL;
/*
* These variables keep track of the state of the timeline we're currently
* sending. sendTimeLine identifies the timeline. If sendTimeLineIsHistoric,
* the timeline is not the latest timeline on this server, and the server's
* history forked off from that timeline at sendTimeLineValidUpto.
*/
static TimeLineID sendTimeLine = 0;
static TimeLineID sendTimeLineNextTLI = 0;
static bool sendTimeLineIsHistoric = false;
static XLogRecPtr sendTimeLineValidUpto = InvalidXLogRecPtr;
/*
* Timestamp of last ProcessRepliesIfAny() that saw a reply from the
* standby. Set to 0 if wal_sender_timeout doesn't need to be active.
*/
static TimestampTz last_reply_timestamp = 0;
/* Have we sent a heartbeat message asking for reply, since last reply? */
static bool waiting_for_ping_response = false;
static bool streamingDoneSending;
static bool streamingDoneReceiving;
/* Are we there yet? */
static bool WalSndCaughtUp = false;
/* Flags set by signal handlers for later service in main loop */
static volatile sig_atomic_t got_STOPPING = false;
/*
* How far have we sent WAL already? This is also advertised in
* MyWalSnd->sentPtr. (Actually, this is the next WAL location to send.)
*/
static XLogRecPtr sentPtr = InvalidXLogRecPtr;
static void WalSndLoop(void);
static void XLogBroadcastWalProposer(void);
/*
* This is set while we are streaming. When not set
* PROCSIG_WALSND_INIT_STOPPING signal will be handled like SIGTERM. When set,
* the main loop is responsible for checking got_STOPPING and terminating when
* it's set (after streaming any remaining WAL).
*/
static volatile sig_atomic_t replication_active = false;
typedef void (*WalSndSendDataCallback) (void);
static void WalSndLoop(WalSndSendDataCallback send_data);
static void XLogSendPhysical(void);
#if PG_VERSION_NUM >= 150000
static XLogRecPtr GetStandbyFlushRecPtr(TimeLineID *tli);
#else
static XLogRecPtr GetStandbyFlushRecPtr(void);
#endif
static void WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo,
TimeLineID *tli_p);
/* END cloned file-level variables and functions from walsender.c */
int
@@ -452,7 +506,7 @@ XLogWalPropClose(XLogRecPtr recptr)
/* START of cloned functions from walsender.c */
/*
* Subscribe for new WAL and stream it in the loop to safekeepers.
* Handle START_REPLICATION command.
*
* At the moment, this never returns, but an ereport(ERROR) will take us back
* to the main loop.
@@ -470,6 +524,18 @@ StartProposerReplication(StartReplicationCmd *cmd)
errmsg("IDENTIFY_SYSTEM has not been run before START_REPLICATION")));
#endif
/* create xlogreader for physical replication */
xlogreader =
XLogReaderAllocate(wal_segment_size, NULL,
XL_ROUTINE(.segment_open = WalSndSegmentOpen,
.segment_close = wal_segment_close),
NULL);
if (!xlogreader)
ereport(ERROR,
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory")));
/*
* We assume here that we're logging enough information in the WAL for
* log-shipping, since this is checked in PostmasterMain().
@@ -503,61 +569,341 @@ StartProposerReplication(StartReplicationCmd *cmd)
* we keep this code around to lighten the load for when we need it.
*/
#if PG_VERSION_NUM >= 150000
FlushPtr = GetFlushRecPtr(&currTLI);
if (am_cascading_walsender)
{
/* this also updates ThisTimeLineID */
FlushPtr = GetStandbyFlushRecPtr(&currTLI);
}
else
FlushPtr = GetFlushRecPtr(&currTLI);
#else
FlushPtr = GetFlushRecPtr();
if (am_cascading_walsender)
{
/* this also updates ThisTimeLineID */
FlushPtr = GetStandbyFlushRecPtr();
}
else
FlushPtr = GetFlushRecPtr();
currTLI = ThisTimeLineID;
#endif
/*
* When we first start replication the standby will be behind the
* primary. For some applications, for example synchronous
* replication, it is important to have a clear state for this initial
* catchup mode, so we can trigger actions when we change streaming
* state later. We may stay in this state for a long time, which is
* exactly why we want to be able to monitor whether or not we are
* still here.
*/
WalSndSetState(WALSNDSTATE_CATCHUP);
/*
* Don't allow a request to stream from a future point in WAL that
* hasn't been flushed to disk in this server yet.
*/
if (FlushPtr < cmd->startpoint)
if (cmd->timeline != 0)
{
ereport(ERROR,
(errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X",
LSN_FORMAT_ARGS(cmd->startpoint),
LSN_FORMAT_ARGS(FlushPtr))));
XLogRecPtr switchpoint;
sendTimeLine = cmd->timeline;
if (sendTimeLine == currTLI)
{
sendTimeLineIsHistoric = false;
sendTimeLineValidUpto = InvalidXLogRecPtr;
}
else
{
List *timeLineHistory;
sendTimeLineIsHistoric = true;
/*
* Check that the timeline the client requested exists, and the
* requested start location is on that timeline.
*/
timeLineHistory = readTimeLineHistory(currTLI);
switchpoint = tliSwitchPoint(cmd->timeline, timeLineHistory,
&sendTimeLineNextTLI);
list_free_deep(timeLineHistory);
/*
* Found the requested timeline in the history. Check that
* requested startpoint is on that timeline in our history.
*
* This is quite loose on purpose. We only check that we didn't
* fork off the requested timeline before the switchpoint. We
* don't check that we switched *to* it before the requested
* starting point. This is because the client can legitimately
* request to start replication from the beginning of the WAL
* segment that contains switchpoint, but on the new timeline, so
* that it doesn't end up with a partial segment. If you ask for
* too old a starting point, you'll get an error later when we
* fail to find the requested WAL segment in pg_wal.
*
* XXX: we could be more strict here and only allow a startpoint
* that's older than the switchpoint, if it's still in the same
* WAL segment.
*/
if (!XLogRecPtrIsInvalid(switchpoint) &&
switchpoint < cmd->startpoint)
{
ereport(ERROR,
(errmsg("requested starting point %X/%X on timeline %u is not in this server's history",
LSN_FORMAT_ARGS(cmd->startpoint),
cmd->timeline),
errdetail("This server's history forked from timeline %u at %X/%X.",
cmd->timeline,
LSN_FORMAT_ARGS(switchpoint))));
}
sendTimeLineValidUpto = switchpoint;
}
}
else
{
sendTimeLine = currTLI;
sendTimeLineValidUpto = InvalidXLogRecPtr;
sendTimeLineIsHistoric = false;
}
/* Start streaming from the requested point */
sentPtr = cmd->startpoint;
streamingDoneSending = streamingDoneReceiving = false;
/* Initialize shared memory status, too */
SpinLockAcquire(&MyWalSnd->mutex);
MyWalSnd->sentPtr = sentPtr;
SpinLockRelease(&MyWalSnd->mutex);
/* If there is nothing to stream, don't even enter COPY mode */
if (!sendTimeLineIsHistoric || cmd->startpoint < sendTimeLineValidUpto)
{
/*
* When we first start replication the standby will be behind the
* primary. For some applications, for example synchronous
* replication, it is important to have a clear state for this initial
* catchup mode, so we can trigger actions when we change streaming
* state later. We may stay in this state for a long time, which is
* exactly why we want to be able to monitor whether or not we are
* still here.
*/
WalSndSetState(WALSNDSTATE_CATCHUP);
SyncRepInitConfig();
/*
* Don't allow a request to stream from a future point in WAL that
* hasn't been flushed to disk in this server yet.
*/
if (FlushPtr < cmd->startpoint)
{
ereport(ERROR,
(errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X",
LSN_FORMAT_ARGS(cmd->startpoint),
LSN_FORMAT_ARGS(FlushPtr))));
}
/* Infinite send loop, never returns */
WalSndLoop();
/* Start streaming from the requested point */
sentPtr = cmd->startpoint;
WalSndSetState(WALSNDSTATE_STARTUP);
/* Initialize shared memory status, too */
SpinLockAcquire(&MyWalSnd->mutex);
MyWalSnd->sentPtr = sentPtr;
SpinLockRelease(&MyWalSnd->mutex);
SyncRepInitConfig();
/* Main loop of walsender */
replication_active = true;
WalSndLoop(XLogSendPhysical);
replication_active = false;
if (got_STOPPING)
proc_exit(0);
WalSndSetState(WALSNDSTATE_STARTUP);
Assert(streamingDoneSending && streamingDoneReceiving);
}
if (cmd->slotname)
ReplicationSlotRelease();
/*
* Copy is finished now. Send a single-row result set indicating the next
* timeline.
*/
if (sendTimeLineIsHistoric)
{
char startpos_str[8 + 1 + 8 + 1];
DestReceiver *dest;
TupOutputState *tstate;
TupleDesc tupdesc;
Datum values[2];
bool nulls[2];
snprintf(startpos_str, sizeof(startpos_str), "%X/%X",
LSN_FORMAT_ARGS(sendTimeLineValidUpto));
dest = CreateDestReceiver(DestRemoteSimple);
MemSet(nulls, false, sizeof(nulls));
/*
* Need a tuple descriptor representing two columns. int8 may seem
* like a surprising data type for this, but in theory int4 would not
* be wide enough for this, as TimeLineID is unsigned.
*/
tupdesc = CreateTemplateTupleDesc(2);
TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 1, "next_tli",
INT8OID, -1, 0);
TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 2, "next_tli_startpos",
TEXTOID, -1, 0);
/* prepare for projection of tuple */
tstate = begin_tup_output_tupdesc(dest, tupdesc, &TTSOpsVirtual);
values[0] = Int64GetDatum((int64) sendTimeLineNextTLI);
values[1] = CStringGetTextDatum(startpos_str);
/* send it to dest */
do_tup_output(tstate, values, nulls);
end_tup_output(tstate);
}
/* Send CommandComplete message */
EndReplicationCommand("START_STREAMING");
}
/*
* Main loop that waits for LSN updates and calls the walproposer.
* Synchronous replication sets latch in WalSndWakeup at walsender.c
*/
static void
WalSndLoop(void)
#if PG_VERSION_NUM >= 150000
/*
 * Compute how far WAL can be sent when streaming from a standby.
 *
 * The replay position is always safe to send.  When the walreceiver is
 * streaming on the same timeline that is being replayed, anything it has
 * already flushed may be sent as well, even if it has not been replayed yet.
 *
 * *tli is set to the timeline of the last replayed WAL record.
 */
static XLogRecPtr
GetStandbyFlushRecPtr(TimeLineID *tli)
{
	TimeLineID	receiveTLI;
	TimeLineID	replayTLI;
	XLogRecPtr	receivePtr;
	XLogRecPtr	replayPtr;

	/* Fetch the walreceiver flush position first, then the replay position. */
	receivePtr = GetWalRcvFlushRecPtr(NULL, &receiveTLI);
	replayPtr = GetXLogReplayRecPtr(&replayTLI);

	/* Report the replay timeline to the caller. */
	*tli = replayTLI;

	/* Prefer the received position only if it is ahead on the same timeline. */
	if (receiveTLI == replayTLI && receivePtr > replayPtr)
		return receivePtr;

	return replayPtr;
}
#else
/*
 * Compute the latest WAL position that has been safely flushed to disk and
 * may therefore be sent to the standby.  Only meant to be called while in
 * recovery, i.e. when we are streaming to a cascaded standby.
 *
 * Side effect: ThisTimeLineID is overwritten with the TLI of the last
 * replayed WAL record.
 */
static XLogRecPtr
GetStandbyFlushRecPtr(void)
{
	TimeLineID	receiveTLI;
	TimeLineID	replayTLI;
	XLogRecPtr	receivePtr;
	XLogRecPtr	replayPtr;

	/*
	 * Whatever has been replayed is safe to send.  If the walreceiver is
	 * streaming from that same timeline, WAL it has flushed but that has not
	 * been replayed yet can be sent too.
	 */
	receivePtr = GetWalRcvFlushRecPtr(NULL, &receiveTLI);
	replayPtr = GetXLogReplayRecPtr(&replayTLI);

	/* Publish the replay timeline before comparing against it. */
	ThisTimeLineID = replayTLI;

	if (receiveTLI == ThisTimeLineID && receivePtr > replayPtr)
		return receivePtr;

	return replayPtr;
}
#endif
/*
 * XLogReaderRoutine->segment_open callback.
 *
 * Opens the WAL segment file for nextSegNo read-only, stores the descriptor
 * in state->seg.ws_file, and reports through *tli_p which timeline's file
 * was chosen.  Raises ERROR if the file cannot be opened.
 */
static void
WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo,
				  TimeLineID *tli_p)
{
	char		path[MAXPGPATH];
	TimeLineID	segTLI = sendTimeLine;

	/*-------
	 * When reading from a historic timeline, and there is a timeline switch
	 * within this segment, read from the WAL segment belonging to the new
	 * timeline.
	 *
	 * For example, imagine that this server is currently on timeline 5, and
	 * we're streaming timeline 4. The switch from timeline 4 to 5 happened at
	 * 0/13002088. In pg_wal, we have these files:
	 *
	 * ...
	 * 000000040000000000000012
	 * 000000040000000000000013
	 * 000000050000000000000013
	 * 000000050000000000000014
	 * ...
	 *
	 * In this situation, when requested to send the WAL from segment 0x13, on
	 * timeline 4, we read the WAL from file 000000050000000000000013. Archive
	 * recovery prefers files from newer timelines, so if the segment was
	 * restored from the archive on this server, the file belonging to the old
	 * timeline, 000000040000000000000013, might not exist. Their contents are
	 * equal up to the switchpoint, because at a timeline switch, the used
	 * portion of the old segment is copied to the new file.
	 *-------
	 */
	if (sendTimeLineIsHistoric)
	{
		XLogSegNo	endSegNo;

		XLByteToSeg(sendTimeLineValidUpto, endSegNo, state->segcxt.ws_segsize);

		/* The segment containing the switchpoint lives on the next TLI. */
		if (nextSegNo == endSegNo)
			segTLI = sendTimeLineNextTLI;
	}
	*tli_p = segTLI;

	XLogFilePath(path, segTLI, nextSegNo, state->segcxt.ws_segsize);
	state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY);

	if (state->seg.ws_file < 0)
	{
		if (errno != ENOENT)
		{
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m",
							path)));
		}
		else
		{
			/*
			 * A missing file is taken to mean the standby asked for a WAL
			 * segment so old that it has already been removed or recycled.
			 */
			char		xlogfname[MAXFNAMELEN];
			int			open_errno = errno;

			XLogFileName(xlogfname, segTLI, nextSegNo, wal_segment_size);

			/* Put errno back so errcode_for_file_access() sees the open failure. */
			errno = open_errno;
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("requested WAL segment %s has already been removed",
							xlogfname)));
		}
	}
}
/* Main loop of walsender process that streams the WAL over Copy messages. */
static void
WalSndLoop(WalSndSendDataCallback send_data)
{
/*
* Initialize the last reply timestamp. That enables timeout processing
* from hereon.
*/
last_reply_timestamp = GetCurrentTimestamp();
waiting_for_ping_response = false;
/*
* Loop until we reach the end of this timeline or the client requests to
* stop streaming.
*/
for (;;)
{
/* Clear any already-pending wakeups */
@@ -565,41 +911,153 @@ WalSndLoop(void)
CHECK_FOR_INTERRUPTS();
XLogBroadcastWalProposer();
/* Process any requests or signals received recently */
if (ConfigReloadPending)
{
ConfigReloadPending = false;
ProcessConfigFile(PGC_SIGHUP);
SyncRepInitConfig();
}
if (MyWalSnd->state == WALSNDSTATE_CATCHUP)
WalSndSetState(WALSNDSTATE_STREAMING);
WalProposerPoll();
/* always true */
if (am_wal_proposer)
{
send_data();
if (WalSndCaughtUp)
{
if (MyWalSnd->state == WALSNDSTATE_CATCHUP)
WalSndSetState(WALSNDSTATE_STREAMING);
WalProposerPoll();
WalSndCaughtUp = false;
}
continue;
}
}
}
/*
* Notify walproposer about the new WAL position.
* Send out the WAL in its normal physical/stored form.
*
* Read up to MAX_SEND_SIZE bytes of WAL that's been flushed to disk,
* but not yet sent to the client, and buffer it in the libpq output
* buffer.
*
* If there is no unsent WAL remaining, WalSndCaughtUp is set to true,
* otherwise WalSndCaughtUp is set to false.
*/
static void
XLogBroadcastWalProposer(void)
XLogSendPhysical(void)
{
XLogRecPtr SendRqstPtr;
XLogRecPtr startptr;
XLogRecPtr endptr;
Size nbytes PG_USED_FOR_ASSERTS_ONLY;
TimeLineID currTLI;
/* Start from the last sent position */
startptr = sentPtr;
/* If requested switch the WAL sender to the stopping state. */
if (got_STOPPING)
WalSndSetState(WALSNDSTATE_STOPPING);
/*
* Streaming the current timeline on a primary.
*
* Attempt to send all data that's already been written out and
* fsync'd to disk. We cannot go further than what's been written out
* given the current implementation of WALRead(). And in any case
* it's unsafe to send WAL that is not securely down to disk on the
* primary: if the primary subsequently crashes and restarts, standbys
* must not have applied any WAL that got lost on the primary.
*/
if (streamingDoneSending)
{
WalSndCaughtUp = true;
return;
}
/* Figure out how far we can safely send the WAL. */
if (sendTimeLineIsHistoric)
{
/*
* Streaming an old timeline that's in this server's history, but is
* not the one we're currently inserting or replaying. It can be
* streamed up to the point where we switched off that timeline.
*/
SendRqstPtr = sendTimeLineValidUpto;
}
else if (am_cascading_walsender)
{
/*
* Streaming the latest timeline on a standby.
*
* Attempt to send all WAL that has already been replayed, so that we
* know it's valid. If we're receiving WAL through streaming
* replication, it's also OK to send any WAL that has been received
* but not replayed.
*
* The timeline we're recovering from can change, or we can be
* promoted. In either case, the current timeline becomes historic. We
* need to detect that so that we don't try to stream past the point
* where we switched to another timeline. We check for promotion or
* timeline switch after calculating FlushPtr, to avoid a race
* condition: if the timeline becomes historic just after we checked
* that it was still current, it's still OK to stream it up to the
* FlushPtr that was calculated before it became historic.
*/
bool becameHistoric = false;
#if PG_VERSION_NUM >= 150000
endptr = GetFlushRecPtr(NULL);
SendRqstPtr = GetStandbyFlushRecPtr(&currTLI);
#else
endptr = GetFlushRecPtr();
SendRqstPtr = GetStandbyFlushRecPtr();
currTLI = ThisTimeLineID;
#endif
if (!RecoveryInProgress())
{
/*
* We have been promoted. RecoveryInProgress() updated
* ThisTimeLineID to the new current timeline.
*/
am_cascading_walsender = false;
becameHistoric = true;
}
else
{
/*
* Still a cascading standby. But is the timeline we're sending
* still the one recovery is recovering from? currTLI was updated
* by the GetStandbyFlushRecPtr() call above.
*/
if (sendTimeLine != currTLI)
becameHistoric = true;
}
if (becameHistoric)
{
/*
* The timeline we were sending has become historic. Read the
* timeline history file of the new timeline to see where exactly
* we forked off from the timeline we were sending.
*/
List *history;
history = readTimeLineHistory(currTLI);
sendTimeLineValidUpto = tliSwitchPoint(sendTimeLine, history, &sendTimeLineNextTLI);
Assert(sendTimeLine < sendTimeLineNextTLI);
list_free_deep(history);
sendTimeLineIsHistoric = true;
SendRqstPtr = sendTimeLineValidUpto;
}
}
else
{
/*
* Streaming the current timeline on a primary.
*
* Attempt to send all data that's already been written out and
* fsync'd to disk. We cannot go further than what's been written out
* given the current implementation of WALRead(). And in any case
* it's unsafe to send WAL that is not securely down to disk on the
* primary: if the primary subsequently crashes and restarts, standbys
* must not have applied any WAL that got lost on the primary.
*/
#if PG_VERSION_NUM >= 150000
SendRqstPtr = GetFlushRecPtr(NULL);
#else
SendRqstPtr = GetFlushRecPtr();
#endif
}
/*
* Record the current system time as an approximation of the time at which
@@ -625,14 +1083,91 @@ XLogBroadcastWalProposer(void)
* that arbitrary LSN is eventually reported as written, flushed and
* applied, so that it can measure the elapsed time.
*/
LagTrackerWrite(endptr, GetCurrentTimestamp());
LagTrackerWrite(SendRqstPtr, GetCurrentTimestamp());
/*
* If this is a historic timeline and we've reached the point where we
* forked to the next timeline, stop streaming.
*
* Note: We might already have sent WAL > sendTimeLineValidUpto. The
* startup process will normally replay all WAL that has been received
* from the primary, before promoting, but if the WAL streaming is
* terminated at a WAL page boundary, the valid portion of the timeline
* might end in the middle of a WAL record. We might've already sent the
* first half of that partial WAL record to the cascading standby, so that
* sentPtr > sendTimeLineValidUpto. That's OK; the cascading standby can't
* replay the partial WAL record either, so it can still follow our
* timeline switch.
*/
if (sendTimeLineIsHistoric && sendTimeLineValidUpto <= sentPtr)
{
/* close the current file. */
if (xlogreader->seg.ws_file >= 0)
wal_segment_close(xlogreader);
/* Send CopyDone */
pq_putmessage_noblock('c', NULL, 0);
streamingDoneSending = true;
WalSndCaughtUp = true;
elog(DEBUG1, "walsender reached end of timeline at %X/%X (sent up to %X/%X)",
LSN_FORMAT_ARGS(sendTimeLineValidUpto),
LSN_FORMAT_ARGS(sentPtr));
return;
}
/* Do we have any work to do? */
Assert(startptr <= endptr);
if (endptr <= startptr)
Assert(sentPtr <= SendRqstPtr);
if (SendRqstPtr <= sentPtr)
{
WalSndCaughtUp = true;
return;
}
WalProposerBroadcast(startptr, endptr);
/*
* Figure out how much to send in one message. If there's no more than
* MAX_SEND_SIZE bytes to send, send everything. Otherwise send
* MAX_SEND_SIZE bytes, but round back to logfile or page boundary.
*
* The rounding is not only for performance reasons. Walreceiver relies on
* the fact that we never split a WAL record across two messages. Since a
* long WAL record is split at page boundary into continuation records,
* page boundary is always a safe cut-off point. We also assume that
* SendRqstPtr never points to the middle of a WAL record.
*/
startptr = sentPtr;
endptr = startptr;
endptr += MAX_SEND_SIZE;
/* if we went beyond SendRqstPtr, back off */
if (SendRqstPtr <= endptr)
{
endptr = SendRqstPtr;
if (sendTimeLineIsHistoric)
WalSndCaughtUp = false;
else
WalSndCaughtUp = true;
}
else
{
/* round down to page boundary. */
endptr -= (endptr % XLOG_BLCKSZ);
WalSndCaughtUp = false;
}
nbytes = endptr - startptr;
Assert(nbytes <= MAX_SEND_SIZE);
/* always true */
if (am_wal_proposer)
{
WalProposerBroadcast(startptr, endptr);
}
else
{
/* code removed for brevity */
}
sentPtr = endptr;
/* Update shared memory status */

54
poetry.lock generated
View File

@@ -740,13 +740,13 @@ typing-extensions = ">=4.1.0"
[[package]]
name = "certifi"
version = "2023.7.22"
version = "2022.12.7"
description = "Python package for providing Mozilla's CA Bundle."
optional = false
python-versions = ">=3.6"
files = [
{file = "certifi-2023.7.22-py3-none-any.whl", hash = "sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9"},
{file = "certifi-2023.7.22.tar.gz", hash = "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082"},
{file = "certifi-2022.12.7-py3-none-any.whl", hash = "sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18"},
{file = "certifi-2022.12.7.tar.gz", hash = "sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3"},
]
[[package]]
@@ -887,34 +887,34 @@ files = [
[[package]]
name = "cryptography"
version = "41.0.3"
version = "41.0.2"
description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
optional = false
python-versions = ">=3.7"
files = [
{file = "cryptography-41.0.3-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:652627a055cb52a84f8c448185922241dd5217443ca194d5739b44612c5e6507"},
{file = "cryptography-41.0.3-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:8f09daa483aedea50d249ef98ed500569841d6498aa9c9f4b0531b9964658922"},
{file = "cryptography-41.0.3-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4fd871184321100fb400d759ad0cddddf284c4b696568204d281c902fc7b0d81"},
{file = "cryptography-41.0.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84537453d57f55a50a5b6835622ee405816999a7113267739a1b4581f83535bd"},
{file = "cryptography-41.0.3-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:3fb248989b6363906827284cd20cca63bb1a757e0a2864d4c1682a985e3dca47"},
{file = "cryptography-41.0.3-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:42cb413e01a5d36da9929baa9d70ca90d90b969269e5a12d39c1e0d475010116"},
{file = "cryptography-41.0.3-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:aeb57c421b34af8f9fe830e1955bf493a86a7996cc1338fe41b30047d16e962c"},
{file = "cryptography-41.0.3-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:6af1c6387c531cd364b72c28daa29232162010d952ceb7e5ca8e2827526aceae"},
{file = "cryptography-41.0.3-cp37-abi3-win32.whl", hash = "sha256:0d09fb5356f975974dbcb595ad2d178305e5050656affb7890a1583f5e02a306"},
{file = "cryptography-41.0.3-cp37-abi3-win_amd64.whl", hash = "sha256:a983e441a00a9d57a4d7c91b3116a37ae602907a7618b882c8013b5762e80574"},
{file = "cryptography-41.0.3-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5259cb659aa43005eb55a0e4ff2c825ca111a0da1814202c64d28a985d33b087"},
{file = "cryptography-41.0.3-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:67e120e9a577c64fe1f611e53b30b3e69744e5910ff3b6e97e935aeb96005858"},
{file = "cryptography-41.0.3-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:7efe8041897fe7a50863e51b77789b657a133c75c3b094e51b5e4b5cec7bf906"},
{file = "cryptography-41.0.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ce785cf81a7bdade534297ef9e490ddff800d956625020ab2ec2780a556c313e"},
{file = "cryptography-41.0.3-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:57a51b89f954f216a81c9d057bf1a24e2f36e764a1ca9a501a6964eb4a6800dd"},
{file = "cryptography-41.0.3-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:4c2f0d35703d61002a2bbdcf15548ebb701cfdd83cdc12471d2bae80878a4207"},
{file = "cryptography-41.0.3-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:23c2d778cf829f7d0ae180600b17e9fceea3c2ef8b31a99e3c694cbbf3a24b84"},
{file = "cryptography-41.0.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:95dd7f261bb76948b52a5330ba5202b91a26fbac13ad0e9fc8a3ac04752058c7"},
{file = "cryptography-41.0.3-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:41d7aa7cdfded09b3d73a47f429c298e80796c8e825ddfadc84c8a7f12df212d"},
{file = "cryptography-41.0.3-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d0d651aa754ef58d75cec6edfbd21259d93810b73f6ec246436a21b7841908de"},
{file = "cryptography-41.0.3-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:ab8de0d091acbf778f74286f4989cf3d1528336af1b59f3e5d2ebca8b5fe49e1"},
{file = "cryptography-41.0.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a74fbcdb2a0d46fe00504f571a2a540532f4c188e6ccf26f1f178480117b33c4"},
{file = "cryptography-41.0.3.tar.gz", hash = "sha256:6d192741113ef5e30d89dcb5b956ef4e1578f304708701b8b73d38e3e1461f34"},
{file = "cryptography-41.0.2-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:01f1d9e537f9a15b037d5d9ee442b8c22e3ae11ce65ea1f3316a41c78756b711"},
{file = "cryptography-41.0.2-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:079347de771f9282fbfe0e0236c716686950c19dee1b76240ab09ce1624d76d7"},
{file = "cryptography-41.0.2-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:439c3cc4c0d42fa999b83ded80a9a1fb54d53c58d6e59234cfe97f241e6c781d"},
{file = "cryptography-41.0.2-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f14ad275364c8b4e525d018f6716537ae7b6d369c094805cae45300847e0894f"},
{file = "cryptography-41.0.2-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:84609ade00a6ec59a89729e87a503c6e36af98ddcd566d5f3be52e29ba993182"},
{file = "cryptography-41.0.2-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:49c3222bb8f8e800aead2e376cbef687bc9e3cb9b58b29a261210456a7783d83"},
{file = "cryptography-41.0.2-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:d73f419a56d74fef257955f51b18d046f3506270a5fd2ac5febbfa259d6c0fa5"},
{file = "cryptography-41.0.2-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:2a034bf7d9ca894720f2ec1d8b7b5832d7e363571828037f9e0c4f18c1b58a58"},
{file = "cryptography-41.0.2-cp37-abi3-win32.whl", hash = "sha256:d124682c7a23c9764e54ca9ab5b308b14b18eba02722b8659fb238546de83a76"},
{file = "cryptography-41.0.2-cp37-abi3-win_amd64.whl", hash = "sha256:9c3fe6534d59d071ee82081ca3d71eed3210f76ebd0361798c74abc2bcf347d4"},
{file = "cryptography-41.0.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a719399b99377b218dac6cf547b6ec54e6ef20207b6165126a280b0ce97e0d2a"},
{file = "cryptography-41.0.2-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:182be4171f9332b6741ee818ec27daff9fb00349f706629f5cbf417bd50e66fd"},
{file = "cryptography-41.0.2-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:7a9a3bced53b7f09da251685224d6a260c3cb291768f54954e28f03ef14e3766"},
{file = "cryptography-41.0.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:f0dc40e6f7aa37af01aba07277d3d64d5a03dc66d682097541ec4da03cc140ee"},
{file = "cryptography-41.0.2-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:674b669d5daa64206c38e507808aae49904c988fa0a71c935e7006a3e1e83831"},
{file = "cryptography-41.0.2-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:7af244b012711a26196450d34f483357e42aeddb04128885d95a69bd8b14b69b"},
{file = "cryptography-41.0.2-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:9b6d717393dbae53d4e52684ef4f022444fc1cce3c48c38cb74fca29e1f08eaa"},
{file = "cryptography-41.0.2-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:192255f539d7a89f2102d07d7375b1e0a81f7478925b3bc2e0549ebf739dae0e"},
{file = "cryptography-41.0.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f772610fe364372de33d76edcd313636a25684edb94cee53fd790195f5989d14"},
{file = "cryptography-41.0.2-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:b332cba64d99a70c1e0836902720887fb4529ea49ea7f5462cf6640e095e11d2"},
{file = "cryptography-41.0.2-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:9a6673c1828db6270b76b22cc696f40cde9043eb90373da5c2f8f2158957f42f"},
{file = "cryptography-41.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:342f3767e25876751e14f8459ad85e77e660537ca0a066e10e75df9c9e9099f0"},
{file = "cryptography-41.0.2.tar.gz", hash = "sha256:7d230bf856164de164ecb615ccc14c7fc6de6906ddd5b491f3af90d3514c925c"},
]
[package.dependencies]

View File

@@ -13,7 +13,6 @@ bytes = { workspace = true, features = ["serde"] }
chrono.workspace = true
clap.workspace = true
consumption_metrics.workspace = true
dashmap.workspace = true
futures.workspace = true
git-version.workspace = true
hashbrown.workspace = true
@@ -30,7 +29,7 @@ metrics.workspace = true
once_cell.workspace = true
opentelemetry.workspace = true
parking_lot.workspace = true
pbkdf2 = { workspace = true, features = ["simple", "std"] }
pbkdf2.workspace = true
pin-project-lite.workspace = true
postgres_backend.workspace = true
pq_proto.workspace = true

Some files were not shown because too many files have changed in this diff Show More