mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-20 06:30:43 +00:00
Compare commits
93 Commits
alek_targz
...
sergey/pas
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
77e54626e7 | ||
|
|
ef8ce4337b | ||
|
|
ca4d71a954 | ||
|
|
381f41e685 | ||
|
|
d005c77ea3 | ||
|
|
04776ade6c | ||
|
|
c3fe335eaf | ||
|
|
3a00a5deb2 | ||
|
|
78fa2b13e5 | ||
|
|
7c076edeea | ||
|
|
69528b7c30 | ||
|
|
a98a80abc2 | ||
|
|
7b6c849456 | ||
|
|
326189d950 | ||
|
|
ddbe170454 | ||
|
|
39e458f049 | ||
|
|
e1424647a0 | ||
|
|
705ae2dce9 | ||
|
|
eb78603121 | ||
|
|
f0ad603693 | ||
|
|
e5183f85dc | ||
|
|
89ee8f2028 | ||
|
|
a8f3540f3d | ||
|
|
4338eed8c4 | ||
|
|
2fbdf26094 | ||
|
|
7374634845 | ||
|
|
9fdd3a4a1e | ||
|
|
3681fc39fd | ||
|
|
67d2fa6dec | ||
|
|
cafbe8237e | ||
|
|
3e425c40c0 | ||
|
|
395bd9174e | ||
|
|
b9a7a661d0 | ||
|
|
48ce95533c | ||
|
|
874c31976e | ||
|
|
231d7a7616 | ||
|
|
5705413d90 | ||
|
|
35370f967f | ||
|
|
b98419ee56 | ||
|
|
86a61b318b | ||
|
|
5f8fd640bf | ||
|
|
916a5871a6 | ||
|
|
700d929529 | ||
|
|
520046f5bd | ||
|
|
2ebd2ce2b6 | ||
|
|
bcc2aee704 | ||
|
|
6d023484ed | ||
|
|
062159ac17 | ||
|
|
f2e2b8a7f4 | ||
|
|
f9214771b4 | ||
|
|
77a68326c5 | ||
|
|
a25504deae | ||
|
|
294b8a8fde | ||
|
|
407a20ceae | ||
|
|
e5b7ddfeee | ||
|
|
7feb0d1a80 | ||
|
|
457e3a3ebc | ||
|
|
25d2f4b669 | ||
|
|
1685593f38 | ||
|
|
8d0f4a7857 | ||
|
|
3fc3666df7 | ||
|
|
89746a48c6 | ||
|
|
8d27a9c54e | ||
|
|
d98cb39978 | ||
|
|
27c73c8740 | ||
|
|
9e871318a0 | ||
|
|
e1061879aa | ||
|
|
f09e82270e | ||
|
|
d4a5fd5258 | ||
|
|
921bb86909 | ||
|
|
1e7db5458f | ||
|
|
b4d36f572d | ||
|
|
762a8a7bb5 | ||
|
|
2e8a3afab1 | ||
|
|
4580f5085a | ||
|
|
e074ccf170 | ||
|
|
196943c78f | ||
|
|
149dd36b6b | ||
|
|
be271e3edf | ||
|
|
7c85c7ea91 | ||
|
|
1066bca5e3 | ||
|
|
1aad8918e1 | ||
|
|
966213f429 | ||
|
|
35e73759f5 | ||
|
|
48936d44f8 | ||
|
|
2eae0a1fe5 | ||
|
|
53470ad12a | ||
|
|
edccef4514 | ||
|
|
982fce1e72 | ||
|
|
e767ced8d0 | ||
|
|
1309571f5d | ||
|
|
9a69b6cb94 | ||
|
|
cc82cd1b07 |
@@ -12,6 +12,11 @@ opt-level = 3
|
||||
# Turn on a small amount of optimization in Development mode.
|
||||
opt-level = 1
|
||||
|
||||
[build]
|
||||
# This is only present for local builds, as it will be overridden
|
||||
# by the RUSTDOCFLAGS env var in CI.
|
||||
rustdocflags = ["-Arustdoc::private_intra_doc_links"]
|
||||
|
||||
[alias]
|
||||
build_testing = ["build", "--features", "testing"]
|
||||
neon = ["run", "--bin", "neon_local"]
|
||||
|
||||
@@ -21,4 +21,5 @@
|
||||
!workspace_hack/
|
||||
!neon_local/
|
||||
!scripts/ninstall.sh
|
||||
!scripts/combine_control_files.py
|
||||
!vm-cgconfig.conf
|
||||
|
||||
10
.github/actions/run-python-test-set/action.yml
vendored
10
.github/actions/run-python-test-set/action.yml
vendored
@@ -150,6 +150,14 @@ runs:
|
||||
EXTRA_PARAMS="--flaky-tests-json $TEST_OUTPUT/flaky.json $EXTRA_PARAMS"
|
||||
fi
|
||||
|
||||
# We use pytest-split plugin to run benchmarks in parallel on different CI runners
|
||||
if [ "${TEST_SELECTION}" = "test_runner/performance" ] && [ "${{ inputs.build_type }}" != "remote" ]; then
|
||||
mkdir -p $TEST_OUTPUT
|
||||
poetry run ./scripts/benchmark_durations.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/benchmark_durations.json"
|
||||
|
||||
EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS"
|
||||
fi
|
||||
|
||||
if [[ "${{ inputs.build_type }}" == "debug" ]]; then
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
|
||||
elif [[ "${{ inputs.build_type }}" == "release" ]]; then
|
||||
@@ -201,4 +209,4 @@ runs:
|
||||
uses: ./.github/actions/allure-report-store
|
||||
with:
|
||||
report-dir: /tmp/test_output/allure/results
|
||||
unique-key: ${{ inputs.build_type }}
|
||||
unique-key: ${{ inputs.build_type }}-${{ inputs.pg_version }}
|
||||
|
||||
55
.github/workflows/approved-for-ci-run.yml
vendored
Normal file
55
.github/workflows/approved-for-ci-run.yml
vendored
Normal file
@@ -0,0 +1,55 @@
|
||||
name: Handle `approved-for-ci-run` label
|
||||
# This workflow helps to run CI pipeline for PRs made by external contributors (from forks).
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
types:
|
||||
# Default types that triggers a workflow ([1]):
|
||||
# - [1] https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request
|
||||
- opened
|
||||
- synchronize
|
||||
- reopened
|
||||
# Types that we wand to handle in addition to keep labels tidy:
|
||||
- closed
|
||||
# Actual magic happens here:
|
||||
- labeled
|
||||
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
|
||||
jobs:
|
||||
remove-label:
|
||||
# Remove `approved-for-ci-run` label if the workflow is triggered by changes in a PR.
|
||||
# The PR should be reviewed and labelled manually again.
|
||||
|
||||
runs-on: [ ubuntu-latest ]
|
||||
|
||||
if: |
|
||||
contains(fromJSON('["opened", "synchronize", "reopened", "closed"]'), github.event.action) &&
|
||||
contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
|
||||
|
||||
steps:
|
||||
- run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
|
||||
|
||||
create-branch:
|
||||
# Create a local branch for an `approved-for-ci-run` labelled PR to run CI pipeline in it.
|
||||
|
||||
runs-on: [ ubuntu-latest ]
|
||||
|
||||
if: |
|
||||
github.event.action == 'labeled' &&
|
||||
contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
|
||||
|
||||
steps:
|
||||
- run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run"
|
||||
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
ref: main
|
||||
|
||||
- run: gh pr checkout "${PR_NUMBER}"
|
||||
|
||||
- run: git checkout -b "ci-run/pr-${PR_NUMBER}"
|
||||
|
||||
- run: git push --force origin "ci-run/pr-${PR_NUMBER}"
|
||||
58
.github/workflows/build_and_test.yml
vendored
58
.github/workflows/build_and_test.yml
vendored
@@ -5,6 +5,7 @@ on:
|
||||
branches:
|
||||
- main
|
||||
- release
|
||||
- ci-run/pr-*
|
||||
pull_request:
|
||||
|
||||
defaults:
|
||||
@@ -127,6 +128,11 @@ jobs:
|
||||
- name: Run cargo clippy (release)
|
||||
run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
|
||||
|
||||
- name: Check documentation generation
|
||||
run: cargo doc --workspace --no-deps --document-private-items
|
||||
env:
|
||||
RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
|
||||
|
||||
# Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
|
||||
- name: Check formatting
|
||||
if: ${{ !cancelled() }}
|
||||
@@ -390,13 +396,11 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
pytest_split_group: [ 1, 2, 3, 4 ]
|
||||
build_type: [ release ]
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Pytest benchmarks
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
@@ -405,9 +409,11 @@ jobs:
|
||||
test_selection: performance
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ github.ref_name == 'main' }}
|
||||
extra_params: --splits ${{ strategy.job-total }} --group ${{ matrix.pytest_split_group }}
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}"
|
||||
# XXX: no coverage data handling here, since benchmarks are run on release builds,
|
||||
# while coverage is currently collected for the debug ones
|
||||
|
||||
@@ -788,7 +794,7 @@ jobs:
|
||||
run:
|
||||
shell: sh -eu {0}
|
||||
env:
|
||||
VM_BUILDER_VERSION: v0.12.1
|
||||
VM_BUILDER_VERSION: v0.13.1
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -949,22 +955,15 @@ jobs:
|
||||
version: [ v14, v15 ]
|
||||
|
||||
env:
|
||||
# While on transition period we extract public extensions from compute-node image and custom extensions from extensions image.
|
||||
# Later all the extensions will be moved to extensions image.
|
||||
EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:latest
|
||||
COMPUTE_NODE_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:latest
|
||||
EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
|
||||
AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }}
|
||||
S3_BUCKETS: |
|
||||
${{ github.ref_name == 'release' &&
|
||||
'neon-prod-extensions-ap-southeast-1 neon-prod-extensions-eu-central-1 neon-prod-extensions-us-east-1 neon-prod-extensions-us-east-2 neon-prod-extensions-us-west-2' ||
|
||||
'neon-dev-extensions-eu-central-1 neon-dev-extensions-eu-west-1 neon-dev-extensions-us-east-2' }}
|
||||
S3_BUCKETS: ${{ github.ref_name == 'release' && vars.S3_EXTENSIONS_BUCKETS_PROD || vars.S3_EXTENSIONS_BUCKETS_DEV }}
|
||||
|
||||
steps:
|
||||
- name: Pull postgres-extensions image
|
||||
run: |
|
||||
docker pull ${EXTENSIONS_IMAGE}
|
||||
docker pull ${COMPUTE_NODE_IMAGE}
|
||||
|
||||
- name: Create postgres-extensions container
|
||||
id: create-container
|
||||
@@ -972,44 +971,23 @@ jobs:
|
||||
EID=$(docker create ${EXTENSIONS_IMAGE} true)
|
||||
echo "EID=${EID}" >> $GITHUB_OUTPUT
|
||||
|
||||
CID=$(docker create ${COMPUTE_NODE_IMAGE} true)
|
||||
echo "CID=${CID}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Extract postgres-extensions from container
|
||||
run: |
|
||||
rm -rf ./extensions-to-upload ./custom-extensions # Just in case
|
||||
rm -rf ./extensions-to-upload # Just in case
|
||||
mkdir -p extensions-to-upload
|
||||
|
||||
# In compute image we have a bit different directory layout
|
||||
mkdir -p extensions-to-upload/share
|
||||
docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/share/extension ./extensions-to-upload/share/extension
|
||||
docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/lib ./extensions-to-upload/lib
|
||||
|
||||
# Delete Neon extensitons (they always present on compute-node image)
|
||||
rm -rf ./extensions-to-upload/share/extension/neon*
|
||||
rm -rf ./extensions-to-upload/lib/neon*
|
||||
|
||||
# Delete leftovers from the extension build step
|
||||
rm -rf ./extensions-to-upload/lib/pgxs
|
||||
rm -rf ./extensions-to-upload/lib/pkgconfig
|
||||
|
||||
docker cp ${{ steps.create-container.outputs.EID }}:/extensions ./custom-extensions
|
||||
for EXT_NAME in $(ls ./custom-extensions); do
|
||||
mkdir -p ./extensions-to-upload/${EXT_NAME}/share
|
||||
|
||||
mv ./custom-extensions/${EXT_NAME}/share/extension ./extensions-to-upload/${EXT_NAME}/share/extension
|
||||
mv ./custom-extensions/${EXT_NAME}/lib ./extensions-to-upload/${EXT_NAME}/lib
|
||||
done
|
||||
docker cp ${{ steps.create-container.outputs.EID }}:/extensions/ ./extensions-to-upload/
|
||||
docker cp ${{ steps.create-container.outputs.EID }}:/ext_index.json ./extensions-to-upload/
|
||||
|
||||
- name: Upload postgres-extensions to S3
|
||||
run: |
|
||||
for BUCKET in $(echo ${S3_BUCKETS}); do
|
||||
for BUCKET in $(echo ${S3_BUCKETS:-[]} | jq --raw-output '.[]'); do
|
||||
aws s3 cp --recursive --only-show-errors ./extensions-to-upload s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }}
|
||||
done
|
||||
|
||||
- name: Cleanup
|
||||
if: ${{ always() && (steps.create-container.outputs.CID || steps.create-container.outputs.EID) }}
|
||||
if: ${{ always() && steps.create-container.outputs.EID }}
|
||||
run: |
|
||||
docker rm ${{ steps.create-container.outputs.CID }} || true
|
||||
docker rm ${{ steps.create-container.outputs.EID }} || true
|
||||
|
||||
deploy:
|
||||
|
||||
3
.github/workflows/neon_extra_builds.yml
vendored
3
.github/workflows/neon_extra_builds.yml
vendored
@@ -3,7 +3,8 @@ name: Check neon with extra platform builds
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
- main
|
||||
- ci-run/pr-*
|
||||
pull_request:
|
||||
|
||||
defaults:
|
||||
|
||||
138
Cargo.lock
generated
138
Cargo.lock
generated
@@ -740,6 +740,9 @@ name = "cc"
|
||||
version = "1.0.79"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
|
||||
dependencies = [
|
||||
"jobserver",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cexpr"
|
||||
@@ -922,6 +925,7 @@ dependencies = [
|
||||
"url",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
"zstd",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1975,6 +1979,15 @@ version = "1.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6"
|
||||
|
||||
[[package]]
|
||||
name = "jobserver"
|
||||
version = "0.1.26"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "js-sys"
|
||||
version = "0.3.63"
|
||||
@@ -2382,9 +2395,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry"
|
||||
version = "0.18.0"
|
||||
version = "0.19.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "69d6c3d7288a106c0a363e4b0e8d308058d56902adefb16f4936f417ffef086e"
|
||||
checksum = "5f4b8347cc26099d3aeee044065ecc3ae11469796b4d65d065a23a584ed92a6f"
|
||||
dependencies = [
|
||||
"opentelemetry_api",
|
||||
"opentelemetry_sdk",
|
||||
@@ -2392,9 +2405,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry-http"
|
||||
version = "0.7.0"
|
||||
version = "0.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1edc79add46364183ece1a4542592ca593e6421c60807232f5b8f7a31703825d"
|
||||
checksum = "a819b71d6530c4297b49b3cae2939ab3a8cc1b9f382826a1bc29dd0ca3864906"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"bytes",
|
||||
@@ -2405,9 +2418,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry-otlp"
|
||||
version = "0.11.0"
|
||||
version = "0.12.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d1c928609d087790fc936a1067bdc310ae702bdf3b090c3f281b713622c8bbde"
|
||||
checksum = "8af72d59a4484654ea8eb183fea5ae4eb6a41d7ac3e3bae5f4d2a282a3a7d3ca"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"futures",
|
||||
@@ -2423,48 +2436,47 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry-proto"
|
||||
version = "0.1.0"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d61a2f56df5574508dd86aaca016c917489e589ece4141df1b5e349af8d66c28"
|
||||
checksum = "045f8eea8c0fa19f7d48e7bc3128a39c2e5c533d5c61298c548dfefc1064474c"
|
||||
dependencies = [
|
||||
"futures",
|
||||
"futures-util",
|
||||
"opentelemetry",
|
||||
"prost",
|
||||
"tonic 0.8.3",
|
||||
"tonic-build 0.8.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry-semantic-conventions"
|
||||
version = "0.10.0"
|
||||
version = "0.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9b02e0230abb0ab6636d18e2ba8fa02903ea63772281340ccac18e0af3ec9eeb"
|
||||
checksum = "24e33428e6bf08c6f7fcea4ddb8e358fab0fe48ab877a87c70c6ebe20f673ce5"
|
||||
dependencies = [
|
||||
"opentelemetry",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry_api"
|
||||
version = "0.18.0"
|
||||
version = "0.19.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c24f96e21e7acc813c7a8394ee94978929db2bcc46cf6b5014fc612bf7760c22"
|
||||
checksum = "ed41783a5bf567688eb38372f2b7a8530f5a607a4b49d38dd7573236c23ca7e2"
|
||||
dependencies = [
|
||||
"fnv",
|
||||
"futures-channel",
|
||||
"futures-util",
|
||||
"indexmap",
|
||||
"js-sys",
|
||||
"once_cell",
|
||||
"pin-project-lite",
|
||||
"thiserror",
|
||||
"urlencoding",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry_sdk"
|
||||
version = "0.18.0"
|
||||
version = "0.19.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1ca41c4933371b61c2a2f214bf16931499af4ec90543604ec828f7a625c09113"
|
||||
checksum = "8b3a2a91fdbfdd4d212c0dcc2ab540de2c2bcbbd90be17de7a7daf8822d010c1"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"crossbeam-channel",
|
||||
@@ -2510,6 +2522,7 @@ dependencies = [
|
||||
"pageserver",
|
||||
"postgres_ffi",
|
||||
"svg_fmt",
|
||||
"tokio",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
@@ -2548,6 +2561,7 @@ dependencies = [
|
||||
"metrics",
|
||||
"nix",
|
||||
"num-traits",
|
||||
"num_cpus",
|
||||
"once_cell",
|
||||
"pageserver_api",
|
||||
"pin-project-lite",
|
||||
@@ -2784,7 +2798,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres"
|
||||
version = "0.19.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -2797,7 +2811,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-native-tls"
|
||||
version = "0.5.0"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
|
||||
dependencies = [
|
||||
"native-tls",
|
||||
"tokio",
|
||||
@@ -2808,7 +2822,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-protocol"
|
||||
version = "0.6.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
|
||||
dependencies = [
|
||||
"base64 0.20.0",
|
||||
"byteorder",
|
||||
@@ -2826,7 +2840,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-types"
|
||||
version = "0.2.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -2940,9 +2954,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.58"
|
||||
version = "1.0.64"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fa1fb82fc0c281dd9671101b66b771ebbe1eaf967b96ac8740dcba4b70005ca8"
|
||||
checksum = "78803b62cbf1f46fde80d7c0e803111524b9877184cfe7c3033659490ac7a7da"
|
||||
dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
@@ -3331,9 +3345,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "reqwest-tracing"
|
||||
version = "0.4.4"
|
||||
version = "0.4.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "783e8130d2427ddd7897dd3f814d4a3aea31b05deb42a4fdf8c18258fe5aefd1"
|
||||
checksum = "1b97ad83c2fc18113346b7158d79732242002427c30f620fa817c1f32901e0a8"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
@@ -3858,7 +3872,8 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "sharded-slab"
|
||||
version = "0.1.4"
|
||||
source = "git+https://github.com/neondatabase/sharded-slab.git?rev=98d16753ab01c61f0a028de44167307a00efea00#98d16753ab01c61f0a028de44167307a00efea00"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "900fba806f70c630b0a382d0d825e17a0f19fcd059a2ade1ff237bcddf446b31"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
]
|
||||
@@ -4001,7 +4016,7 @@ dependencies = [
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tonic 0.9.2",
|
||||
"tonic-build 0.9.2",
|
||||
"tonic-build",
|
||||
"tracing",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
@@ -4102,7 +4117,7 @@ checksum = "4b55807c0344e1e6c04d7c965f5289c39a8d94ae23ed5c0b57aabac549f871c6"
|
||||
dependencies = [
|
||||
"filetime",
|
||||
"libc",
|
||||
"xattr",
|
||||
"xattr 0.2.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4315,7 +4330,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "tokio-postgres"
|
||||
version = "0.7.7"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
@@ -4383,16 +4398,17 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tokio-tar"
|
||||
version = "0.3.0"
|
||||
source = "git+https://github.com/neondatabase/tokio-tar.git?rev=404df61437de0feef49ba2ccdbdd94eb8ad6e142#404df61437de0feef49ba2ccdbdd94eb8ad6e142"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9d5714c010ca3e5c27114c1cdeb9d14641ace49874aa5626d7149e47aedace75"
|
||||
dependencies = [
|
||||
"filetime",
|
||||
"futures-core",
|
||||
"libc",
|
||||
"redox_syscall 0.2.16",
|
||||
"redox_syscall 0.3.5",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"xattr",
|
||||
"xattr 1.0.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4519,19 +4535,6 @@ dependencies = [
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tonic-build"
|
||||
version = "0.8.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5bf5e9b9c0f7e0a7c027dcfaba7b2c60816c7049171f679d99ee2ff65d0de8c4"
|
||||
dependencies = [
|
||||
"prettyplease 0.1.25",
|
||||
"proc-macro2",
|
||||
"prost-build",
|
||||
"quote",
|
||||
"syn 1.0.109",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tonic-build"
|
||||
version = "0.9.2"
|
||||
@@ -4655,9 +4658,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tracing-opentelemetry"
|
||||
version = "0.18.0"
|
||||
version = "0.19.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "21ebb87a95ea13271332df069020513ab70bdb5637ca42d6e492dc3bbbad48de"
|
||||
checksum = "00a39dcf9bfc1742fa4d6215253b33a6e474be78275884c216fc2a06267b3600"
|
||||
dependencies = [
|
||||
"once_cell",
|
||||
"opentelemetry",
|
||||
@@ -4882,6 +4885,7 @@ dependencies = [
|
||||
"tempfile",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tracing",
|
||||
"tracing-error",
|
||||
"tracing-subscriber",
|
||||
@@ -5308,6 +5312,7 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bytes",
|
||||
"cc",
|
||||
"chrono",
|
||||
"clap",
|
||||
"clap_builder",
|
||||
@@ -5379,6 +5384,15 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "xattr"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ea263437ca03c1522846a4ddafbca2542d0ad5ed9b784909d4b27b76f62bc34a"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "xmlparser"
|
||||
version = "0.13.5"
|
||||
@@ -5399,3 +5413,33 @@ name = "zeroize"
|
||||
version = "1.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9"
|
||||
|
||||
[[package]]
|
||||
name = "zstd"
|
||||
version = "0.12.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c"
|
||||
dependencies = [
|
||||
"zstd-safe",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zstd-safe"
|
||||
version = "6.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"zstd-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zstd-sys"
|
||||
version = "2.0.8+zstd.1.5.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
"pkg-config",
|
||||
]
|
||||
|
||||
29
Cargo.toml
29
Cargo.toml
@@ -84,9 +84,9 @@ notify = "5.0.0"
|
||||
num_cpus = "1.15"
|
||||
num-traits = "0.2.15"
|
||||
once_cell = "1.13"
|
||||
opentelemetry = "0.18.0"
|
||||
opentelemetry-otlp = { version = "0.11.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
||||
opentelemetry-semantic-conventions = "0.10.0"
|
||||
opentelemetry = "0.19.0"
|
||||
opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
||||
opentelemetry-semantic-conventions = "0.11.0"
|
||||
parking_lot = "0.12"
|
||||
pbkdf2 = "0.12.1"
|
||||
pin-project-lite = "0.2"
|
||||
@@ -95,7 +95,7 @@ prost = "0.11"
|
||||
rand = "0.8"
|
||||
regex = "1.4"
|
||||
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
|
||||
reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_18"] }
|
||||
reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] }
|
||||
reqwest-middleware = "0.2.0"
|
||||
reqwest-retry = "0.2.2"
|
||||
routerify = "3"
|
||||
@@ -124,13 +124,14 @@ tokio-io-timeout = "1.2.0"
|
||||
tokio-postgres-rustls = "0.9.0"
|
||||
tokio-rustls = "0.23"
|
||||
tokio-stream = "0.1"
|
||||
tokio-tar = "0.3"
|
||||
tokio-util = { version = "0.7", features = ["io"] }
|
||||
toml = "0.7"
|
||||
toml_edit = "0.19"
|
||||
tonic = {version = "0.9", features = ["tls", "tls-roots"]}
|
||||
tracing = "0.1"
|
||||
tracing-error = "0.2.0"
|
||||
tracing-opentelemetry = "0.18.0"
|
||||
tracing-opentelemetry = "0.19.0"
|
||||
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter"] }
|
||||
url = "2.2"
|
||||
uuid = { version = "1.2", features = ["v4", "serde"] }
|
||||
@@ -143,12 +144,11 @@ env_logger = "0.10"
|
||||
log = "0.4"
|
||||
|
||||
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
|
||||
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
|
||||
tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" }
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
|
||||
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
|
||||
|
||||
## Other git libraries
|
||||
heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
|
||||
@@ -183,12 +183,7 @@ tonic-build = "0.9"
|
||||
|
||||
# This is only needed for proxy's tests.
|
||||
# TODO: we should probably fork `tokio-postgres-rustls` instead.
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
|
||||
|
||||
# Changes the MAX_THREADS limit from 4096 to 32768.
|
||||
# This is a temporary workaround for using tracing from many threads in safekeepers code,
|
||||
# until async safekeepers patch is merged to the main.
|
||||
sharded-slab = { git = "https://github.com/neondatabase/sharded-slab.git", rev="98d16753ab01c61f0a028de44167307a00efea00" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
|
||||
|
||||
################# Binary contents sections
|
||||
|
||||
|
||||
@@ -13,7 +13,7 @@ FROM debian:bullseye-slim AS build-deps
|
||||
RUN apt update && \
|
||||
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
|
||||
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \
|
||||
libicu-dev libxslt1-dev liblz4-dev libzstd-dev
|
||||
libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -77,6 +77,7 @@ ENV PATH "/usr/local/pgsql/bin:$PATH"
|
||||
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postgis.tar.gz && \
|
||||
echo "9a2a219da005a1730a39d1959a1c7cec619b1efb009b65be80ffc25bad299068 postgis.tar.gz" | sha256sum --check && \
|
||||
mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
|
||||
./autogen.sh && \
|
||||
./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
@@ -89,17 +90,28 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postg
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control && \
|
||||
mkdir -p /extensions/postgis && \
|
||||
cp /usr/local/pgsql/share/extension/postgis.control /extensions/postgis && \
|
||||
cp /usr/local/pgsql/share/extension/postgis_raster.control /extensions/postgis && \
|
||||
cp /usr/local/pgsql/share/extension/postgis_sfcgal.control /extensions/postgis && \
|
||||
cp /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control /extensions/postgis && \
|
||||
cp /usr/local/pgsql/share/extension/postgis_topology.control /extensions/postgis && \
|
||||
cp /usr/local/pgsql/share/extension/address_standardizer.control /extensions/postgis && \
|
||||
cp /usr/local/pgsql/share/extension/address_standardizer_data_us.control /extensions/postgis
|
||||
|
||||
RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \
|
||||
echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \
|
||||
mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \
|
||||
mkdir build && \
|
||||
cd build && \
|
||||
mkdir build && cd build && \
|
||||
cmake -DCMAKE_BUILD_TYPE=Release .. && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
|
||||
cp /usr/local/pgsql/share/extension/pgrouting.control /extensions/postgis && \
|
||||
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
|
||||
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/postgis.tar.zst -T -
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -419,12 +431,16 @@ RUN apt-get update && \
|
||||
wget https://github.com/ketteq-neon/postgres-exts/archive/e0bd1a9d9313d7120c1b9c7bb15c48c0dede4c4e.tar.gz -O kq_imcx.tar.gz && \
|
||||
echo "dc93a97ff32d152d32737ba7e196d9687041cda15e58ab31344c2f2de8855336 kq_imcx.tar.gz" | sha256sum --check && \
|
||||
mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
|
||||
mkdir build && \
|
||||
cd build && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
|
||||
mkdir build && cd build && \
|
||||
cmake -DCMAKE_BUILD_TYPE=Release .. && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
|
||||
mkdir -p /extensions/kq_imcx && cp /usr/local/pgsql/share/extension/kq_imcx.control /extensions/kq_imcx && \
|
||||
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
|
||||
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/kq_imcx.tar.zst -T -
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -535,10 +551,8 @@ FROM build-deps AS pg-embedding-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
# 2465f831ea1f8d49c1d74f8959adb7fc277d70cd made on 05/07/2023
|
||||
# There is no release tag yet
|
||||
RUN wget https://github.com/neondatabase/pg_embedding/archive/2465f831ea1f8d49c1d74f8959adb7fc277d70cd.tar.gz -O pg_embedding.tar.gz && \
|
||||
echo "047af2b1f664a1e6e37867bd4eeaf5934fa27d6ba3d6c4461efa388ddf7cd1d5 pg_embedding.tar.gz" | sha256sum --check && \
|
||||
RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.5.tar.gz -O pg_embedding.tar.gz && \
|
||||
echo "0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 pg_embedding.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
@@ -553,16 +567,17 @@ RUN wget https://github.com/neondatabase/pg_embedding/archive/2465f831ea1f8d49c1
|
||||
FROM build-deps AS pg-anon-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# Kaniko doesn't allow to do `${from#/usr/local/pgsql/}`, so we use `${from:17}` instead
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgresql_anonymizer-1.1.0.tar.gz -O pg_anon.tar.gz && \
|
||||
echo "08b09d2ff9b962f96c60db7e6f8e79cf7253eb8772516998fc35ece08633d3ad pg_anon.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \
|
||||
find /usr/local/pgsql -type f | sort > /before.txt && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \
|
||||
find /usr/local/pgsql -type f | sort > /after.txt && \
|
||||
/bin/bash -c 'for from in $(comm -13 /before.txt /after.txt); do to=/extensions/anon/${from:17} && mkdir -p $(dirname ${to}) && cp -a ${from} ${to}; done'
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
|
||||
mkdir -p /extensions/anon && cp /usr/local/pgsql/share/extension/anon.control /extensions/anon && \
|
||||
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
|
||||
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/anon.tar.zst -T -
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -754,16 +769,23 @@ RUN rm /usr/local/pgsql/lib/lib*.a
|
||||
# Extenstion only
|
||||
#
|
||||
#########################################################################################
|
||||
FROM python:3.9-slim-bullseye AS generate-ext-index
|
||||
ARG PG_VERSION
|
||||
ARG BUILD_TAG
|
||||
RUN apt update && apt install -y zstd
|
||||
|
||||
# copy the control files here
|
||||
COPY --from=kq-imcx-pg-build /extensions/ /extensions/
|
||||
COPY --from=pg-anon-pg-build /extensions/ /extensions/
|
||||
COPY --from=postgis-build /extensions/ /extensions/
|
||||
COPY scripts/combine_control_files.py ./combine_control_files.py
|
||||
RUN python3 ./combine_control_files.py ${PG_VERSION} ${BUILD_TAG} --public_extensions="anon,postgis"
|
||||
|
||||
FROM scratch AS postgres-extensions
|
||||
# After the transition this layer will include all extensitons.
|
||||
# As for now, it's only for new custom ones
|
||||
#
|
||||
# # Default extensions
|
||||
# COPY --from=postgres-cleanup-layer /usr/local/pgsql/share/extension /usr/local/pgsql/share/extension
|
||||
# COPY --from=postgres-cleanup-layer /usr/local/pgsql/lib /usr/local/pgsql/lib
|
||||
# Custom extensions
|
||||
COPY --from=pg-anon-pg-build /extensions/anon/lib/ /extensions/anon/lib
|
||||
COPY --from=pg-anon-pg-build /extensions/anon/share/extension /extensions/anon/share/extension
|
||||
# As for now, it's only a couple for testing purposses
|
||||
COPY --from=generate-ext-index /extensions/*.tar.zst /extensions/
|
||||
COPY --from=generate-ext-index /ext_index.json /ext_index.json
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
|
||||
2
Makefile
2
Makefile
@@ -108,6 +108,8 @@ postgres-%: postgres-configure-% \
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache install
|
||||
+@echo "Compiling pageinspect $*"
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install
|
||||
+@echo "Compiling amcheck $*"
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install
|
||||
|
||||
.PHONY: postgres-clean-%
|
||||
postgres-clean-%:
|
||||
|
||||
@@ -34,3 +34,4 @@ utils.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
toml_edit.workspace = true
|
||||
remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
|
||||
zstd = "0.12.4"
|
||||
|
||||
@@ -30,7 +30,7 @@
|
||||
//! -C 'postgresql://cloud_admin@localhost/postgres' \
|
||||
//! -S /var/db/postgres/specs/current.json \
|
||||
//! -b /usr/local/bin/postgres \
|
||||
//! -r {"bucket": "my-bucket", "region": "eu-central-1", "endpoint": "http:://localhost:9000"} \
|
||||
//! -r {"bucket": "neon-dev-extensions-eu-central-1", "region": "eu-central-1"}
|
||||
//! ```
|
||||
//!
|
||||
use std::collections::HashMap;
|
||||
@@ -38,7 +38,7 @@ use std::fs::File;
|
||||
use std::panic;
|
||||
use std::path::Path;
|
||||
use std::process::exit;
|
||||
use std::sync::{mpsc, Arc, Condvar, Mutex, OnceLock};
|
||||
use std::sync::{mpsc, Arc, Condvar, Mutex, OnceLock, RwLock};
|
||||
use std::{thread, time::Duration};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
@@ -51,7 +51,6 @@ use compute_api::responses::ComputeStatus;
|
||||
|
||||
use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
|
||||
use compute_tools::configurator::launch_configurator;
|
||||
use compute_tools::extension_server::launch_download_extensions;
|
||||
use compute_tools::extension_server::{get_pg_version, init_remote_storage};
|
||||
use compute_tools::http::api::launch_http_server;
|
||||
use compute_tools::logger::*;
|
||||
@@ -59,43 +58,26 @@ use compute_tools::monitor::launch_monitor;
|
||||
use compute_tools::params::*;
|
||||
use compute_tools::spec::*;
|
||||
|
||||
const BUILD_TAG_DEFAULT: &str = "local";
|
||||
const DEFAULT_REMOTE_EXT_CONFIG: &str = r#"{"bucket": "neon-dev-extensions", "region": "eu-central-1", "endpoint": null, "prefix": "5555"}"#;
|
||||
// this is an arbitrary build tag. Fine as a default / for testing purposes
|
||||
// in-case of not-set environment var
|
||||
const BUILD_TAG_DEFAULT: &str = "5670669815";
|
||||
|
||||
fn main() -> Result<()> {
|
||||
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
|
||||
|
||||
let build_tag = option_env!("BUILD_TAG").unwrap_or(BUILD_TAG_DEFAULT);
|
||||
let build_tag = option_env!("BUILD_TAG")
|
||||
.unwrap_or(BUILD_TAG_DEFAULT)
|
||||
.to_string();
|
||||
info!("build_tag: {build_tag}");
|
||||
|
||||
let matches = cli().get_matches();
|
||||
let pgbin_default = String::from("postgres");
|
||||
let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);
|
||||
|
||||
let remote_ext_config = matches
|
||||
.get_one::<String>("remote-ext-config")
|
||||
.map(|x| x.to_string());
|
||||
// let remote_ext_config =
|
||||
// Some(remote_ext_config.unwrap_or(DEFAULT_REMOTE_EXT_CONFIG.to_string()));
|
||||
|
||||
let remote_ext_config = matches.get_one::<String>("remote-ext-config");
|
||||
let ext_remote_storage = remote_ext_config.map(|x| {
|
||||
init_remote_storage(&x, build_tag)
|
||||
.expect("cannot initialize remote extension storage from config")
|
||||
init_remote_storage(x).expect("cannot initialize remote extension storage from config")
|
||||
});
|
||||
// creds used to connect to remote extensions bucket
|
||||
// let aws_creds = matches.get_one::<String>("awscreds");
|
||||
// if let Some(aws_creds) = aws_creds {
|
||||
// // not sure if this is a bad idea?
|
||||
// let aws_creds_dict: serde_json::Value = serde_json::from_str(aws_creds)?;
|
||||
// std::env::set_var(
|
||||
// "AWS_ACCESS_KEY_ID",
|
||||
// aws_creds_dict["ID"].as_str().expect("config parse error"),
|
||||
// );
|
||||
// std::env::set_var(
|
||||
// "AWS_SECRET_ACCESS_KEY",
|
||||
// aws_creds_dict["key"].as_str().expect("config parse error"),
|
||||
// );
|
||||
// }
|
||||
|
||||
let http_port = *matches
|
||||
.get_one::<u16>("http-port")
|
||||
@@ -214,7 +196,10 @@ fn main() -> Result<()> {
|
||||
state: Mutex::new(new_state),
|
||||
state_changed: Condvar::new(),
|
||||
ext_remote_storage,
|
||||
available_extensions: OnceLock::new(),
|
||||
ext_remote_paths: OnceLock::new(),
|
||||
ext_download_progress: RwLock::new(HashMap::new()),
|
||||
library_index: OnceLock::new(),
|
||||
build_tag,
|
||||
};
|
||||
let compute = Arc::new(compute_node);
|
||||
|
||||
@@ -228,6 +213,13 @@ fn main() -> Result<()> {
|
||||
if !spec_set {
|
||||
// No spec provided, hang waiting for it.
|
||||
info!("no compute spec provided, waiting");
|
||||
|
||||
// TODO this can stall startups in the unlikely event that we bind
|
||||
// this compute node while it's busy prewarming. It's not too
|
||||
// bad because it's just 100ms and unlikely, but it's an
|
||||
// avoidable problem.
|
||||
compute.prewarm_postgres()?;
|
||||
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
while state.status != ComputeStatus::ConfigurationPending {
|
||||
state = compute.state_changed.wait(state).unwrap();
|
||||
@@ -258,12 +250,8 @@ fn main() -> Result<()> {
|
||||
drop(state);
|
||||
|
||||
// Launch remaining service threads
|
||||
let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread");
|
||||
let _configurator_handle =
|
||||
launch_configurator(&compute).expect("cannot launch configurator thread");
|
||||
|
||||
let _download_extensions_handle =
|
||||
launch_download_extensions(&compute).expect("cannot launch download extensions thread");
|
||||
let _monitor_handle = launch_monitor(&compute);
|
||||
let _configurator_handle = launch_configurator(&compute);
|
||||
|
||||
// Start Postgres
|
||||
let mut delay_exit = false;
|
||||
@@ -403,12 +391,6 @@ fn cli() -> clap::Command {
|
||||
.long("remote-ext-config")
|
||||
.value_name("REMOTE_EXT_CONFIG"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("awscreds")
|
||||
.short('k')
|
||||
.long("awscreds")
|
||||
.value_name("AWS_CREDENTIALS"),
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -1,19 +1,22 @@
|
||||
use std::collections::HashSet;
|
||||
use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::io::BufRead;
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
use std::path::Path;
|
||||
use std::process::{Command, Stdio};
|
||||
use std::str::FromStr;
|
||||
use std::sync::{Condvar, Mutex, OnceLock};
|
||||
use std::sync::{Condvar, Mutex, OnceLock, RwLock};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::{DateTime, Utc};
|
||||
use futures::future::join_all;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use futures::StreamExt;
|
||||
use postgres::{Client, NoTls};
|
||||
use regex::Regex;
|
||||
use tokio;
|
||||
use tokio_postgres;
|
||||
use tracing::{info, instrument, warn};
|
||||
use tracing::{error, info, instrument, warn};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
@@ -21,10 +24,11 @@ use compute_api::responses::{ComputeMetrics, ComputeStatus};
|
||||
use compute_api::spec::{ComputeMode, ComputeSpec};
|
||||
use utils::measured_stream::MeasuredReader;
|
||||
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath};
|
||||
|
||||
use crate::pg_helpers::*;
|
||||
use crate::spec::*;
|
||||
use crate::sync_sk::{check_if_synced, ping_safekeeper};
|
||||
use crate::{config, extension_server};
|
||||
|
||||
/// Compute node info shared across several `compute_ctl` threads.
|
||||
@@ -55,8 +59,22 @@ pub struct ComputeNode {
|
||||
pub state_changed: Condvar,
|
||||
/// the S3 bucket that we search for extensions in
|
||||
pub ext_remote_storage: Option<GenericRemoteStorage>,
|
||||
// cached lists of available extensions and libraries
|
||||
pub available_extensions: OnceLock<HashSet<String>>,
|
||||
// (key: extension name, value: path to extension archive in remote storage)
|
||||
pub ext_remote_paths: OnceLock<HashMap<String, RemotePath>>,
|
||||
// (key: library name, value: name of extension containing this library)
|
||||
pub library_index: OnceLock<HashMap<String, String>>,
|
||||
// key: ext_archive_name, value: started download time, download_completed?
|
||||
pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
|
||||
pub build_tag: String,
|
||||
}
|
||||
|
||||
// store some metrics about download size that might impact startup time
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct RemoteExtensionMetrics {
|
||||
num_ext_downloaded: u64,
|
||||
largest_ext_size: u64,
|
||||
total_ext_download_size: u64,
|
||||
prep_extensions_ms: u64,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
@@ -96,6 +114,7 @@ pub struct ParsedSpec {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub pageserver_connstr: String,
|
||||
pub safekeeper_connstrings: Vec<String>,
|
||||
pub storage_auth_token: Option<String>,
|
||||
}
|
||||
|
||||
@@ -113,6 +132,21 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
|
||||
.clone()
|
||||
.or_else(|| spec.cluster.settings.find("neon.pageserver_connstring"))
|
||||
.ok_or("pageserver connstr should be provided")?;
|
||||
let safekeeper_connstrings = if spec.safekeeper_connstrings.is_empty() {
|
||||
if matches!(spec.mode, ComputeMode::Primary) {
|
||||
spec.cluster
|
||||
.settings
|
||||
.find("neon.safekeepers")
|
||||
.ok_or("safekeeper connstrings should be provided")?
|
||||
.split(',')
|
||||
.map(|str| str.to_string())
|
||||
.collect()
|
||||
} else {
|
||||
vec![]
|
||||
}
|
||||
} else {
|
||||
spec.safekeeper_connstrings.clone()
|
||||
};
|
||||
let storage_auth_token = spec.storage_auth_token.clone();
|
||||
let tenant_id: TenantId = if let Some(tenant_id) = spec.tenant_id {
|
||||
tenant_id
|
||||
@@ -138,6 +172,7 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
|
||||
Ok(ParsedSpec {
|
||||
spec,
|
||||
pageserver_connstr,
|
||||
safekeeper_connstrings,
|
||||
storage_auth_token,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
@@ -319,6 +354,102 @@ impl ComputeNode {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn check_safekeepers_synced_async(
|
||||
&self,
|
||||
compute_state: &ComputeState,
|
||||
) -> Result<Option<Lsn>> {
|
||||
// Construct a connection config for each safekeeper
|
||||
let pspec: ParsedSpec = compute_state
|
||||
.pspec
|
||||
.as_ref()
|
||||
.expect("spec must be set")
|
||||
.clone();
|
||||
let sk_connstrs: Vec<String> = pspec.safekeeper_connstrings.clone();
|
||||
let sk_configs = sk_connstrs.into_iter().map(|connstr| {
|
||||
// Format connstr
|
||||
let id = connstr.clone();
|
||||
let connstr = format!("postgresql://no_user@{}", connstr);
|
||||
let options = format!(
|
||||
"-c timeline_id={} tenant_id={}",
|
||||
pspec.timeline_id, pspec.tenant_id
|
||||
);
|
||||
|
||||
// Construct client
|
||||
let mut config = tokio_postgres::Config::from_str(&connstr).unwrap();
|
||||
config.options(&options);
|
||||
if let Some(storage_auth_token) = pspec.storage_auth_token.clone() {
|
||||
config.password(storage_auth_token);
|
||||
}
|
||||
|
||||
(id, config)
|
||||
});
|
||||
|
||||
// Create task set to query all safekeepers
|
||||
let mut tasks = FuturesUnordered::new();
|
||||
let quorum = sk_configs.len() / 2 + 1;
|
||||
for (id, config) in sk_configs {
|
||||
let timeout = tokio::time::Duration::from_millis(100);
|
||||
let task = tokio::time::timeout(timeout, ping_safekeeper(id, config));
|
||||
tasks.push(tokio::spawn(task));
|
||||
}
|
||||
|
||||
// Get a quorum of responses or errors
|
||||
let mut responses = Vec::new();
|
||||
let mut join_errors = Vec::new();
|
||||
let mut task_errors = Vec::new();
|
||||
let mut timeout_errors = Vec::new();
|
||||
while let Some(response) = tasks.next().await {
|
||||
match response {
|
||||
Ok(Ok(Ok(r))) => responses.push(r),
|
||||
Ok(Ok(Err(e))) => task_errors.push(e),
|
||||
Ok(Err(e)) => timeout_errors.push(e),
|
||||
Err(e) => join_errors.push(e),
|
||||
};
|
||||
if responses.len() >= quorum {
|
||||
break;
|
||||
}
|
||||
if join_errors.len() + task_errors.len() + timeout_errors.len() >= quorum {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// In case of error, log and fail the check, but don't crash.
|
||||
// We're playing it safe because these errors could be transient
|
||||
// and we don't yet retry. Also being careful here allows us to
|
||||
// be backwards compatible with safekeepers that don't have the
|
||||
// TIMELINE_STATUS API yet.
|
||||
if responses.len() < quorum {
|
||||
error!(
|
||||
"failed sync safekeepers check {:?} {:?} {:?}",
|
||||
join_errors, task_errors, timeout_errors
|
||||
);
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
Ok(check_if_synced(responses))
|
||||
}
|
||||
|
||||
// Fast path for sync_safekeepers. If they're already synced we get the lsn
|
||||
// in one roundtrip. If not, we should do a full sync_safekeepers.
|
||||
pub fn check_safekeepers_synced(&self, compute_state: &ComputeState) -> Result<Option<Lsn>> {
|
||||
let start_time = Utc::now();
|
||||
|
||||
// Run actual work with new tokio runtime
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("failed to create rt");
|
||||
let result = rt.block_on(self.check_safekeepers_synced_async(compute_state));
|
||||
|
||||
// Record runtime
|
||||
self.state.lock().unwrap().metrics.sync_sk_check_ms = Utc::now()
|
||||
.signed_duration_since(start_time)
|
||||
.to_std()
|
||||
.unwrap()
|
||||
.as_millis() as u64;
|
||||
result
|
||||
}
|
||||
|
||||
// Run `postgres` in a special mode with `--sync-safekeepers` argument
|
||||
// and return the reported LSN back to the caller.
|
||||
#[instrument(skip_all)]
|
||||
@@ -389,10 +520,14 @@ impl ComputeNode {
|
||||
// cannot sync safekeepers.
|
||||
let lsn = match spec.mode {
|
||||
ComputeMode::Primary => {
|
||||
info!("starting safekeepers syncing");
|
||||
let lsn = self
|
||||
.sync_safekeepers(pspec.storage_auth_token.clone())
|
||||
.with_context(|| "failed to sync safekeepers")?;
|
||||
info!("checking if safekeepers are synced");
|
||||
let lsn = if let Ok(Some(lsn)) = self.check_safekeepers_synced(compute_state) {
|
||||
lsn
|
||||
} else {
|
||||
info!("starting safekeepers syncing");
|
||||
self.sync_safekeepers(pspec.storage_auth_token.clone())
|
||||
.with_context(|| "failed to sync safekeepers")?
|
||||
};
|
||||
info!("safekeepers synced at LSN {}", lsn);
|
||||
lsn
|
||||
}
|
||||
@@ -430,6 +565,50 @@ impl ComputeNode {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Start and stop a postgres process to warm up the VM for startup.
|
||||
pub fn prewarm_postgres(&self) -> Result<()> {
|
||||
info!("prewarming");
|
||||
|
||||
// Create pgdata
|
||||
let pgdata = &format!("{}.warmup", self.pgdata);
|
||||
create_pgdata(pgdata)?;
|
||||
|
||||
// Run initdb to completion
|
||||
info!("running initdb");
|
||||
let initdb_bin = Path::new(&self.pgbin).parent().unwrap().join("initdb");
|
||||
Command::new(initdb_bin)
|
||||
.args(["-D", pgdata])
|
||||
.output()
|
||||
.expect("cannot start initdb process");
|
||||
|
||||
// Write conf
|
||||
use std::io::Write;
|
||||
let conf_path = Path::new(pgdata).join("postgresql.conf");
|
||||
let mut file = std::fs::File::create(conf_path)?;
|
||||
writeln!(file, "shared_buffers=65536")?;
|
||||
writeln!(file, "port=51055")?; // Nobody should be connecting
|
||||
writeln!(file, "shared_preload_libraries = 'neon'")?;
|
||||
|
||||
// Start postgres
|
||||
info!("starting postgres");
|
||||
let mut pg = Command::new(&self.pgbin)
|
||||
.args(["-D", pgdata])
|
||||
.spawn()
|
||||
.expect("cannot start postgres process");
|
||||
|
||||
// Stop it when it's ready
|
||||
info!("waiting for postgres");
|
||||
wait_for_postgres(&mut pg, Path::new(pgdata))?;
|
||||
pg.kill()?;
|
||||
info!("sent kill signal");
|
||||
pg.wait()?;
|
||||
info!("done prewarming");
|
||||
|
||||
// clean up
|
||||
let _ok = fs::remove_dir_all(pgdata);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Start Postgres as a child process and manage DBs/roles.
|
||||
/// After that this will hang waiting on the postmaster process to exit.
|
||||
#[instrument(skip_all)]
|
||||
@@ -569,7 +748,7 @@ impl ComputeNode {
|
||||
// remote shared_preload_libraries before postgres start (if any)
|
||||
{
|
||||
let library_load_start_time = Utc::now();
|
||||
self.prepare_preload_libraries(&compute_state)?;
|
||||
let remote_ext_metrics = self.prepare_preload_libraries(&compute_state)?;
|
||||
|
||||
let library_load_time = Utc::now()
|
||||
.signed_duration_since(library_load_start_time)
|
||||
@@ -577,11 +756,16 @@ impl ComputeNode {
|
||||
.unwrap()
|
||||
.as_millis() as u64;
|
||||
let mut state = self.state.lock().unwrap();
|
||||
state.metrics.load_libraries_ms = library_load_time;
|
||||
state.metrics.load_ext_ms = library_load_time;
|
||||
state.metrics.num_ext_downloaded = remote_ext_metrics.num_ext_downloaded;
|
||||
state.metrics.largest_ext_size = remote_ext_metrics.largest_ext_size;
|
||||
state.metrics.total_ext_download_size = remote_ext_metrics.total_ext_download_size;
|
||||
state.metrics.prep_extensions_ms = remote_ext_metrics.prep_extensions_ms;
|
||||
info!(
|
||||
"Loading shared_preload_libraries took {:?}ms",
|
||||
library_load_time
|
||||
);
|
||||
info!("{:?}", remote_ext_metrics);
|
||||
}
|
||||
|
||||
self.prepare_pgdata(&compute_state, extension_server_port)?;
|
||||
@@ -735,46 +919,133 @@ LIMIT 100",
|
||||
|
||||
// If remote extension storage is configured,
|
||||
// download extension control files
|
||||
#[tokio::main]
|
||||
pub async fn prepare_external_extensions(&self, compute_state: &ComputeState) -> Result<()> {
|
||||
if let Some(ref ext_remote_storage) = self.ext_remote_storage {
|
||||
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
let spec = &pspec.spec;
|
||||
let custom_ext_prefixes = spec.custom_extensions.clone().unwrap_or(Vec::new());
|
||||
info!("custom_ext_prefixes: {:?}", &custom_ext_prefixes);
|
||||
let available_extensions = extension_server::get_available_extensions(
|
||||
let custom_ext = spec.custom_extensions.clone().unwrap_or(Vec::new());
|
||||
info!("custom extensions: {:?}", &custom_ext);
|
||||
let (ext_remote_paths, library_index) = extension_server::get_available_extensions(
|
||||
ext_remote_storage,
|
||||
&self.pgbin,
|
||||
&self.pgversion,
|
||||
&custom_ext_prefixes,
|
||||
&custom_ext,
|
||||
&self.build_tag,
|
||||
)
|
||||
.await?;
|
||||
self.available_extensions
|
||||
.set(available_extensions)
|
||||
.expect("available_extensions.set error");
|
||||
self.ext_remote_paths
|
||||
.set(ext_remote_paths)
|
||||
.expect("this is the only time we set ext_remote_paths");
|
||||
self.library_index
|
||||
.set(library_index)
|
||||
.expect("this is the only time we set library_index");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn download_extension(&self, ext_name: &str) -> Result<()> {
|
||||
// download an archive, unzip and place files in correct locations
|
||||
pub async fn download_extension(&self, ext_name: &str, is_library: bool) -> Result<u64> {
|
||||
match &self.ext_remote_storage {
|
||||
None => anyhow::bail!("No remote extension storage"),
|
||||
Some(remote_storage) => {
|
||||
extension_server::download_extension(
|
||||
ext_name,
|
||||
let mut real_ext_name = ext_name.to_string();
|
||||
if is_library {
|
||||
// sometimes library names might have a suffix like
|
||||
// library.so or library.so.3. We strip this off
|
||||
// because library_index is based on the name without the file extension
|
||||
let strip_lib_suffix = Regex::new(r"\.so.*").unwrap();
|
||||
let lib_raw_name = strip_lib_suffix.replace(&real_ext_name, "").to_string();
|
||||
real_ext_name = self
|
||||
.library_index
|
||||
.get()
|
||||
.expect("must have already downloaded the library_index")[&lib_raw_name]
|
||||
.clone();
|
||||
}
|
||||
|
||||
let ext_path = &self
|
||||
.ext_remote_paths
|
||||
.get()
|
||||
.expect("error accessing ext_remote_paths")[&real_ext_name];
|
||||
let ext_archive_name = ext_path.object_name().expect("bad path");
|
||||
|
||||
let mut first_try = false;
|
||||
if !self
|
||||
.ext_download_progress
|
||||
.read()
|
||||
.expect("lock err")
|
||||
.contains_key(ext_archive_name)
|
||||
{
|
||||
self.ext_download_progress
|
||||
.write()
|
||||
.expect("lock err")
|
||||
.insert(ext_archive_name.to_string(), (Utc::now(), false));
|
||||
first_try = true;
|
||||
}
|
||||
let (download_start, download_completed) =
|
||||
self.ext_download_progress.read().expect("lock err")[ext_archive_name];
|
||||
let start_time_delta = Utc::now()
|
||||
.signed_duration_since(download_start)
|
||||
.to_std()
|
||||
.unwrap()
|
||||
.as_millis() as u64;
|
||||
|
||||
// how long to wait for extension download if it was started by another process
|
||||
const HANG_TIMEOUT: u64 = 3000; // milliseconds
|
||||
|
||||
if download_completed {
|
||||
info!("extension already downloaded, skipping re-download");
|
||||
return Ok(0);
|
||||
} else if start_time_delta < HANG_TIMEOUT && !first_try {
|
||||
info!("download {ext_archive_name} already started by another process, hanging untill completion or timeout");
|
||||
let mut interval =
|
||||
tokio::time::interval(tokio::time::Duration::from_millis(500));
|
||||
loop {
|
||||
info!("waiting for download");
|
||||
interval.tick().await;
|
||||
let (_, download_completed_now) =
|
||||
self.ext_download_progress.read().expect("lock")[ext_archive_name];
|
||||
if download_completed_now {
|
||||
info!("download finished by whoever else downloaded it");
|
||||
return Ok(0);
|
||||
}
|
||||
}
|
||||
// NOTE: the above loop will get terminated
|
||||
// based on the timeout of the download function
|
||||
}
|
||||
|
||||
// if extension hasn't been downloaded before or the previous
|
||||
// attempt to download was at least HANG_TIMEOUT ms ago
|
||||
// then we try to download it here
|
||||
info!("downloading new extension {ext_archive_name}");
|
||||
|
||||
let download_size = extension_server::download_extension(
|
||||
&real_ext_name,
|
||||
ext_path,
|
||||
remote_storage,
|
||||
&self.pgbin,
|
||||
&self.pgversion,
|
||||
)
|
||||
.await
|
||||
.await;
|
||||
self.ext_download_progress
|
||||
.write()
|
||||
.expect("bad lock")
|
||||
.insert(ext_archive_name.to_string(), (download_start, true));
|
||||
download_size
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
pub async fn prepare_preload_libraries(&self, compute_state: &ComputeState) -> Result<()> {
|
||||
pub async fn prepare_preload_libraries(
|
||||
&self,
|
||||
compute_state: &ComputeState,
|
||||
) -> Result<RemoteExtensionMetrics> {
|
||||
if self.ext_remote_storage.is_none() {
|
||||
return Ok(());
|
||||
return Ok(RemoteExtensionMetrics {
|
||||
num_ext_downloaded: 0,
|
||||
largest_ext_size: 0,
|
||||
total_ext_download_size: 0,
|
||||
prep_extensions_ms: 0,
|
||||
});
|
||||
}
|
||||
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
let spec = &pspec.spec;
|
||||
@@ -809,15 +1080,36 @@ LIMIT 100",
|
||||
libs_vec.extend(preload_libs_vec);
|
||||
}
|
||||
|
||||
info!("Download ext_index.json, find the extension paths");
|
||||
let prep_ext_start_time = Utc::now();
|
||||
self.prepare_external_extensions(compute_state).await?;
|
||||
let prep_ext_time_delta = Utc::now()
|
||||
.signed_duration_since(prep_ext_start_time)
|
||||
.to_std()
|
||||
.unwrap()
|
||||
.as_millis() as u64;
|
||||
info!("Prepare extensions took {prep_ext_time_delta}ms");
|
||||
|
||||
info!("Downloading to shared preload libraries: {:?}", &libs_vec);
|
||||
let mut download_tasks = Vec::new();
|
||||
for library in &libs_vec {
|
||||
download_tasks.push(self.download_extension(library));
|
||||
download_tasks.push(self.download_extension(library, true));
|
||||
}
|
||||
let results = join_all(download_tasks).await;
|
||||
|
||||
let mut remote_ext_metrics = RemoteExtensionMetrics {
|
||||
num_ext_downloaded: 0,
|
||||
largest_ext_size: 0,
|
||||
total_ext_download_size: 0,
|
||||
prep_extensions_ms: prep_ext_time_delta,
|
||||
};
|
||||
for result in results {
|
||||
result?; // propogate any errors
|
||||
let download_size = result?;
|
||||
remote_ext_metrics.num_ext_downloaded += 1;
|
||||
remote_ext_metrics.largest_ext_size =
|
||||
std::cmp::max(remote_ext_metrics.largest_ext_size, download_size);
|
||||
remote_ext_metrics.total_ext_download_size += download_size;
|
||||
}
|
||||
Ok(())
|
||||
Ok(remote_ext_metrics)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
|
||||
use anyhow::Result;
|
||||
use tracing::{error, info, instrument};
|
||||
|
||||
use compute_api::responses::ComputeStatus;
|
||||
@@ -42,9 +41,7 @@ fn configurator_main_loop(compute: &Arc<ComputeNode>) {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn launch_configurator(
|
||||
compute: &Arc<ComputeNode>,
|
||||
) -> Result<thread::JoinHandle<()>, std::io::Error> {
|
||||
pub fn launch_configurator(compute: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
|
||||
let compute = Arc::clone(compute);
|
||||
|
||||
thread::Builder::new()
|
||||
@@ -53,4 +50,5 @@ pub fn launch_configurator(
|
||||
configurator_main_loop(&compute);
|
||||
info!("configurator thread is exited");
|
||||
})
|
||||
.expect("cannot launch configurator thread")
|
||||
}
|
||||
|
||||
@@ -1,35 +1,91 @@
|
||||
// Download extension files from the extension store
|
||||
// and put them in the right place in the postgres directory
|
||||
// and put them in the right place in the postgres directory (share / lib)
|
||||
/*
|
||||
The layout of the S3 bucket is as follows:
|
||||
5615610098 // this is an extension build number
|
||||
├── v14
|
||||
│ ├── extensions
|
||||
│ │ ├── anon.tar.zst
|
||||
│ │ └── embedding.tar.zst
|
||||
│ └── ext_index.json
|
||||
└── v15
|
||||
├── extensions
|
||||
│ ├── anon.tar.zst
|
||||
│ └── embedding.tar.zst
|
||||
└── ext_index.json
|
||||
5615261079
|
||||
├── v14
|
||||
│ ├── extensions
|
||||
│ │ └── anon.tar.zst
|
||||
│ └── ext_index.json
|
||||
└── v15
|
||||
├── extensions
|
||||
│ └── anon.tar.zst
|
||||
└── ext_index.json
|
||||
5623261088
|
||||
├── v14
|
||||
│ ├── extensions
|
||||
│ │ └── embedding.tar.zst
|
||||
│ └── ext_index.json
|
||||
└── v15
|
||||
├── extensions
|
||||
│ └── embedding.tar.zst
|
||||
└── ext_index.json
|
||||
|
||||
v14/ext_index.json
|
||||
-- this contains information necessary to create control files
|
||||
v14/extensions/test_ext1.tar.gz
|
||||
-- this contains the library files and sql files necessary to create this extension
|
||||
v14/extensions/custom_ext1.tar.gz
|
||||
Note that build number cannot be part of prefix because we might need extensions
|
||||
from other build numbers.
|
||||
|
||||
The difference between a private and public extensions is determined by who can
|
||||
load the extension this is specified in ext_index.json
|
||||
ext_index.json stores the control files and location of extension archives
|
||||
It also stores a list of public extensions and a library_index
|
||||
|
||||
Speicially, ext_index.json has a list of public extensions, and a list of
|
||||
extensions enabled for specific tenant-ids.
|
||||
We don't need to duplicate extension.tar.zst files.
|
||||
We only need to upload a new one if it is updated.
|
||||
(Although currently we just upload every time anyways, hopefully will change
|
||||
this sometime)
|
||||
|
||||
*access* is controlled by spec
|
||||
|
||||
More specifically, here is an example ext_index.json
|
||||
{
|
||||
"public_extensions": [
|
||||
"anon",
|
||||
"pg_buffercache"
|
||||
],
|
||||
"library_index": {
|
||||
"anon": "anon",
|
||||
"pg_buffercache": "pg_buffercache"
|
||||
},
|
||||
"extension_data": {
|
||||
"pg_buffercache": {
|
||||
"control_data": {
|
||||
"pg_buffercache.control": "# pg_buffercache extension \ncomment = 'examine the shared buffer cache' \ndefault_version = '1.3' \nmodule_pathname = '$libdir/pg_buffercache' \nrelocatable = true \ntrusted=true"
|
||||
},
|
||||
"archive_path": "5670669815/v14/extensions/pg_buffercache.tar.zst"
|
||||
},
|
||||
"anon": {
|
||||
"control_data": {
|
||||
"anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
|
||||
},
|
||||
"archive_path": "5670669815/v14/extensions/anon.tar.zst"
|
||||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
use crate::compute::ComputeNode;
|
||||
use anyhow::Context;
|
||||
use anyhow::{self, Result};
|
||||
use flate2::read::GzDecoder;
|
||||
use futures::future::join_all;
|
||||
use remote_storage::*;
|
||||
use serde_json::{self, Value};
|
||||
use std::collections::HashSet;
|
||||
use serde_json;
|
||||
use std::collections::HashMap;
|
||||
use std::io::Read;
|
||||
use std::num::{NonZeroU32, NonZeroUsize};
|
||||
use std::path::Path;
|
||||
use std::str;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
use tar::Archive;
|
||||
use tokio::io::AsyncReadExt;
|
||||
use tracing::info;
|
||||
use tracing::log::warn;
|
||||
use zstd::stream::read::Decoder;
|
||||
|
||||
fn get_pg_config(argument: &str, pgbin: &str) -> String {
|
||||
// gives the result of `pg_config [argument]`
|
||||
@@ -61,154 +117,152 @@ pub fn get_pg_version(pgbin: &str) -> String {
|
||||
panic!("Unsuported postgres version {human_version}");
|
||||
}
|
||||
|
||||
// download extension control files
|
||||
// if custom_ext_prefixes is provided - search also in custom extension paths
|
||||
// download control files for enabled_extensions
|
||||
// return Hashmaps converting library names to extension names (library_index)
|
||||
// and specifying the remote path to the archive for each extension name
|
||||
pub async fn get_available_extensions(
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
pgbin: &str,
|
||||
pg_version: &str,
|
||||
custom_ext_prefixes: &[String],
|
||||
) -> Result<HashSet<String>> {
|
||||
custom_extensions: &[String],
|
||||
build_tag: &str,
|
||||
) -> Result<(HashMap<String, RemotePath>, HashMap<String, String>)> {
|
||||
let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
|
||||
let index_path = pg_version.to_owned() + "/ext_index.json";
|
||||
let index_path = format!("{build_tag}/{pg_version}/ext_index.json");
|
||||
let index_path = RemotePath::new(Path::new(&index_path)).context("error forming path")?;
|
||||
info!("download ext_index.json: {:?}", &index_path);
|
||||
info!("download ext_index.json from: {:?}", &index_path);
|
||||
|
||||
// TODO: potential optimization: cache ext_index.json
|
||||
let mut download = remote_storage.download(&index_path).await?;
|
||||
let mut write_data_buffer = Vec::new();
|
||||
let mut ext_idx_buffer = Vec::new();
|
||||
download
|
||||
.download_stream
|
||||
.read_to_end(&mut write_data_buffer)
|
||||
.read_to_end(&mut ext_idx_buffer)
|
||||
.await?;
|
||||
let ext_index_str = match str::from_utf8(&write_data_buffer) {
|
||||
Ok(v) => v,
|
||||
Err(e) => panic!("Invalid UTF-8 sequence: {}", e),
|
||||
};
|
||||
info!("ext_index downloaded");
|
||||
|
||||
let ext_index_full: Value = serde_json::from_str(ext_index_str)?;
|
||||
let ext_index_full = ext_index_full.as_object().context("error parsing json")?;
|
||||
let control_data = ext_index_full["control_data"]
|
||||
.as_object()
|
||||
.context("json parse error")?;
|
||||
let enabled_extensions = ext_index_full["enabled_extensions"]
|
||||
.as_object()
|
||||
.context("json parse error")?;
|
||||
info!("{:?}", control_data.clone());
|
||||
info!("{:?}", enabled_extensions.clone());
|
||||
#[derive(Debug, serde::Deserialize)]
|
||||
struct Index {
|
||||
public_extensions: Vec<String>,
|
||||
library_index: HashMap<String, String>,
|
||||
extension_data: HashMap<String, ExtensionData>,
|
||||
}
|
||||
|
||||
let mut prefixes = vec!["public".to_string()];
|
||||
prefixes.extend(custom_ext_prefixes.to_owned());
|
||||
info!("{:?}", &prefixes);
|
||||
let mut all_extensions = HashSet::new();
|
||||
for prefix in prefixes {
|
||||
let prefix_extensions = match enabled_extensions.get(&prefix) {
|
||||
Some(Value::Array(ext_name)) => ext_name,
|
||||
_ => {
|
||||
info!("prefix {} has no extensions", prefix);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
info!("{:?}", prefix_extensions);
|
||||
for ext_name in prefix_extensions {
|
||||
all_extensions.insert(ext_name.as_str().context("json parse error")?.to_string());
|
||||
#[derive(Debug, serde::Deserialize)]
|
||||
struct ExtensionData {
|
||||
control_data: HashMap<String, String>,
|
||||
archive_path: String,
|
||||
}
|
||||
|
||||
let ext_index_full = serde_json::from_slice::<Index>(&ext_idx_buffer)?;
|
||||
let mut enabled_extensions = ext_index_full.public_extensions;
|
||||
enabled_extensions.extend_from_slice(custom_extensions);
|
||||
let library_index = ext_index_full.library_index;
|
||||
let all_extension_data = ext_index_full.extension_data;
|
||||
info!("library_index: {:?}", library_index);
|
||||
|
||||
info!("enabled_extensions: {:?}", enabled_extensions);
|
||||
let mut ext_remote_paths = HashMap::new();
|
||||
let mut file_create_tasks = Vec::new();
|
||||
for extension in enabled_extensions {
|
||||
let ext_data = &all_extension_data[&extension];
|
||||
for (control_file, control_contents) in &ext_data.control_data {
|
||||
let extension_name = control_file
|
||||
.strip_suffix(".control")
|
||||
.expect("control files must end in .control");
|
||||
ext_remote_paths.insert(
|
||||
extension_name.to_string(),
|
||||
RemotePath::from_string(&ext_data.archive_path)?,
|
||||
);
|
||||
let control_path = local_sharedir.join(control_file);
|
||||
info!("writing file {:?}{:?}", control_path, control_contents);
|
||||
file_create_tasks.push(tokio::fs::write(control_path, control_contents));
|
||||
}
|
||||
}
|
||||
|
||||
for prefix in &all_extensions {
|
||||
let control_contents = control_data[prefix].as_str().context("json parse error")?;
|
||||
let control_path = local_sharedir.join(prefix.to_owned() + ".control");
|
||||
|
||||
info!("WRITING FILE {:?}{:?}", control_path, control_contents);
|
||||
std::fs::write(control_path, control_contents)?;
|
||||
let results = join_all(file_create_tasks).await;
|
||||
for result in results {
|
||||
result?;
|
||||
}
|
||||
|
||||
Ok(all_extensions.into_iter().collect())
|
||||
info!("ext_remote_paths {:?}", ext_remote_paths);
|
||||
Ok((ext_remote_paths, library_index))
|
||||
}
|
||||
|
||||
// download all sqlfiles (and possibly data files) for a given extension name
|
||||
// download the archive for a given extension,
|
||||
// unzip it, and place files in the appropriate locations (share/lib)
|
||||
pub async fn download_extension(
|
||||
ext_name: &str,
|
||||
ext_path: &RemotePath,
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
pgbin: &str,
|
||||
pg_version: &str,
|
||||
) -> Result<()> {
|
||||
// TODO: potential optimization: only download the extension if it doesn't exist
|
||||
// problem: how would we tell if it exists?
|
||||
let ext_name = ext_name.replace(".so", "");
|
||||
let ext_name_targz = ext_name.to_owned() + ".tar.gz";
|
||||
if Path::new(&ext_name_targz).exists() {
|
||||
info!("extension {:?} already exists", ext_name_targz);
|
||||
return Ok(());
|
||||
}
|
||||
let ext_path = RemotePath::new(
|
||||
&Path::new(pg_version)
|
||||
.join("extensions")
|
||||
.join(ext_name_targz.clone()),
|
||||
)?;
|
||||
info!(
|
||||
"Start downloading extension {:?} from {:?}",
|
||||
ext_name, ext_path
|
||||
);
|
||||
let mut download = remote_storage.download(&ext_path).await?;
|
||||
let mut write_data_buffer = Vec::new();
|
||||
) -> Result<u64> {
|
||||
info!("Download extension {:?} from {:?}", ext_name, ext_path);
|
||||
let mut download = remote_storage.download(ext_path).await?;
|
||||
let mut download_buffer = Vec::new();
|
||||
download
|
||||
.download_stream
|
||||
.read_to_end(&mut write_data_buffer)
|
||||
.read_to_end(&mut download_buffer)
|
||||
.await?;
|
||||
let unzip_dest = pgbin.strip_suffix("/bin/postgres").expect("bad pgbin");
|
||||
let tar = GzDecoder::new(write_data_buffer.as_slice());
|
||||
let mut archive = Archive::new(tar);
|
||||
archive.unpack(unzip_dest)?;
|
||||
let download_size = download_buffer.len() as u64;
|
||||
// it's unclear whether it is more performant to decompress into memory or not
|
||||
// TODO: decompressing into memory can be avoided
|
||||
let mut decoder = Decoder::new(download_buffer.as_slice())?;
|
||||
let mut decompress_buffer = Vec::new();
|
||||
decoder.read_to_end(&mut decompress_buffer)?;
|
||||
let mut archive = Archive::new(decompress_buffer.as_slice());
|
||||
let unzip_dest = pgbin
|
||||
.strip_suffix("/bin/postgres")
|
||||
.expect("bad pgbin")
|
||||
.to_string()
|
||||
+ "/download_extensions";
|
||||
archive.unpack(&unzip_dest)?;
|
||||
info!("Download + unzip {:?} completed successfully", &ext_path);
|
||||
|
||||
let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
|
||||
let zip_sharedir = format!("{unzip_dest}/extensions/{ext_name}/share/extension");
|
||||
info!("mv {zip_sharedir:?}/* {local_sharedir:?}");
|
||||
for file in std::fs::read_dir(zip_sharedir)? {
|
||||
let old_file = file?.path();
|
||||
let new_file =
|
||||
Path::new(&local_sharedir).join(old_file.file_name().context("error parsing file")?);
|
||||
std::fs::rename(old_file, new_file)?;
|
||||
let sharedir_paths = (
|
||||
unzip_dest.to_string() + "/share/extension",
|
||||
Path::new(&get_pg_config("--sharedir", pgbin)).join("extension"),
|
||||
);
|
||||
let libdir_paths = (
|
||||
unzip_dest.to_string() + "/lib",
|
||||
Path::new(&get_pg_config("--libdir", pgbin)).join("postgresql"),
|
||||
);
|
||||
// move contents of the libdir / sharedir in unzipped archive to the correct local paths
|
||||
for paths in [sharedir_paths, libdir_paths] {
|
||||
let (zip_dir, real_dir) = paths;
|
||||
info!("mv {zip_dir:?}/* {real_dir:?}");
|
||||
for file in std::fs::read_dir(zip_dir)? {
|
||||
let old_file = file?.path();
|
||||
let new_file =
|
||||
Path::new(&real_dir).join(old_file.file_name().context("error parsing file")?);
|
||||
info!("moving {old_file:?} to {new_file:?}");
|
||||
|
||||
// extension download failed: Directory not empty (os error 39)
|
||||
match std::fs::rename(old_file, new_file) {
|
||||
Ok(()) => info!("move succeeded"),
|
||||
Err(e) => {
|
||||
warn!("move failed, probably because the extension already exists: {e}")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
let local_libdir = Path::new(&get_pg_config("--libdir", pgbin)).join("postgresql");
|
||||
let zip_libdir = format!("{unzip_dest}/extensions/{ext_name}/lib");
|
||||
info!("mv {zip_libdir:?}/* {local_libdir:?}");
|
||||
for file in std::fs::read_dir(zip_libdir)? {
|
||||
let old_file = file?.path();
|
||||
let new_file =
|
||||
Path::new(&local_libdir).join(old_file.file_name().context("error parsing file")?);
|
||||
std::fs::rename(old_file, new_file)?;
|
||||
}
|
||||
Ok(())
|
||||
info!("done moving extension {ext_name}");
|
||||
Ok(download_size)
|
||||
}
|
||||
|
||||
// This function initializes the necessary structs to use remmote storage (should be fairly cheap)
|
||||
pub fn init_remote_storage(
|
||||
remote_ext_config: &str,
|
||||
default_prefix: &str,
|
||||
) -> anyhow::Result<GenericRemoteStorage> {
|
||||
let remote_ext_config: serde_json::Value = serde_json::from_str(remote_ext_config)?;
|
||||
// This function initializes the necessary structs to use remote storage
|
||||
pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRemoteStorage> {
|
||||
#[derive(Debug, serde::Deserialize)]
|
||||
struct RemoteExtJson {
|
||||
bucket: String,
|
||||
region: String,
|
||||
endpoint: Option<String>,
|
||||
prefix: Option<String>,
|
||||
}
|
||||
let remote_ext_json = serde_json::from_str::<RemoteExtJson>(remote_ext_config)?;
|
||||
|
||||
let remote_ext_bucket = remote_ext_config["bucket"]
|
||||
.as_str()
|
||||
.context("config parse error")?;
|
||||
let remote_ext_region = remote_ext_config["region"]
|
||||
.as_str()
|
||||
.context("config parse error")?;
|
||||
let remote_ext_endpoint = remote_ext_config["endpoint"].as_str();
|
||||
let remote_ext_prefix = remote_ext_config["prefix"]
|
||||
.as_str()
|
||||
.unwrap_or(default_prefix)
|
||||
.to_string();
|
||||
|
||||
// TODO: potentially allow modification of other parameters
|
||||
// however, default values should be fine for now
|
||||
let config = S3Config {
|
||||
bucket_name: remote_ext_bucket.to_string(),
|
||||
bucket_region: remote_ext_region.to_string(),
|
||||
prefix_in_bucket: Some(remote_ext_prefix),
|
||||
endpoint: remote_ext_endpoint.map(|x| x.to_string()),
|
||||
bucket_name: remote_ext_json.bucket,
|
||||
bucket_region: remote_ext_json.region,
|
||||
prefix_in_bucket: remote_ext_json.prefix,
|
||||
endpoint: remote_ext_json.endpoint,
|
||||
concurrency_limit: NonZeroUsize::new(100).expect("100 != 0"),
|
||||
max_keys_per_list_response: None,
|
||||
};
|
||||
@@ -219,19 +273,3 @@ pub fn init_remote_storage(
|
||||
};
|
||||
GenericRemoteStorage::from_config(&config)
|
||||
}
|
||||
|
||||
pub fn launch_download_extensions(
|
||||
compute: &Arc<ComputeNode>,
|
||||
) -> Result<thread::JoinHandle<()>, std::io::Error> {
|
||||
let compute = Arc::clone(compute);
|
||||
thread::Builder::new()
|
||||
.name("download-extensions".into())
|
||||
.spawn(move || {
|
||||
info!("start download_extension_files");
|
||||
let compute_state = compute.state.lock().expect("error unlocking compute.state");
|
||||
compute
|
||||
.prepare_external_extensions(&compute_state)
|
||||
.expect("error preparing extensions");
|
||||
info!("download_extension_files done, exiting thread");
|
||||
})
|
||||
}
|
||||
|
||||
@@ -125,13 +125,23 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
|
||||
(&Method::POST, route) if route.starts_with("/extension_server/") => {
|
||||
info!("serving {:?} POST request", route);
|
||||
info!("req.uri {:?}", req.uri());
|
||||
let filename = route.split('/').last().unwrap().to_string();
|
||||
info!(
|
||||
"serving /extension_server POST request, filename: {:?}",
|
||||
&filename
|
||||
);
|
||||
|
||||
match compute.download_extension(&filename).await {
|
||||
let mut is_library = false;
|
||||
if let Some(params) = req.uri().query() {
|
||||
info!("serving {:?} POST request with params: {}", route, params);
|
||||
if params == "is_library=true" {
|
||||
is_library = true;
|
||||
} else {
|
||||
let mut resp = Response::new(Body::from("Wrong request parameters"));
|
||||
*resp.status_mut() = StatusCode::BAD_REQUEST;
|
||||
return resp;
|
||||
}
|
||||
}
|
||||
|
||||
let filename = route.split('/').last().unwrap().to_string();
|
||||
info!("serving /extension_server POST request, filename: {filename:?} is_library: {is_library}");
|
||||
|
||||
match compute.download_extension(&filename, is_library).await {
|
||||
Ok(_) => Response::new(Body::from("OK")),
|
||||
Err(e) => {
|
||||
error!("extension download failed: {}", e);
|
||||
|
||||
@@ -14,3 +14,4 @@ pub mod monitor;
|
||||
pub mod params;
|
||||
pub mod pg_helpers;
|
||||
pub mod spec;
|
||||
pub mod sync_sk;
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
use std::sync::Arc;
|
||||
use std::{thread, time};
|
||||
|
||||
use anyhow::Result;
|
||||
use chrono::{DateTime, Utc};
|
||||
use postgres::{Client, NoTls};
|
||||
use tracing::{debug, info};
|
||||
@@ -105,10 +104,11 @@ fn watch_compute_activity(compute: &ComputeNode) {
|
||||
}
|
||||
|
||||
/// Launch a separate compute monitor thread and return its `JoinHandle`.
|
||||
pub fn launch_monitor(state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>, std::io::Error> {
|
||||
pub fn launch_monitor(state: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
|
||||
let state = Arc::clone(state);
|
||||
|
||||
thread::Builder::new()
|
||||
.name("compute-monitor".into())
|
||||
.spawn(move || watch_compute_activity(&state))
|
||||
.expect("cannot launch compute monitor thread")
|
||||
}
|
||||
|
||||
@@ -19,7 +19,7 @@ const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // mil
|
||||
/// Escape a string for including it in a SQL literal. Wrapping the result
|
||||
/// with `E'{}'` or `'{}'` is not required, as it returns a ready-to-use
|
||||
/// SQL string literal, e.g. `'db'''` or `E'db\\'`.
|
||||
/// See https://github.com/postgres/postgres/blob/da98d005cdbcd45af563d0c4ac86d0e9772cd15f/src/backend/utils/adt/quote.c#L47
|
||||
/// See <https://github.com/postgres/postgres/blob/da98d005cdbcd45af563d0c4ac86d0e9772cd15f/src/backend/utils/adt/quote.c#L47>
|
||||
/// for the original implementation.
|
||||
pub fn escape_literal(s: &str) -> String {
|
||||
let res = s.replace('\'', "''").replace('\\', "\\\\");
|
||||
|
||||
98
compute_tools/src/sync_sk.rs
Normal file
98
compute_tools/src/sync_sk.rs
Normal file
@@ -0,0 +1,98 @@
|
||||
// Utils for running sync_safekeepers
|
||||
use anyhow::Result;
|
||||
use tracing::info;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
pub enum TimelineStatusResponse {
|
||||
NotFound,
|
||||
Ok(TimelineStatusOkResponse),
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
pub struct TimelineStatusOkResponse {
|
||||
flush_lsn: Lsn,
|
||||
commit_lsn: Lsn,
|
||||
}
|
||||
|
||||
/// Get a safekeeper's metadata for our timeline. The id is only used for logging
|
||||
pub async fn ping_safekeeper(
|
||||
id: String,
|
||||
config: tokio_postgres::Config,
|
||||
) -> Result<TimelineStatusResponse> {
|
||||
// TODO add retries
|
||||
|
||||
// Connect
|
||||
info!("connecting to {}", id);
|
||||
let (client, conn) = config.connect(tokio_postgres::NoTls).await?;
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = conn.await {
|
||||
eprintln!("connection error: {}", e);
|
||||
}
|
||||
});
|
||||
|
||||
// Query
|
||||
info!("querying {}", id);
|
||||
let result = client.simple_query("TIMELINE_STATUS").await?;
|
||||
|
||||
// Parse result
|
||||
info!("done with {}", id);
|
||||
if let postgres::SimpleQueryMessage::Row(row) = &result[0] {
|
||||
use std::str::FromStr;
|
||||
let response = TimelineStatusResponse::Ok(TimelineStatusOkResponse {
|
||||
flush_lsn: Lsn::from_str(row.get("flush_lsn").unwrap())?,
|
||||
commit_lsn: Lsn::from_str(row.get("commit_lsn").unwrap())?,
|
||||
});
|
||||
Ok(response)
|
||||
} else {
|
||||
// Timeline doesn't exist
|
||||
Ok(TimelineStatusResponse::NotFound)
|
||||
}
|
||||
}
|
||||
|
||||
/// Given a quorum of responses, check if safekeepers are synced at some Lsn
|
||||
pub fn check_if_synced(responses: Vec<TimelineStatusResponse>) -> Option<Lsn> {
|
||||
// Check if all responses are ok
|
||||
let ok_responses: Vec<TimelineStatusOkResponse> = responses
|
||||
.iter()
|
||||
.filter_map(|r| match r {
|
||||
TimelineStatusResponse::Ok(ok_response) => Some(ok_response),
|
||||
_ => None,
|
||||
})
|
||||
.cloned()
|
||||
.collect();
|
||||
if ok_responses.len() < responses.len() {
|
||||
info!(
|
||||
"not synced. Only {} out of {} know about this timeline",
|
||||
ok_responses.len(),
|
||||
responses.len()
|
||||
);
|
||||
return None;
|
||||
}
|
||||
|
||||
// Get the min and the max of everything
|
||||
let commit: Vec<Lsn> = ok_responses.iter().map(|r| r.commit_lsn).collect();
|
||||
let flush: Vec<Lsn> = ok_responses.iter().map(|r| r.flush_lsn).collect();
|
||||
let commit_max = commit.iter().max().unwrap();
|
||||
let commit_min = commit.iter().min().unwrap();
|
||||
let flush_max = flush.iter().max().unwrap();
|
||||
let flush_min = flush.iter().min().unwrap();
|
||||
|
||||
// Check that all values are equal
|
||||
if commit_min != commit_max {
|
||||
info!("not synced. {:?} {:?}", commit_min, commit_max);
|
||||
return None;
|
||||
}
|
||||
if flush_min != flush_max {
|
||||
info!("not synced. {:?} {:?}", flush_min, flush_max);
|
||||
return None;
|
||||
}
|
||||
|
||||
// Check that commit == flush
|
||||
if commit_max != flush_max {
|
||||
info!("not synced. {:?} {:?}", commit_max, flush_max);
|
||||
return None;
|
||||
}
|
||||
|
||||
Some(*commit_max)
|
||||
}
|
||||
@@ -10,7 +10,7 @@
|
||||
//! (non-Neon binaries don't necessarily follow our pidfile conventions).
|
||||
//! The pid stored in the file is later used to stop the service.
|
||||
//!
|
||||
//! See [`lock_file`] module for more info.
|
||||
//! See the [`lock_file`](utils::lock_file) module for more info.
|
||||
|
||||
use std::ffi::OsStr;
|
||||
use std::io::Write;
|
||||
|
||||
@@ -2,8 +2,9 @@
|
||||
//!
|
||||
//! In the local test environment, the data for each safekeeper is stored in
|
||||
//!
|
||||
//! ```text
|
||||
//! .neon/safekeepers/<safekeeper id>
|
||||
//!
|
||||
//! ```
|
||||
use anyhow::Context;
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
@@ -2,7 +2,9 @@
|
||||
//!
|
||||
//! In the local test environment, the data for each endpoint is stored in
|
||||
//!
|
||||
//! ```text
|
||||
//! .neon/endpoints/<endpoint id>
|
||||
//! ```
|
||||
//!
|
||||
//! Some basic information about the endpoint, like the tenant and timeline IDs,
|
||||
//! are stored in the `endpoint.json` file. The `endpoint.json` file is created
|
||||
@@ -22,7 +24,7 @@
|
||||
//!
|
||||
//! Directory contents:
|
||||
//!
|
||||
//! ```ignore
|
||||
//! ```text
|
||||
//! .neon/endpoints/main/
|
||||
//! compute.log - log output of `compute_ctl` and `postgres`
|
||||
//! endpoint.json - serialized `EndpointConf` struct
|
||||
@@ -287,7 +289,7 @@ impl Endpoint {
|
||||
.env
|
||||
.safekeepers
|
||||
.iter()
|
||||
.map(|sk| format!("localhost:{}", sk.pg_port))
|
||||
.map(|sk| format!("localhost:{}", sk.get_compute_port()))
|
||||
.collect::<Vec<String>>()
|
||||
.join(",");
|
||||
conf.append("neon.safekeepers", &safekeepers);
|
||||
@@ -316,7 +318,7 @@ impl Endpoint {
|
||||
.env
|
||||
.safekeepers
|
||||
.iter()
|
||||
.map(|x| x.pg_port.to_string())
|
||||
.map(|x| x.get_compute_port().to_string())
|
||||
.collect::<Vec<_>>()
|
||||
.join(",");
|
||||
let sk_hosts = vec!["localhost"; self.env.safekeepers.len()].join(",");
|
||||
@@ -466,7 +468,7 @@ impl Endpoint {
|
||||
.iter()
|
||||
.find(|node| node.id == sk_id)
|
||||
.ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?;
|
||||
safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.pg_port));
|
||||
safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.get_compute_port()));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -491,15 +493,7 @@ impl Endpoint {
|
||||
pageserver_connstring: Some(pageserver_connstring),
|
||||
safekeeper_connstrings,
|
||||
storage_auth_token: auth_token.clone(),
|
||||
// TODO FIXME: This is a hack to test custom extensions locally.
|
||||
// In test_download_extensions, we assume that the custom extension
|
||||
// prefix is the tenant ID. So we set it here.
|
||||
//
|
||||
// The proper way to implement this is to pass the custom extension
|
||||
// in spec, but we don't have a way to do that yet in the python tests.
|
||||
// NEW HACK: we enable the anon custom extension for everyone! this is of course just for testing
|
||||
// how will we do it for real?
|
||||
custom_extensions: Some(vec!["123454321".to_string(), self.tenant_id.to_string()]),
|
||||
custom_extensions: Some(vec![]),
|
||||
};
|
||||
let spec_path = self.endpoint_path().join("spec.json");
|
||||
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
|
||||
@@ -581,9 +575,7 @@ impl Endpoint {
|
||||
}
|
||||
Err(e) => {
|
||||
if attempt == MAX_ATTEMPTS {
|
||||
return Err(e).context(
|
||||
"timed out waiting to connect to compute_ctl HTTP; last error: {e}",
|
||||
);
|
||||
return Err(e).context("timed out waiting to connect to compute_ctl HTTP");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -137,6 +137,7 @@ impl Default for PageServerConf {
|
||||
pub struct SafekeeperConf {
|
||||
pub id: NodeId,
|
||||
pub pg_port: u16,
|
||||
pub pg_tenant_only_port: Option<u16>,
|
||||
pub http_port: u16,
|
||||
pub sync: bool,
|
||||
pub remote_storage: Option<String>,
|
||||
@@ -149,6 +150,7 @@ impl Default for SafekeeperConf {
|
||||
Self {
|
||||
id: NodeId(0),
|
||||
pg_port: 0,
|
||||
pg_tenant_only_port: None,
|
||||
http_port: 0,
|
||||
sync: true,
|
||||
remote_storage: None,
|
||||
@@ -158,6 +160,14 @@ impl Default for SafekeeperConf {
|
||||
}
|
||||
}
|
||||
|
||||
impl SafekeeperConf {
|
||||
/// Compute is served by port on which only tenant scoped tokens allowed, if
|
||||
/// it is configured.
|
||||
pub fn get_compute_port(&self) -> u16 {
|
||||
self.pg_tenant_only_port.unwrap_or(self.pg_port)
|
||||
}
|
||||
}
|
||||
|
||||
impl LocalEnv {
|
||||
pub fn pg_distrib_dir_raw(&self) -> PathBuf {
|
||||
self.pg_distrib_dir.clone()
|
||||
|
||||
@@ -2,8 +2,9 @@
|
||||
//!
|
||||
//! In the local test environment, the data for each safekeeper is stored in
|
||||
//!
|
||||
//! ```text
|
||||
//! .neon/safekeepers/<safekeeper id>
|
||||
//!
|
||||
//! ```
|
||||
use std::io::Write;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Child;
|
||||
@@ -119,45 +120,55 @@ impl SafekeeperNode {
|
||||
let availability_zone = format!("sk-{}", id_string);
|
||||
|
||||
let mut args = vec![
|
||||
"-D",
|
||||
datadir.to_str().with_context(|| {
|
||||
format!("Datadir path {datadir:?} cannot be represented as a unicode string")
|
||||
})?,
|
||||
"--id",
|
||||
&id_string,
|
||||
"--listen-pg",
|
||||
&listen_pg,
|
||||
"--listen-http",
|
||||
&listen_http,
|
||||
"--availability-zone",
|
||||
&availability_zone,
|
||||
"-D".to_owned(),
|
||||
datadir
|
||||
.to_str()
|
||||
.with_context(|| {
|
||||
format!("Datadir path {datadir:?} cannot be represented as a unicode string")
|
||||
})?
|
||||
.to_owned(),
|
||||
"--id".to_owned(),
|
||||
id_string,
|
||||
"--listen-pg".to_owned(),
|
||||
listen_pg,
|
||||
"--listen-http".to_owned(),
|
||||
listen_http,
|
||||
"--availability-zone".to_owned(),
|
||||
availability_zone,
|
||||
];
|
||||
if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port {
|
||||
let listen_pg_tenant_only = format!("127.0.0.1:{}", pg_tenant_only_port);
|
||||
args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]);
|
||||
}
|
||||
if !self.conf.sync {
|
||||
args.push("--no-sync");
|
||||
args.push("--no-sync".to_owned());
|
||||
}
|
||||
|
||||
let broker_endpoint = format!("{}", self.env.broker.client_url());
|
||||
args.extend(["--broker-endpoint", &broker_endpoint]);
|
||||
args.extend(["--broker-endpoint".to_owned(), broker_endpoint]);
|
||||
|
||||
let mut backup_threads = String::new();
|
||||
if let Some(threads) = self.conf.backup_threads {
|
||||
backup_threads = threads.to_string();
|
||||
args.extend(["--backup-threads", &backup_threads]);
|
||||
args.extend(["--backup-threads".to_owned(), backup_threads]);
|
||||
} else {
|
||||
drop(backup_threads);
|
||||
}
|
||||
|
||||
if let Some(ref remote_storage) = self.conf.remote_storage {
|
||||
args.extend(["--remote-storage", remote_storage]);
|
||||
args.extend(["--remote-storage".to_owned(), remote_storage.clone()]);
|
||||
}
|
||||
|
||||
let key_path = self.env.base_data_dir.join("auth_public_key.pem");
|
||||
if self.conf.auth_enabled {
|
||||
args.extend([
|
||||
"--auth-validation-public-key-path",
|
||||
key_path.to_str().with_context(|| {
|
||||
format!("Key path {key_path:?} cannot be represented as a unicode string")
|
||||
})?,
|
||||
"--auth-validation-public-key-path".to_owned(),
|
||||
key_path
|
||||
.to_str()
|
||||
.with_context(|| {
|
||||
format!("Key path {key_path:?} cannot be represented as a unicode string")
|
||||
})?
|
||||
.to_owned(),
|
||||
]);
|
||||
}
|
||||
|
||||
|
||||
@@ -30,8 +30,8 @@ or similar, to wake up on shutdown.
|
||||
|
||||
In async Rust, futures can be "cancelled" at any await point, by
|
||||
dropping the Future. For example, `tokio::select!` returns as soon as
|
||||
one of the Futures returns, and drops the others. `tokio::timeout!` is
|
||||
another example. In the Rust ecosystem, some functions are
|
||||
one of the Futures returns, and drops the others. `tokio::time::timeout`
|
||||
is another example. In the Rust ecosystem, some functions are
|
||||
cancellation-safe, meaning they can be safely dropped without
|
||||
side-effects, while others are not. See documentation of
|
||||
`tokio::select!` for examples.
|
||||
@@ -42,9 +42,9 @@ function that you call cannot be assumed to be async
|
||||
cancellation-safe, and must be polled to completion.
|
||||
|
||||
The downside of non-cancellation safe code is that you have to be very
|
||||
careful when using `tokio::select!`, `tokio::timeout!`, and other such
|
||||
functions that can cause a Future to be dropped. They can only be used
|
||||
with functions that are explicitly documented to be cancellation-safe,
|
||||
careful when using `tokio::select!`, `tokio::time::timeout`, and other
|
||||
such functions that can cause a Future to be dropped. They can only be
|
||||
used with functions that are explicitly documented to be cancellation-safe,
|
||||
or you need to spawn a separate task to shield from the cancellation.
|
||||
|
||||
At the entry points to the code, we also take care to poll futures to
|
||||
|
||||
@@ -141,37 +141,86 @@ popular extensions.
|
||||
|
||||
The layout of the S3 bucket is as follows:
|
||||
```
|
||||
v14/ext_index.json
|
||||
-- this contains information necessary to create control files
|
||||
v14/extensions/test_ext1.tar.gz
|
||||
-- this contains the library files and sql files necessary to create this extension
|
||||
v14/extensions/custom_ext1.tar.gz
|
||||
5615610098 // this is an extension build number
|
||||
├── v14
|
||||
│ ├── extensions
|
||||
│ │ ├── anon.tar.zst
|
||||
│ │ └── embedding.tar.zst
|
||||
│ └── ext_index.json
|
||||
└── v15
|
||||
├── extensions
|
||||
│ ├── anon.tar.zst
|
||||
│ └── embedding.tar.zst
|
||||
└── ext_index.json
|
||||
5615261079
|
||||
├── v14
|
||||
│ ├── extensions
|
||||
│ │ └── anon.tar.zst
|
||||
│ └── ext_index.json
|
||||
└── v15
|
||||
├── extensions
|
||||
│ └── anon.tar.zst
|
||||
└── ext_index.json
|
||||
5623261088
|
||||
├── v14
|
||||
│ ├── extensions
|
||||
│ │ └── embedding.tar.zst
|
||||
│ └── ext_index.json
|
||||
└── v15
|
||||
├── extensions
|
||||
│ └── embedding.tar.zst
|
||||
└── ext_index.json
|
||||
```
|
||||
The difference between private and public extensions is determined by who can
|
||||
load the extension. This is specified in `ext_index.json`.
|
||||
Speicially, `ext_index.json` has a list of public extensions, and a list of
|
||||
extensions enabled for specific tenant-ids. Here is an example `ext_index.json`:
|
||||
|
||||
Note that build number cannot be part of prefix because we might need extensions
|
||||
from other build numbers.
|
||||
|
||||
`ext_index.json` stores the control files and location of extension archives.
|
||||
It also stores a list of public extensions and a library_index
|
||||
|
||||
We don't need to duplicate `extension.tar.zst`` files.
|
||||
We only need to upload a new one if it is updated.
|
||||
(Although currently we just upload every time anyways, hopefully will change
|
||||
this sometime)
|
||||
|
||||
*access* is controlled by spec
|
||||
|
||||
More specifically, here is an example ext_index.json
|
||||
```
|
||||
{
|
||||
"enabled_extensions": {
|
||||
"123454321": [
|
||||
"anon"
|
||||
"public_extensions": [
|
||||
"anon",
|
||||
"pg_buffercache"
|
||||
],
|
||||
"public": [
|
||||
"embedding"
|
||||
]
|
||||
},
|
||||
"control_data": {
|
||||
"embedding": "comment = 'hnsw index' \ndefault_version = '0.1.0' \nmodule_pathname = '$libdir/embedding' \nrelocatable = true \ntrusted = true",
|
||||
"anon": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
|
||||
}
|
||||
"library_index": {
|
||||
"anon": "anon",
|
||||
"pg_buffercache": "pg_buffercache"
|
||||
// for more complex extensions like postgis
|
||||
// we might have something like:
|
||||
// address_standardizer: postgis
|
||||
// postgis_tiger: postgis
|
||||
},
|
||||
"extension_data": {
|
||||
"pg_buffercache": {
|
||||
"control_data": {
|
||||
"pg_buffercache.control": "# pg_buffercache extension \ncomment = 'examine the shared buffer cache' \ndefault_version = '1.3' \nmodule_pathname = '$libdir/pg_buffercache' \nrelocatable = true \ntrusted=true"
|
||||
},
|
||||
"archive_path": "5670669815/v14/extensions/pg_buffercache.tar.zst"
|
||||
},
|
||||
"anon": {
|
||||
"control_data": {
|
||||
"anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
|
||||
},
|
||||
"archive_path": "5670669815/v14/extensions/anon.tar.zst"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### How to add new extension to the Extension Storage?
|
||||
|
||||
Simply upload build artifacts to the S3 bucket.
|
||||
Implement a CI step for that. Splitting it from ompute-node-image build.
|
||||
Implement a CI step for that. Splitting it from compute-node-image build.
|
||||
|
||||
### How do we deal with extension versions and updates?
|
||||
|
||||
|
||||
22
docs/tools.md
Normal file
22
docs/tools.md
Normal file
@@ -0,0 +1,22 @@
|
||||
# Useful development tools
|
||||
|
||||
This readme contains some hints on how to set up some optional development tools.
|
||||
|
||||
## ccls
|
||||
|
||||
[ccls](https://github.com/MaskRay/ccls) is a c/c++ language server. It requires some setup
|
||||
to work well. There are different ways to do it but here's what works for me:
|
||||
1. Make a common parent directory for all your common neon projects. (for example, `~/src/neondatabase/`)
|
||||
2. Go to `vendor/postgres-v15`
|
||||
3. Run `make clean && ./configure`
|
||||
4. Install [bear](https://github.com/rizsotto/Bear), and run `bear -- make -j4`
|
||||
5. Copy the generated `compile_commands.json` to `~/src/neondatabase` (or equivalent)
|
||||
6. Run `touch ~/src/neondatabase/.ccls-root` this will make the `compile_commands.json` file discoverable in all subdirectories
|
||||
|
||||
With this setup you will get decent lsp mileage inside the postgres repo, and also any postgres extensions that you put in `~/src/neondatabase/`, like `pg_embedding`, or inside `~/src/neondatabase/neon/pgxn` as well.
|
||||
|
||||
Some additional tips for various IDEs:
|
||||
|
||||
### Emacs
|
||||
|
||||
To improve performance: `(setq lsp-lens-enable nil)`
|
||||
@@ -70,12 +70,17 @@ where
|
||||
pub struct ComputeMetrics {
|
||||
pub wait_for_spec_ms: u64,
|
||||
pub sync_safekeepers_ms: u64,
|
||||
pub sync_sk_check_ms: u64,
|
||||
pub basebackup_ms: u64,
|
||||
pub basebackup_bytes: u64,
|
||||
pub start_postgres_ms: u64,
|
||||
pub config_ms: u64,
|
||||
pub total_startup_ms: u64,
|
||||
pub load_libraries_ms: u64,
|
||||
pub load_ext_ms: u64,
|
||||
pub num_ext_downloaded: u64,
|
||||
pub largest_ext_size: u64, // these are measured in bytes
|
||||
pub total_ext_download_size: u64,
|
||||
pub prep_extensions_ms: u64,
|
||||
}
|
||||
|
||||
/// Response of the `/computes/{compute_id}/spec` control-plane API.
|
||||
|
||||
@@ -5,7 +5,7 @@ use chrono::{DateTime, Utc};
|
||||
use rand::Rng;
|
||||
use serde::Serialize;
|
||||
|
||||
#[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
|
||||
#[derive(Serialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
|
||||
#[serde(tag = "type")]
|
||||
pub enum EventType {
|
||||
#[serde(rename = "absolute")]
|
||||
@@ -17,6 +17,32 @@ pub enum EventType {
|
||||
},
|
||||
}
|
||||
|
||||
impl EventType {
|
||||
pub fn absolute_time(&self) -> Option<&DateTime<Utc>> {
|
||||
use EventType::*;
|
||||
match self {
|
||||
Absolute { time } => Some(time),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn incremental_timerange(&self) -> Option<std::ops::Range<&DateTime<Utc>>> {
|
||||
// these can most likely be thought of as Range or RangeFull
|
||||
use EventType::*;
|
||||
match self {
|
||||
Incremental {
|
||||
start_time,
|
||||
stop_time,
|
||||
} => Some(start_time..stop_time),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_incremental(&self) -> bool {
|
||||
matches!(self, EventType::Incremental { .. })
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
|
||||
pub struct Event<Extra> {
|
||||
#[serde(flatten)]
|
||||
@@ -31,7 +57,7 @@ pub struct Event<Extra> {
|
||||
pub extra: Extra,
|
||||
}
|
||||
|
||||
pub fn idempotency_key(node_id: String) -> String {
|
||||
pub fn idempotency_key(node_id: &str) -> String {
|
||||
format!(
|
||||
"{}-{}-{:04}",
|
||||
Utc::now(),
|
||||
@@ -45,6 +71,6 @@ pub const CHUNK_SIZE: usize = 1000;
|
||||
// Just a wrapper around a slice of events
|
||||
// to serialize it as `{"events" : [ ] }
|
||||
#[derive(serde::Serialize)]
|
||||
pub struct EventChunk<'a, T> {
|
||||
pub events: &'a [T],
|
||||
pub struct EventChunk<'a, T: Clone> {
|
||||
pub events: std::borrow::Cow<'a, [T]>,
|
||||
}
|
||||
|
||||
@@ -6,6 +6,7 @@ use once_cell::sync::Lazy;
|
||||
use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec};
|
||||
pub use prometheus::opts;
|
||||
pub use prometheus::register;
|
||||
pub use prometheus::Error;
|
||||
pub use prometheus::{core, default_registry, proto};
|
||||
pub use prometheus::{exponential_buckets, linear_buckets};
|
||||
pub use prometheus::{register_counter_vec, Counter, CounterVec};
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
//! Helpers for observing duration on HistogramVec / CounterVec / GaugeVec / MetricVec<T>.
|
||||
//! Helpers for observing duration on `HistogramVec` / `CounterVec` / `GaugeVec` / `MetricVec<T>`.
|
||||
|
||||
use std::{future::Future, time::Instant};
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use strum_macros;
|
||||
use utils::{
|
||||
completion,
|
||||
history_buffer::HistoryBufferWithDropCounter,
|
||||
id::{NodeId, TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
@@ -76,7 +77,12 @@ pub enum TenantState {
|
||||
/// system is being shut down.
|
||||
///
|
||||
/// Transitions out of this state are possible through `set_broken()`.
|
||||
Stopping,
|
||||
Stopping {
|
||||
// Because of https://github.com/serde-rs/serde/issues/2105 this has to be a named field,
|
||||
// otherwise it will not be skipped during deserialization
|
||||
#[serde(skip)]
|
||||
progress: completion::Barrier,
|
||||
},
|
||||
/// The tenant is recognized by the pageserver, but can no longer be used for
|
||||
/// any operations.
|
||||
///
|
||||
@@ -118,7 +124,7 @@ impl TenantState {
|
||||
// Why is Stopping a Maybe case? Because, during pageserver shutdown,
|
||||
// we set the Stopping state irrespective of whether the tenant
|
||||
// has finished attaching or not.
|
||||
Self::Stopping => Maybe,
|
||||
Self::Stopping { .. } => Maybe,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -411,12 +417,16 @@ pub struct LayerResidenceEvent {
|
||||
pub reason: LayerResidenceEventReason,
|
||||
}
|
||||
|
||||
/// The reason for recording a given [`ResidenceEvent`].
|
||||
/// The reason for recording a given [`LayerResidenceEvent`].
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
|
||||
pub enum LayerResidenceEventReason {
|
||||
/// The layer map is being populated, e.g. during timeline load or attach.
|
||||
/// This includes [`RemoteLayer`] objects created in [`reconcile_with_remote`].
|
||||
/// We need to record such events because there is no persistent storage for the events.
|
||||
///
|
||||
// https://github.com/rust-lang/rust/issues/74481
|
||||
/// [`RemoteLayer`]: ../../tenant/storage_layer/struct.RemoteLayer.html
|
||||
/// [`reconcile_with_remote`]: ../../tenant/struct.Timeline.html#method.reconcile_with_remote
|
||||
LayerLoad,
|
||||
/// We just created the layer (e.g., freeze_and_flush or compaction).
|
||||
/// Such layers are always [`LayerResidenceStatus::Resident`].
|
||||
@@ -924,7 +934,13 @@ mod tests {
|
||||
"Activating",
|
||||
),
|
||||
(line!(), TenantState::Active, "Active"),
|
||||
(line!(), TenantState::Stopping, "Stopping"),
|
||||
(
|
||||
line!(),
|
||||
TenantState::Stopping {
|
||||
progress: utils::completion::Barrier::default(),
|
||||
},
|
||||
"Stopping",
|
||||
),
|
||||
(
|
||||
line!(),
|
||||
TenantState::Broken {
|
||||
|
||||
@@ -60,8 +60,9 @@ impl Ord for RelTag {
|
||||
|
||||
/// Display RelTag in the same format that's used in most PostgreSQL debug messages:
|
||||
///
|
||||
/// ```text
|
||||
/// <spcnode>/<dbnode>/<relnode>[_fsm|_vm|_init]
|
||||
///
|
||||
/// ```
|
||||
impl fmt::Display for RelTag {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
if let Some(forkname) = forknumber_to_name(self.forknum) {
|
||||
|
||||
@@ -57,9 +57,9 @@ pub fn slru_may_delete_clogsegment(segpage: u32, cutoff_page: u32) -> bool {
|
||||
// Multixact utils
|
||||
|
||||
pub fn mx_offset_to_flags_offset(xid: MultiXactId) -> usize {
|
||||
((xid / pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP as u32) as u16
|
||||
% pg_constants::MULTIXACT_MEMBERGROUPS_PER_PAGE
|
||||
* pg_constants::MULTIXACT_MEMBERGROUP_SIZE) as usize
|
||||
((xid / pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP as u32)
|
||||
% pg_constants::MULTIXACT_MEMBERGROUPS_PER_PAGE as u32
|
||||
* pg_constants::MULTIXACT_MEMBERGROUP_SIZE as u32) as usize
|
||||
}
|
||||
|
||||
pub fn mx_offset_to_flags_bitshift(xid: MultiXactId) -> u16 {
|
||||
@@ -81,3 +81,41 @@ fn mx_offset_to_member_page(xid: u32) -> u32 {
|
||||
pub fn mx_offset_to_member_segment(xid: u32) -> i32 {
|
||||
(mx_offset_to_member_page(xid) / pg_constants::SLRU_PAGES_PER_SEGMENT) as i32
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_multixid_calc() {
|
||||
// Check that the mx_offset_* functions produce the same values as the
|
||||
// corresponding PostgreSQL C macros (MXOffsetTo*). These test values
|
||||
// were generated by calling the PostgreSQL macros with a little C
|
||||
// program.
|
||||
assert_eq!(mx_offset_to_member_segment(0), 0);
|
||||
assert_eq!(mx_offset_to_member_page(0), 0);
|
||||
assert_eq!(mx_offset_to_flags_offset(0), 0);
|
||||
assert_eq!(mx_offset_to_flags_bitshift(0), 0);
|
||||
assert_eq!(mx_offset_to_member_offset(0), 4);
|
||||
assert_eq!(mx_offset_to_member_segment(1), 0);
|
||||
assert_eq!(mx_offset_to_member_page(1), 0);
|
||||
assert_eq!(mx_offset_to_flags_offset(1), 0);
|
||||
assert_eq!(mx_offset_to_flags_bitshift(1), 8);
|
||||
assert_eq!(mx_offset_to_member_offset(1), 8);
|
||||
assert_eq!(mx_offset_to_member_segment(123456789), 2358);
|
||||
assert_eq!(mx_offset_to_member_page(123456789), 75462);
|
||||
assert_eq!(mx_offset_to_flags_offset(123456789), 4780);
|
||||
assert_eq!(mx_offset_to_flags_bitshift(123456789), 8);
|
||||
assert_eq!(mx_offset_to_member_offset(123456789), 4788);
|
||||
assert_eq!(mx_offset_to_member_segment(u32::MAX - 1), 82040);
|
||||
assert_eq!(mx_offset_to_member_page(u32::MAX - 1), 2625285);
|
||||
assert_eq!(mx_offset_to_flags_offset(u32::MAX - 1), 5160);
|
||||
assert_eq!(mx_offset_to_flags_bitshift(u32::MAX - 1), 16);
|
||||
assert_eq!(mx_offset_to_member_offset(u32::MAX - 1), 5172);
|
||||
assert_eq!(mx_offset_to_member_segment(u32::MAX), 82040);
|
||||
assert_eq!(mx_offset_to_member_page(u32::MAX), 2625285);
|
||||
assert_eq!(mx_offset_to_flags_offset(u32::MAX), 5160);
|
||||
assert_eq!(mx_offset_to_flags_bitshift(u32::MAX), 24);
|
||||
assert_eq!(mx_offset_to_member_offset(u32::MAX), 5176);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -49,14 +49,16 @@ pub fn forknumber_to_name(forknum: u8) -> Option<&'static str> {
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Parse a filename of a relation file. Returns (relfilenode, forknum, segno) tuple.
|
||||
///
|
||||
/// Formats:
|
||||
///
|
||||
/// ```text
|
||||
/// <oid>
|
||||
/// <oid>_<fork name>
|
||||
/// <oid>.<segment number>
|
||||
/// <oid>_<fork name>.<segment number>
|
||||
/// ```
|
||||
///
|
||||
/// See functions relpath() and _mdfd_segpath() in PostgreSQL sources.
|
||||
///
|
||||
|
||||
@@ -5,11 +5,11 @@
|
||||
//! It is similar to what tokio_util::codec::Framed with appropriate codec
|
||||
//! provides, but `FramedReader` and `FramedWriter` read/write parts can be used
|
||||
//! separately without using split from futures::stream::StreamExt (which
|
||||
//! allocates box[1] in polling internally). tokio::io::split is used for splitting
|
||||
//! allocates a [Box] in polling internally). tokio::io::split is used for splitting
|
||||
//! instead. Plus we customize error messages more than a single type for all io
|
||||
//! calls.
|
||||
//!
|
||||
//! [1] https://docs.rs/futures-util/0.3.26/src/futures_util/lock/bilock.rs.html#107
|
||||
//! [Box]: https://docs.rs/futures-util/0.3.26/src/futures_util/lock/bilock.rs.html#107
|
||||
use bytes::{Buf, BytesMut};
|
||||
use std::{
|
||||
future::Future,
|
||||
@@ -117,7 +117,7 @@ impl<S: AsyncWrite + Unpin> Framed<S> {
|
||||
impl<S: AsyncRead + AsyncWrite + Unpin> Framed<S> {
|
||||
/// Split into owned read and write parts. Beware of potential issues with
|
||||
/// using halves in different tasks on TLS stream:
|
||||
/// https://github.com/tokio-rs/tls/issues/40
|
||||
/// <https://github.com/tokio-rs/tls/issues/40>
|
||||
pub fn split(self) -> (FramedReader<S>, FramedWriter<S>) {
|
||||
let (read_half, write_half) = tokio::io::split(self.stream);
|
||||
let reader = FramedReader {
|
||||
|
||||
@@ -179,7 +179,7 @@ pub struct FeExecuteMessage {
|
||||
#[derive(Debug)]
|
||||
pub struct FeCloseMessage;
|
||||
|
||||
/// An error occured while parsing or serializing raw stream into Postgres
|
||||
/// An error occurred while parsing or serializing raw stream into Postgres
|
||||
/// messages.
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum ProtocolError {
|
||||
@@ -934,6 +934,15 @@ impl<'a> BeMessage<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
fn terminate_code(code: &[u8; 5]) -> [u8; 6] {
|
||||
let mut terminated = [0; 6];
|
||||
for (i, &elem) in code.iter().enumerate() {
|
||||
terminated[i] = elem;
|
||||
}
|
||||
|
||||
terminated
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -965,12 +974,3 @@ mod tests {
|
||||
assert_eq!(split_options(¶ms), ["foo bar", " \\", "baz ", "lol"]);
|
||||
}
|
||||
}
|
||||
|
||||
fn terminate_code(code: &[u8; 5]) -> [u8; 6] {
|
||||
let mut terminated = [0; 6];
|
||||
for (i, &elem) in code.iter().enumerate() {
|
||||
terminated[i] = elem;
|
||||
}
|
||||
|
||||
terminated
|
||||
}
|
||||
|
||||
@@ -34,12 +34,12 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS: usize = 50;
|
||||
pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
|
||||
/// Currently, sync happens with AWS S3, that has two limits on requests per second:
|
||||
/// ~200 RPS for IAM services
|
||||
/// https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html
|
||||
/// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
|
||||
/// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
|
||||
/// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/
|
||||
/// <https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/>
|
||||
pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
|
||||
/// No limits on the client side, which currenltly means 1000 for AWS S3.
|
||||
/// https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax
|
||||
/// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
|
||||
pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
|
||||
|
||||
const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
|
||||
@@ -50,6 +50,12 @@ const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||
pub struct RemotePath(PathBuf);
|
||||
|
||||
impl std::fmt::Display for RemotePath {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.0.display())
|
||||
}
|
||||
}
|
||||
|
||||
impl RemotePath {
|
||||
pub fn new(relative_path: &Path) -> anyhow::Result<Self> {
|
||||
anyhow::ensure!(
|
||||
@@ -59,6 +65,10 @@ impl RemotePath {
|
||||
Ok(Self(relative_path.to_path_buf()))
|
||||
}
|
||||
|
||||
pub fn from_string(relative_path: &str) -> anyhow::Result<Self> {
|
||||
Self::new(Path::new(relative_path))
|
||||
}
|
||||
|
||||
pub fn with_base(&self, base_path: &Path) -> PathBuf {
|
||||
base_path.join(&self.0)
|
||||
}
|
||||
|
||||
@@ -151,10 +151,7 @@ impl RemoteStorage for LocalFs {
|
||||
let mut files = vec![];
|
||||
let mut directory_queue = vec![full_path.clone()];
|
||||
|
||||
while !directory_queue.is_empty() {
|
||||
let cur_folder = directory_queue
|
||||
.pop()
|
||||
.expect("queue cannot be empty: we just checked");
|
||||
while let Some(cur_folder) = directory_queue.pop() {
|
||||
let mut entries = fs::read_dir(cur_folder.clone()).await?;
|
||||
while let Some(entry) = entries.next_entry().await? {
|
||||
let file_name: PathBuf = entry.file_name().into();
|
||||
|
||||
@@ -200,13 +200,17 @@ impl S3Bucket {
|
||||
)
|
||||
}
|
||||
|
||||
fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
|
||||
let mut full_path = self.prefix_in_bucket.clone().unwrap_or_default();
|
||||
for segment in path.0.iter() {
|
||||
full_path.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||
full_path.push_str(segment.to_str().unwrap_or_default());
|
||||
pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
|
||||
assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||
let path_string = path
|
||||
.get_path()
|
||||
.to_string_lossy()
|
||||
.trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR)
|
||||
.to_string();
|
||||
match &self.prefix_in_bucket {
|
||||
Some(prefix) => prefix.clone() + "/" + &path_string,
|
||||
None => path_string,
|
||||
}
|
||||
full_path
|
||||
}
|
||||
|
||||
async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
|
||||
@@ -349,17 +353,10 @@ impl RemoteStorage for S3Bucket {
|
||||
|
||||
/// See the doc for `RemoteStorage::list_files`
|
||||
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
||||
let mut folder_name = folder
|
||||
let folder_name = folder
|
||||
.map(|p| self.relative_path_to_s3_object(p))
|
||||
.or_else(|| self.prefix_in_bucket.clone());
|
||||
|
||||
// remove leading "/" if one exists
|
||||
if let Some(folder_name_slash) = folder_name.clone() {
|
||||
if folder_name_slash.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
|
||||
folder_name = Some(folder_name_slash[1..].to_string());
|
||||
}
|
||||
}
|
||||
|
||||
// AWS may need to break the response into several parts
|
||||
let mut continuation_token = None;
|
||||
let mut all_files = vec![];
|
||||
@@ -434,10 +431,12 @@ impl RemoteStorage for S3Bucket {
|
||||
}
|
||||
|
||||
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
|
||||
// if prefix is not none then download file `prefix/from`
|
||||
// if prefix is none then download file `from`
|
||||
self.download_object(GetObjectRequest {
|
||||
bucket: self.bucket_name.clone(),
|
||||
key: self.relative_path_to_s3_object(from),
|
||||
..GetObjectRequest::default()
|
||||
range: None,
|
||||
})
|
||||
.await
|
||||
}
|
||||
@@ -530,3 +529,63 @@ impl RemoteStorage for S3Bucket {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::num::NonZeroUsize;
|
||||
use std::path::Path;
|
||||
|
||||
use crate::{RemotePath, S3Bucket, S3Config};
|
||||
|
||||
#[test]
|
||||
fn relative_path() {
|
||||
let all_paths = vec!["", "some/path", "some/path/"];
|
||||
let all_paths: Vec<RemotePath> = all_paths
|
||||
.iter()
|
||||
.map(|x| RemotePath::new(Path::new(x)).expect("bad path"))
|
||||
.collect();
|
||||
let prefixes = [
|
||||
None,
|
||||
Some(""),
|
||||
Some("test/prefix"),
|
||||
Some("test/prefix/"),
|
||||
Some("/test/prefix/"),
|
||||
];
|
||||
let expected_outputs = vec![
|
||||
vec!["", "some/path", "some/path"],
|
||||
vec!["/", "/some/path", "/some/path"],
|
||||
vec![
|
||||
"test/prefix/",
|
||||
"test/prefix/some/path",
|
||||
"test/prefix/some/path",
|
||||
],
|
||||
vec![
|
||||
"test/prefix/",
|
||||
"test/prefix/some/path",
|
||||
"test/prefix/some/path",
|
||||
],
|
||||
vec![
|
||||
"test/prefix/",
|
||||
"test/prefix/some/path",
|
||||
"test/prefix/some/path",
|
||||
],
|
||||
];
|
||||
|
||||
for (prefix_idx, prefix) in prefixes.iter().enumerate() {
|
||||
let config = S3Config {
|
||||
bucket_name: "bucket".to_owned(),
|
||||
bucket_region: "region".to_owned(),
|
||||
prefix_in_bucket: prefix.map(str::to_string),
|
||||
endpoint: None,
|
||||
concurrency_limit: NonZeroUsize::new(100).unwrap(),
|
||||
max_keys_per_list_response: Some(5),
|
||||
};
|
||||
let storage = S3Bucket::new(&config).expect("remote storage init");
|
||||
for (test_path_idx, test_path) in all_paths.iter().enumerate() {
|
||||
let result = storage.relative_path_to_s3_object(test_path);
|
||||
let expected = expected_outputs[prefix_idx][test_path_idx];
|
||||
assert_eq!(result, expected);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,7 +19,7 @@ static LOGGING_DONE: OnceCell<()> = OnceCell::new();
|
||||
|
||||
const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
|
||||
|
||||
const BASE_PREFIX: &str = "test/";
|
||||
const BASE_PREFIX: &str = "test";
|
||||
|
||||
/// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries.
|
||||
/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.
|
||||
|
||||
@@ -21,7 +21,7 @@ use crate::{SegmentMethod, SegmentSizeResult, SizeResult, StorageModel};
|
||||
// 2. D+C+a+b
|
||||
// 3. D+A+B
|
||||
|
||||
/// [`Segment`] which has had it's size calculated.
|
||||
/// `Segment` which has had its size calculated.
|
||||
#[derive(Clone, Debug)]
|
||||
struct SegmentSize {
|
||||
method: SegmentMethod,
|
||||
|
||||
@@ -33,7 +33,7 @@ pub enum OtelName<'a> {
|
||||
/// directly into HTTP servers. However, I couldn't find one for Hyper,
|
||||
/// so I had to write our own. OpenTelemetry website has a registry of
|
||||
/// instrumentation libraries at:
|
||||
/// https://opentelemetry.io/registry/?language=rust&component=instrumentation
|
||||
/// <https://opentelemetry.io/registry/?language=rust&component=instrumentation>
|
||||
/// If a Hyper crate appears, consider switching to that.
|
||||
pub async fn tracing_handler<F, R>(
|
||||
req: Request<Body>,
|
||||
|
||||
@@ -42,6 +42,10 @@ workspace_hack.workspace = true
|
||||
|
||||
const_format.workspace = true
|
||||
|
||||
# to use tokio channels as streams, this is faster to compile than async_stream
|
||||
# why is it only here? no other crate should use it, streams are rarely needed.
|
||||
tokio-stream = { version = "0.1.14" }
|
||||
|
||||
[dev-dependencies]
|
||||
byteorder.workspace = true
|
||||
bytes.workspace = true
|
||||
|
||||
@@ -16,7 +16,7 @@ use crate::id::TenantId;
|
||||
/// Algorithm to use. We require EdDSA.
|
||||
const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
|
||||
#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum Scope {
|
||||
// Provides access to all data for a specific tenant (specified in `struct Claims` below)
|
||||
|
||||
@@ -12,6 +12,13 @@ pub struct Completion(mpsc::Sender<()>);
|
||||
#[derive(Clone)]
|
||||
pub struct Barrier(Arc<Mutex<mpsc::Receiver<()>>>);
|
||||
|
||||
impl Default for Barrier {
|
||||
fn default() -> Self {
|
||||
let (_, rx) = channel();
|
||||
rx
|
||||
}
|
||||
}
|
||||
|
||||
impl Barrier {
|
||||
pub async fn wait(self) {
|
||||
self.0.lock().await.recv().await;
|
||||
@@ -24,6 +31,15 @@ impl Barrier {
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq for Barrier {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
// we don't use dyn so this is good
|
||||
Arc::ptr_eq(&self.0, &other.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl Eq for Barrier {}
|
||||
|
||||
/// Create new Guard and Barrier pair.
|
||||
pub fn channel() -> (Completion, Barrier) {
|
||||
let (tx, rx) = mpsc::channel::<()>(1);
|
||||
|
||||
111
libs/utils/src/error.rs
Normal file
111
libs/utils/src/error.rs
Normal file
@@ -0,0 +1,111 @@
|
||||
/// Create a reporter for an error that outputs similar to [`anyhow::Error`] with Display with alternative setting.
|
||||
///
|
||||
/// It can be used with `anyhow::Error` as well.
|
||||
///
|
||||
/// Why would one use this instead of converting to `anyhow::Error` on the spot? Because
|
||||
/// anyhow::Error would also capture a stacktrace on the spot, which you would later discard after
|
||||
/// formatting.
|
||||
///
|
||||
/// ## Usage
|
||||
///
|
||||
/// ```rust
|
||||
/// #[derive(Debug, thiserror::Error)]
|
||||
/// enum MyCoolError {
|
||||
/// #[error("should never happen")]
|
||||
/// Bad(#[source] std::io::Error),
|
||||
/// }
|
||||
///
|
||||
/// # fn failing_call() -> Result<(), MyCoolError> { Err(MyCoolError::Bad(std::io::ErrorKind::PermissionDenied.into())) }
|
||||
///
|
||||
/// # fn main() {
|
||||
/// use utils::error::report_compact_sources;
|
||||
///
|
||||
/// if let Err(e) = failing_call() {
|
||||
/// let e = report_compact_sources(&e);
|
||||
/// assert_eq!(format!("{e}"), "should never happen: permission denied");
|
||||
/// }
|
||||
/// # }
|
||||
/// ```
|
||||
///
|
||||
/// ## TODO
|
||||
///
|
||||
/// When we are able to describe return position impl trait in traits, this should of course be an
|
||||
/// extension trait. Until then avoid boxing with this more ackward interface.
|
||||
pub fn report_compact_sources<E: std::error::Error>(e: &E) -> impl std::fmt::Display + '_ {
|
||||
struct AnyhowDisplayAlternateAlike<'a, E>(&'a E);
|
||||
|
||||
impl<E: std::error::Error> std::fmt::Display for AnyhowDisplayAlternateAlike<'_, E> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.0)?;
|
||||
|
||||
// why is E a generic parameter here? hope that rustc will see through a default
|
||||
// Error::source implementation and leave the following out if there cannot be any
|
||||
// sources:
|
||||
Sources(self.0.source()).try_for_each(|src| write!(f, ": {}", src))
|
||||
}
|
||||
}
|
||||
|
||||
struct Sources<'a>(Option<&'a (dyn std::error::Error + 'static)>);
|
||||
|
||||
impl<'a> Iterator for Sources<'a> {
|
||||
type Item = &'a (dyn std::error::Error + 'static);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
let rem = self.0;
|
||||
|
||||
let next = self.0.and_then(|x| x.source());
|
||||
self.0 = next;
|
||||
rem
|
||||
}
|
||||
}
|
||||
|
||||
AnyhowDisplayAlternateAlike(e)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::report_compact_sources;
|
||||
|
||||
#[test]
|
||||
fn report_compact_sources_examples() {
|
||||
use std::fmt::Write;
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
enum EvictionError {
|
||||
#[error("cannot evict a remote layer")]
|
||||
CannotEvictRemoteLayer,
|
||||
#[error("stat failed")]
|
||||
StatFailed(#[source] std::io::Error),
|
||||
#[error("layer was no longer part of LayerMap")]
|
||||
LayerNotFound(#[source] anyhow::Error),
|
||||
}
|
||||
|
||||
let examples = [
|
||||
(
|
||||
line!(),
|
||||
EvictionError::CannotEvictRemoteLayer,
|
||||
"cannot evict a remote layer",
|
||||
),
|
||||
(
|
||||
line!(),
|
||||
EvictionError::StatFailed(std::io::ErrorKind::PermissionDenied.into()),
|
||||
"stat failed: permission denied",
|
||||
),
|
||||
(
|
||||
line!(),
|
||||
EvictionError::LayerNotFound(anyhow::anyhow!("foobar")),
|
||||
"layer was no longer part of LayerMap: foobar",
|
||||
),
|
||||
];
|
||||
|
||||
let mut s = String::new();
|
||||
|
||||
for (line, example, expected) in examples {
|
||||
s.clear();
|
||||
|
||||
write!(s, "{}", report_compact_sources(&example)).expect("string grows");
|
||||
|
||||
assert_eq!(s, expected, "example on line {line}");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -24,12 +24,29 @@ pub async fn is_directory_empty(path: impl AsRef<Path>) -> anyhow::Result<bool>
|
||||
Ok(dir.next_entry().await?.is_none())
|
||||
}
|
||||
|
||||
pub fn ignore_not_found(e: io::Error) -> io::Result<()> {
|
||||
if e.kind() == io::ErrorKind::NotFound {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(e)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn ignore_absent_files<F>(fs_operation: F) -> io::Result<()>
|
||||
where
|
||||
F: Fn() -> io::Result<()>,
|
||||
{
|
||||
fs_operation().or_else(ignore_not_found)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use std::path::PathBuf;
|
||||
|
||||
use crate::fs_ext::is_directory_empty;
|
||||
|
||||
use super::ignore_absent_files;
|
||||
|
||||
#[test]
|
||||
fn is_empty_dir() {
|
||||
use super::PathExt;
|
||||
@@ -75,4 +92,21 @@ mod test {
|
||||
std::fs::remove_file(&file_path).unwrap();
|
||||
assert!(is_directory_empty(file_path).await.is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ignore_absent_files_works() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let dir_path = dir.path();
|
||||
|
||||
let file_path: PathBuf = dir_path.join("testfile");
|
||||
|
||||
ignore_absent_files(|| std::fs::remove_file(&file_path)).expect("should execute normally");
|
||||
|
||||
let f = std::fs::File::create(&file_path).unwrap();
|
||||
drop(f);
|
||||
|
||||
ignore_absent_files(|| std::fs::remove_file(&file_path)).expect("should execute normally");
|
||||
|
||||
assert!(!file_path.exists());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,7 +9,6 @@ use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
|
||||
use once_cell::sync::Lazy;
|
||||
use routerify::ext::RequestExt;
|
||||
use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
|
||||
use tokio::task::JoinError;
|
||||
use tracing::{self, debug, info, info_span, warn, Instrument};
|
||||
|
||||
use std::future::Future;
|
||||
@@ -148,26 +147,140 @@ impl Drop for RequestCancelled {
|
||||
}
|
||||
|
||||
async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use std::io::Write as _;
|
||||
use tokio::sync::mpsc;
|
||||
use tokio_stream::wrappers::ReceiverStream;
|
||||
|
||||
SERVE_METRICS_COUNT.inc();
|
||||
|
||||
let mut buffer = vec![];
|
||||
let encoder = TextEncoder::new();
|
||||
/// An [`std::io::Write`] implementation on top of a channel sending [`bytes::Bytes`] chunks.
|
||||
struct ChannelWriter {
|
||||
buffer: BytesMut,
|
||||
tx: mpsc::Sender<std::io::Result<Bytes>>,
|
||||
written: usize,
|
||||
}
|
||||
|
||||
let metrics = tokio::task::spawn_blocking(move || {
|
||||
// Currently we take a lot of mutexes while collecting metrics, so it's
|
||||
// better to spawn a blocking task to avoid blocking the event loop.
|
||||
metrics::gather()
|
||||
})
|
||||
.await
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
|
||||
encoder.encode(&metrics, &mut buffer).unwrap();
|
||||
impl ChannelWriter {
|
||||
fn new(buf_len: usize, tx: mpsc::Sender<std::io::Result<Bytes>>) -> Self {
|
||||
assert_ne!(buf_len, 0);
|
||||
ChannelWriter {
|
||||
// split about half off the buffer from the start, because we flush depending on
|
||||
// capacity. first flush will come sooner than without this, but now resizes will
|
||||
// have better chance of picking up the "other" half. not guaranteed of course.
|
||||
buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
|
||||
tx,
|
||||
written: 0,
|
||||
}
|
||||
}
|
||||
|
||||
fn flush0(&mut self) -> std::io::Result<usize> {
|
||||
let n = self.buffer.len();
|
||||
if n == 0 {
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
tracing::trace!(n, "flushing");
|
||||
let ready = self.buffer.split().freeze();
|
||||
|
||||
// not ideal to call from blocking code to block_on, but we are sure that this
|
||||
// operation does not spawn_blocking other tasks
|
||||
let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
|
||||
self.tx.send(Ok(ready)).await.map_err(|_| ())?;
|
||||
|
||||
// throttle sending to allow reuse of our buffer in `write`.
|
||||
self.tx.reserve().await.map_err(|_| ())?;
|
||||
|
||||
// now the response task has picked up the buffer and hopefully started
|
||||
// sending it to the client.
|
||||
Ok(())
|
||||
});
|
||||
if res.is_err() {
|
||||
return Err(std::io::ErrorKind::BrokenPipe.into());
|
||||
}
|
||||
self.written += n;
|
||||
Ok(n)
|
||||
}
|
||||
|
||||
fn flushed_bytes(&self) -> usize {
|
||||
self.written
|
||||
}
|
||||
}
|
||||
|
||||
impl std::io::Write for ChannelWriter {
|
||||
fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
|
||||
let remaining = self.buffer.capacity() - self.buffer.len();
|
||||
|
||||
let out_of_space = remaining < buf.len();
|
||||
|
||||
let original_len = buf.len();
|
||||
|
||||
if out_of_space {
|
||||
let can_still_fit = buf.len() - remaining;
|
||||
self.buffer.extend_from_slice(&buf[..can_still_fit]);
|
||||
buf = &buf[can_still_fit..];
|
||||
self.flush0()?;
|
||||
}
|
||||
|
||||
// assume that this will often under normal operation just move the pointer back to the
|
||||
// beginning of allocation, because previous split off parts are already sent and
|
||||
// dropped.
|
||||
self.buffer.extend_from_slice(buf);
|
||||
Ok(original_len)
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> std::io::Result<()> {
|
||||
self.flush0().map(|_| ())
|
||||
}
|
||||
}
|
||||
|
||||
let started_at = std::time::Instant::now();
|
||||
|
||||
let (tx, rx) = mpsc::channel(1);
|
||||
|
||||
let body = Body::wrap_stream(ReceiverStream::new(rx));
|
||||
|
||||
let mut writer = ChannelWriter::new(128 * 1024, tx);
|
||||
|
||||
let encoder = TextEncoder::new();
|
||||
|
||||
let response = Response::builder()
|
||||
.status(200)
|
||||
.header(CONTENT_TYPE, encoder.format_type())
|
||||
.body(Body::from(buffer))
|
||||
.body(body)
|
||||
.unwrap();
|
||||
|
||||
let span = info_span!("blocking");
|
||||
tokio::task::spawn_blocking(move || {
|
||||
let _span = span.entered();
|
||||
let metrics = metrics::gather();
|
||||
let res = encoder
|
||||
.encode(&metrics, &mut writer)
|
||||
.and_then(|_| writer.flush().map_err(|e| e.into()));
|
||||
|
||||
match res {
|
||||
Ok(()) => {
|
||||
tracing::info!(
|
||||
bytes = writer.flushed_bytes(),
|
||||
elapsed_ms = started_at.elapsed().as_millis(),
|
||||
"responded /metrics"
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!("failed to write out /metrics response: {e:#}");
|
||||
// semantics of this error are quite... unclear. we want to error the stream out to
|
||||
// abort the response to somehow notify the client that we failed.
|
||||
//
|
||||
// though, most likely the reason for failure is that the receiver is already gone.
|
||||
drop(
|
||||
writer
|
||||
.tx
|
||||
.blocking_send(Err(std::io::ErrorKind::BrokenPipe.into())),
|
||||
);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
|
||||
@@ -14,7 +14,7 @@ pub async fn json_request<T: for<'de> Deserialize<'de>>(
|
||||
.map_err(ApiError::BadRequest)
|
||||
}
|
||||
|
||||
/// Will be removed as part of https://github.com/neondatabase/neon/issues/4282
|
||||
/// Will be removed as part of <https://github.com/neondatabase/neon/issues/4282>
|
||||
pub async fn json_request_or_empty_body<T: for<'de> Deserialize<'de>>(
|
||||
request: &mut Request<Body>,
|
||||
) -> Result<Option<T>, ApiError> {
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
use std::ffi::OsStr;
|
||||
use std::{fmt, str::FromStr};
|
||||
|
||||
use anyhow::Context;
|
||||
use hex::FromHex;
|
||||
use rand::Rng;
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -213,6 +215,18 @@ pub struct TimelineId(Id);
|
||||
|
||||
id_newtype!(TimelineId);
|
||||
|
||||
impl TryFrom<Option<&OsStr>> for TimelineId {
|
||||
type Error = anyhow::Error;
|
||||
|
||||
fn try_from(value: Option<&OsStr>) -> Result<Self, Self::Error> {
|
||||
value
|
||||
.and_then(OsStr::to_str)
|
||||
.unwrap_or_default()
|
||||
.parse::<TimelineId>()
|
||||
.with_context(|| format!("Could not parse timeline id from {:?}", value))
|
||||
}
|
||||
}
|
||||
|
||||
/// Neon Tenant Id represents identifiar of a particular tenant.
|
||||
/// Is used for distinguishing requests and data belonging to different users.
|
||||
///
|
||||
|
||||
@@ -63,6 +63,9 @@ pub mod rate_limit;
|
||||
/// Simple once-barrier and a guard which keeps barrier awaiting.
|
||||
pub mod completion;
|
||||
|
||||
/// Reporting utilities
|
||||
pub mod error;
|
||||
|
||||
mod failpoint_macro_helpers {
|
||||
|
||||
/// use with fail::cfg("$name", "return(2000)")
|
||||
@@ -130,8 +133,8 @@ pub use failpoint_macro_helpers::failpoint_sleep_helper;
|
||||
/// Note that with git_version prefix is `git:` and in case of git version from env its `git-env:`.
|
||||
///
|
||||
/// #############################################################################################
|
||||
/// TODO this macro is not the way the library is intended to be used, see https://github.com/neondatabase/neon/issues/1565 for details.
|
||||
/// We use `cachepot` to reduce our current CI build times: https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036
|
||||
/// TODO this macro is not the way the library is intended to be used, see <https://github.com/neondatabase/neon/issues/1565> for details.
|
||||
/// We use `cachepot` to reduce our current CI build times: <https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036>
|
||||
/// Yet, it seems to ignore the GIT_VERSION env variable, passed to Docker build, even with build.rs that contains
|
||||
/// `println!("cargo:rerun-if-env-changed=GIT_VERSION");` code for cachepot cache invalidation.
|
||||
/// The problem needs further investigation and regular `const` declaration instead of a macro.
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
//! A module to create and read lock files.
|
||||
//!
|
||||
//! File locking is done using [`fcntl::flock`] exclusive locks.
|
||||
//! The only consumer of this module is currently [`pid_file`].
|
||||
//! See the module-level comment there for potential pitfalls
|
||||
//! with lock files that are used to store PIDs (pidfiles).
|
||||
//! The only consumer of this module is currently
|
||||
//! [`pid_file`](crate::pid_file). See the module-level comment
|
||||
//! there for potential pitfalls with lock files that are used
|
||||
//! to store PIDs (pidfiles).
|
||||
|
||||
use std::{
|
||||
fs,
|
||||
@@ -81,7 +82,7 @@ pub fn create_exclusive(lock_file_path: &Path) -> anyhow::Result<UnwrittenLockFi
|
||||
}
|
||||
|
||||
/// Returned by [`read_and_hold_lock_file`].
|
||||
/// Check out the [`pid_file`] module for what the variants mean
|
||||
/// Check out the [`pid_file`](crate::pid_file) module for what the variants mean
|
||||
/// and potential caveats if the lock files that are used to store PIDs.
|
||||
pub enum LockFileRead {
|
||||
/// No file exists at the given path.
|
||||
|
||||
@@ -112,7 +112,7 @@ pub fn init(
|
||||
///
|
||||
/// When the return value is dropped, the hook is reverted to std default hook (prints to stderr).
|
||||
/// If the assumptions about the initialization order are not held, use
|
||||
/// [`TracingPanicHookGuard::disarm`] but keep in mind, if tracing is stopped, then panics will be
|
||||
/// [`TracingPanicHookGuard::forget`] but keep in mind, if tracing is stopped, then panics will be
|
||||
/// lost.
|
||||
#[must_use]
|
||||
pub fn replace_panic_hook_with_tracing_panic_hook() -> TracingPanicHookGuard {
|
||||
|
||||
@@ -23,9 +23,9 @@ pub enum SeqWaitError {
|
||||
|
||||
/// Monotonically increasing value
|
||||
///
|
||||
/// It is handy to store some other fields under the same mutex in SeqWait<S>
|
||||
/// It is handy to store some other fields under the same mutex in `SeqWait<S>`
|
||||
/// (e.g. store prev_record_lsn). So we allow SeqWait to be parametrized with
|
||||
/// any type that can expose counter. <V> is the type of exposed counter.
|
||||
/// any type that can expose counter. `V` is the type of exposed counter.
|
||||
pub trait MonotonicCounter<V> {
|
||||
/// Bump counter value and check that it goes forward
|
||||
/// N.B.: new_val is an actual new value, not a difference.
|
||||
@@ -90,7 +90,7 @@ impl<T: Ord> Eq for Waiter<T> {}
|
||||
/// [`wait_for`]: SeqWait::wait_for
|
||||
/// [`advance`]: SeqWait::advance
|
||||
///
|
||||
/// <S> means Storage, <V> is type of counter that this storage exposes.
|
||||
/// `S` means Storage, `V` is type of counter that this storage exposes.
|
||||
///
|
||||
pub struct SeqWait<S, V>
|
||||
where
|
||||
|
||||
@@ -1,8 +1,15 @@
|
||||
//! Assert that the current [`tracing::Span`] has a given set of fields.
|
||||
//!
|
||||
//! Can only produce meaningful positive results when tracing has been configured as in example.
|
||||
//! Absence of `tracing_error::ErrorLayer` is not detected yet.
|
||||
//!
|
||||
//! `#[cfg(test)]` code will get a pass when using the `check_fields_present` macro in case tracing
|
||||
//! is completly unconfigured.
|
||||
//!
|
||||
//! # Usage
|
||||
//!
|
||||
//! ```
|
||||
//! ```rust
|
||||
//! # fn main() {
|
||||
//! use tracing_subscriber::prelude::*;
|
||||
//! let registry = tracing_subscriber::registry()
|
||||
//! .with(tracing_error::ErrorLayer::default());
|
||||
@@ -20,23 +27,18 @@
|
||||
//!
|
||||
//! use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor};
|
||||
//! let extractor = MultiNameExtractor::new("TestExtractor", ["test", "test_id"]);
|
||||
//! match check_fields_present([&extractor]) {
|
||||
//! Ok(()) => {},
|
||||
//! Err(missing) => {
|
||||
//! panic!("Missing fields: {:?}", missing.into_iter().map(|f| f.name() ).collect::<Vec<_>>());
|
||||
//! }
|
||||
//! if let Err(missing) = check_fields_present!([&extractor]) {
|
||||
//! // if you copypaste this to a custom assert method, remember to add #[track_caller]
|
||||
//! // to get the "user" code location for the panic.
|
||||
//! panic!("Missing fields: {missing:?}");
|
||||
//! }
|
||||
//! # }
|
||||
//! ```
|
||||
//!
|
||||
//! Recommended reading: https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering
|
||||
//! Recommended reading: <https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering>
|
||||
//!
|
||||
|
||||
use std::{
|
||||
collections::HashSet,
|
||||
fmt::{self},
|
||||
hash::{Hash, Hasher},
|
||||
};
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum ExtractionResult {
|
||||
Present,
|
||||
Absent,
|
||||
@@ -71,51 +73,103 @@ impl<const L: usize> Extractor for MultiNameExtractor<L> {
|
||||
}
|
||||
}
|
||||
|
||||
struct MemoryIdentity<'a>(&'a dyn Extractor);
|
||||
|
||||
impl<'a> MemoryIdentity<'a> {
|
||||
fn as_ptr(&self) -> *const () {
|
||||
self.0 as *const _ as *const ()
|
||||
}
|
||||
}
|
||||
impl<'a> PartialEq for MemoryIdentity<'a> {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.as_ptr() == other.as_ptr()
|
||||
}
|
||||
}
|
||||
impl<'a> Eq for MemoryIdentity<'a> {}
|
||||
impl<'a> Hash for MemoryIdentity<'a> {
|
||||
fn hash<H: Hasher>(&self, state: &mut H) {
|
||||
self.as_ptr().hash(state);
|
||||
}
|
||||
}
|
||||
impl<'a> fmt::Debug for MemoryIdentity<'a> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{:p}: {}", self.as_ptr(), self.0.name())
|
||||
}
|
||||
}
|
||||
|
||||
/// The extractor names passed as keys to [`new`].
|
||||
pub fn check_fields_present<const L: usize>(
|
||||
/// Checks that the given extractors are satisfied with the current span hierarchy.
|
||||
///
|
||||
/// This should not be called directly, but used through [`check_fields_present`] which allows
|
||||
/// `Summary::Unconfigured` only when the calling crate is being `#[cfg(test)]` as a conservative default.
|
||||
#[doc(hidden)]
|
||||
pub fn check_fields_present0<const L: usize>(
|
||||
must_be_present: [&dyn Extractor; L],
|
||||
) -> Result<(), Vec<&dyn Extractor>> {
|
||||
let mut missing: HashSet<MemoryIdentity> =
|
||||
HashSet::from_iter(must_be_present.into_iter().map(|r| MemoryIdentity(r)));
|
||||
) -> Result<Summary, Vec<&dyn Extractor>> {
|
||||
let mut missing = must_be_present.into_iter().collect::<Vec<_>>();
|
||||
let trace = tracing_error::SpanTrace::capture();
|
||||
trace.with_spans(|md, _formatted_fields| {
|
||||
missing.retain(|extractor| match extractor.0.extract(md.fields()) {
|
||||
// when trying to understand the inner workings of how does the matching work, note that
|
||||
// this closure might be called zero times if the span is disabled. normally it is called
|
||||
// once per span hierarchy level.
|
||||
missing.retain(|extractor| match extractor.extract(md.fields()) {
|
||||
ExtractionResult::Present => false,
|
||||
ExtractionResult::Absent => true,
|
||||
});
|
||||
!missing.is_empty() // continue walking up until we've found all missing
|
||||
|
||||
// continue walking up until we've found all missing
|
||||
!missing.is_empty()
|
||||
});
|
||||
if missing.is_empty() {
|
||||
Ok(())
|
||||
Ok(Summary::FoundEverything)
|
||||
} else if !tracing_subscriber_configured() {
|
||||
Ok(Summary::Unconfigured)
|
||||
} else {
|
||||
Err(missing.into_iter().map(|mi| mi.0).collect())
|
||||
// we can still hit here if a tracing subscriber has been configured but the ErrorLayer is
|
||||
// missing, which can be annoying. for this case, we could probably use
|
||||
// SpanTrace::status().
|
||||
//
|
||||
// another way to end up here is with RUST_LOG=pageserver=off while configuring the
|
||||
// logging, though I guess in that case the SpanTrace::status() == EMPTY would be valid.
|
||||
// this case is covered by test `not_found_if_tracing_error_subscriber_has_wrong_filter`.
|
||||
Err(missing)
|
||||
}
|
||||
}
|
||||
|
||||
/// Checks that the given extractors are satisfied with the current span hierarchy.
|
||||
///
|
||||
/// The macro is the preferred way of checking if fields exist while passing checks if a test does
|
||||
/// not have tracing configured.
|
||||
///
|
||||
/// Why mangled name? Because #[macro_export] will expose it at utils::__check_fields_present.
|
||||
/// However we can game a module namespaced macro for `use` purposes by re-exporting the
|
||||
/// #[macro_export] exported name with an alias (below).
|
||||
#[doc(hidden)]
|
||||
#[macro_export]
|
||||
macro_rules! __check_fields_present {
|
||||
($extractors:expr) => {{
|
||||
{
|
||||
use $crate::tracing_span_assert::{check_fields_present0, Summary::*, Extractor};
|
||||
|
||||
match check_fields_present0($extractors) {
|
||||
Ok(FoundEverything) => Ok(()),
|
||||
Ok(Unconfigured) if cfg!(test) => {
|
||||
// allow unconfigured in tests
|
||||
Ok(())
|
||||
},
|
||||
Ok(Unconfigured) => {
|
||||
panic!("utils::tracing_span_assert: outside of #[cfg(test)] expected tracing to be configured with tracing_error::ErrorLayer")
|
||||
},
|
||||
Err(missing) => Err(missing)
|
||||
}
|
||||
}
|
||||
}}
|
||||
}
|
||||
|
||||
pub use crate::__check_fields_present as check_fields_present;
|
||||
|
||||
/// Explanation for why the check was deemed ok.
|
||||
///
|
||||
/// Mainly useful for testing, or configuring per-crate behaviour as in with
|
||||
/// [`check_fields_present`].
|
||||
#[derive(Debug)]
|
||||
pub enum Summary {
|
||||
/// All extractors were found.
|
||||
///
|
||||
/// Should only happen when tracing is properly configured.
|
||||
FoundEverything,
|
||||
|
||||
/// Tracing has not been configured at all. This is ok for tests running without tracing set
|
||||
/// up.
|
||||
Unconfigured,
|
||||
}
|
||||
|
||||
fn tracing_subscriber_configured() -> bool {
|
||||
let mut noop_configured = false;
|
||||
tracing::dispatcher::get_default(|d| {
|
||||
// it is possible that this closure will not be invoked, but the current implementation
|
||||
// always invokes it
|
||||
noop_configured = d.is::<tracing::subscriber::NoSubscriber>();
|
||||
});
|
||||
|
||||
!noop_configured
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
@@ -123,6 +177,36 @@ mod tests {
|
||||
|
||||
use super::*;
|
||||
|
||||
use std::{
|
||||
collections::HashSet,
|
||||
fmt::{self},
|
||||
hash::{Hash, Hasher},
|
||||
};
|
||||
|
||||
struct MemoryIdentity<'a>(&'a dyn Extractor);
|
||||
|
||||
impl<'a> MemoryIdentity<'a> {
|
||||
fn as_ptr(&self) -> *const () {
|
||||
self.0 as *const _ as *const ()
|
||||
}
|
||||
}
|
||||
impl<'a> PartialEq for MemoryIdentity<'a> {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.as_ptr() == other.as_ptr()
|
||||
}
|
||||
}
|
||||
impl<'a> Eq for MemoryIdentity<'a> {}
|
||||
impl<'a> Hash for MemoryIdentity<'a> {
|
||||
fn hash<H: Hasher>(&self, state: &mut H) {
|
||||
self.as_ptr().hash(state);
|
||||
}
|
||||
}
|
||||
impl<'a> fmt::Debug for MemoryIdentity<'a> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{:p}: {}", self.as_ptr(), self.0.name())
|
||||
}
|
||||
}
|
||||
|
||||
struct Setup {
|
||||
_current_thread_subscriber_guard: tracing::subscriber::DefaultGuard,
|
||||
tenant_extractor: MultiNameExtractor<2>,
|
||||
@@ -159,7 +243,8 @@ mod tests {
|
||||
let setup = setup_current_thread();
|
||||
let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1");
|
||||
let _guard = span.enter();
|
||||
check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap();
|
||||
let res = check_fields_present0([&setup.tenant_extractor, &setup.timeline_extractor]);
|
||||
assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -167,8 +252,8 @@ mod tests {
|
||||
let setup = setup_current_thread();
|
||||
let span = tracing::info_span!("root", timeline_id = "timeline-1");
|
||||
let _guard = span.enter();
|
||||
let missing =
|
||||
check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap_err();
|
||||
let missing = check_fields_present0([&setup.tenant_extractor, &setup.timeline_extractor])
|
||||
.unwrap_err();
|
||||
assert_missing(missing, vec![&setup.tenant_extractor]);
|
||||
}
|
||||
|
||||
@@ -185,7 +270,8 @@ mod tests {
|
||||
let span = tracing::info_span!("grandchild", timeline_id = "timeline-1");
|
||||
let _guard = span.enter();
|
||||
|
||||
check_fields_present([&setup.tenant_extractor, &setup.timeline_extractor]).unwrap();
|
||||
let res = check_fields_present0([&setup.tenant_extractor, &setup.timeline_extractor]);
|
||||
assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -198,7 +284,7 @@ mod tests {
|
||||
let span = tracing::info_span!("child", timeline_id = "timeline-1");
|
||||
let _guard = span.enter();
|
||||
|
||||
let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
|
||||
let missing = check_fields_present0([&setup.tenant_extractor]).unwrap_err();
|
||||
assert_missing(missing, vec![&setup.tenant_extractor]);
|
||||
}
|
||||
|
||||
@@ -207,7 +293,8 @@ mod tests {
|
||||
let setup = setup_current_thread();
|
||||
let span = tracing::info_span!("root", tenant_id = "tenant-1", timeline_id = "timeline-1");
|
||||
let _guard = span.enter();
|
||||
check_fields_present([&setup.tenant_extractor]).unwrap();
|
||||
let res = check_fields_present0([&setup.tenant_extractor]);
|
||||
assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -223,7 +310,8 @@ mod tests {
|
||||
let span = tracing::info_span!("grandchild", timeline_id = "timeline-1");
|
||||
let _guard = span.enter();
|
||||
|
||||
check_fields_present([&setup.tenant_extractor]).unwrap();
|
||||
let res = check_fields_present0([&setup.tenant_extractor]);
|
||||
assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -231,7 +319,7 @@ mod tests {
|
||||
let setup = setup_current_thread();
|
||||
let span = tracing::info_span!("root", timeline_id = "timeline-1");
|
||||
let _guard = span.enter();
|
||||
let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
|
||||
let missing = check_fields_present0([&setup.tenant_extractor]).unwrap_err();
|
||||
assert_missing(missing, vec![&setup.tenant_extractor]);
|
||||
}
|
||||
|
||||
@@ -245,43 +333,107 @@ mod tests {
|
||||
let span = tracing::info_span!("child", timeline_id = "timeline-1");
|
||||
let _guard = span.enter();
|
||||
|
||||
let missing = check_fields_present([&setup.tenant_extractor]).unwrap_err();
|
||||
let missing = check_fields_present0([&setup.tenant_extractor]).unwrap_err();
|
||||
assert_missing(missing, vec![&setup.tenant_extractor]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn tracing_error_subscriber_not_set_up() {
|
||||
fn tracing_error_subscriber_not_set_up_straight_line() {
|
||||
// no setup
|
||||
|
||||
let span = tracing::info_span!("foo", e = "some value");
|
||||
let _guard = span.enter();
|
||||
|
||||
let extractor = MultiNameExtractor::new("E", ["e"]);
|
||||
let missing = check_fields_present([&extractor]).unwrap_err();
|
||||
assert_missing(missing, vec![&extractor]);
|
||||
let res = check_fields_present0([&extractor]);
|
||||
assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
|
||||
|
||||
// similarly for a not found key
|
||||
let extractor = MultiNameExtractor::new("F", ["foobar"]);
|
||||
let res = check_fields_present0([&extractor]);
|
||||
assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn panics_if_tracing_error_subscriber_has_wrong_filter() {
|
||||
fn tracing_error_subscriber_not_set_up_with_instrument() {
|
||||
// no setup
|
||||
|
||||
// demo a case where span entering is used to establish a parent child connection, but
|
||||
// when we re-enter the subspan SpanTrace::with_spans iterates over nothing.
|
||||
let span = tracing::info_span!("foo", e = "some value");
|
||||
let _guard = span.enter();
|
||||
|
||||
let subspan = tracing::info_span!("bar", f = "foobar");
|
||||
drop(_guard);
|
||||
|
||||
// normally this would work, but without any tracing-subscriber configured, both
|
||||
// check_field_present find nothing
|
||||
let _guard = subspan.enter();
|
||||
let extractors: [&dyn Extractor; 2] = [
|
||||
&MultiNameExtractor::new("E", ["e"]),
|
||||
&MultiNameExtractor::new("F", ["f"]),
|
||||
];
|
||||
|
||||
let res = check_fields_present0(extractors);
|
||||
assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
|
||||
|
||||
// similarly for a not found key
|
||||
let extractor = MultiNameExtractor::new("G", ["g"]);
|
||||
let res = check_fields_present0([&extractor]);
|
||||
assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn tracing_subscriber_configured() {
|
||||
// this will fail if any utils::logging::init callers appear, but let's hope they do not
|
||||
// appear.
|
||||
assert!(!super::tracing_subscriber_configured());
|
||||
|
||||
let _g = setup_current_thread();
|
||||
|
||||
assert!(super::tracing_subscriber_configured());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn not_found_when_disabled_by_filter() {
|
||||
let r = tracing_subscriber::registry().with({
|
||||
tracing_error::ErrorLayer::default().with_filter(
|
||||
tracing_subscriber::filter::dynamic_filter_fn(|md, _| {
|
||||
if md.is_span() && *md.level() == tracing::Level::INFO {
|
||||
return false;
|
||||
}
|
||||
true
|
||||
}),
|
||||
)
|
||||
tracing_error::ErrorLayer::default().with_filter(tracing_subscriber::filter::filter_fn(
|
||||
|md| !(md.is_span() && *md.level() == tracing::Level::INFO),
|
||||
))
|
||||
});
|
||||
|
||||
let _guard = tracing::subscriber::set_default(r);
|
||||
|
||||
// this test is a rather tricky one, it has a number of possible outcomes depending on the
|
||||
// execution order when executed with other tests even if no test sets the global default
|
||||
// subscriber.
|
||||
|
||||
let span = tracing::info_span!("foo", e = "some value");
|
||||
let _guard = span.enter();
|
||||
|
||||
let extractor = MultiNameExtractor::new("E", ["e"]);
|
||||
let missing = check_fields_present([&extractor]).unwrap_err();
|
||||
assert_missing(missing, vec![&extractor]);
|
||||
let extractors: [&dyn Extractor; 1] = [&MultiNameExtractor::new("E", ["e"])];
|
||||
|
||||
if span.is_disabled() {
|
||||
// the tests are running single threaded, or we got lucky and no other tests subscriber
|
||||
// was got to register their per-CALLSITE::META interest between `set_default` and
|
||||
// creation of the span, thus the filter got to apply and registered interest of Never,
|
||||
// so the span was never created.
|
||||
//
|
||||
// as the span is disabled, no keys were recorded to it, leading check_fields_present0
|
||||
// to find an error.
|
||||
|
||||
let missing = check_fields_present0(extractors).unwrap_err();
|
||||
assert_missing(missing, vec![extractors[0]]);
|
||||
} else {
|
||||
// when the span is enabled, it is because some other test is running at the same time,
|
||||
// and that tests registry has filters which are interested in our above span.
|
||||
//
|
||||
// because the span is now enabled, all keys will be found for it. the
|
||||
// tracing_error::SpanTrace does not consider layer filters during the span hierarchy
|
||||
// walk (SpanTrace::with_spans), nor is the SpanTrace::status a reliable indicator in
|
||||
// this test-induced issue.
|
||||
|
||||
let res = check_fields_present0(extractors);
|
||||
assert!(matches!(res, Ok(Summary::FoundEverything)), "{res:?}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -35,6 +35,8 @@ humantime-serde.workspace = true
|
||||
hyper.workspace = true
|
||||
itertools.workspace = true
|
||||
nix.workspace = true
|
||||
# hack to get the number of worker threads tokio uses
|
||||
num_cpus = { version = "1.15" }
|
||||
num-traits.workspace = true
|
||||
once_cell.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
@@ -82,6 +84,7 @@ strum_macros.workspace = true
|
||||
criterion.workspace = true
|
||||
hex-literal.workspace = true
|
||||
tempfile.workspace = true
|
||||
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }
|
||||
|
||||
[[bench]]
|
||||
name = "bench_layer_map"
|
||||
|
||||
@@ -13,6 +13,7 @@ clap = { workspace = true, features = ["string"] }
|
||||
git-version.workspace = true
|
||||
pageserver = { path = ".." }
|
||||
postgres_ffi.workspace = true
|
||||
tokio.workspace = true
|
||||
utils.workspace = true
|
||||
svg_fmt.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -7,10 +7,10 @@
|
||||
//! - The y axis represents LSN, growing upwards.
|
||||
//!
|
||||
//! Coordinates in both axis are compressed for better readability.
|
||||
//! (see https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb)
|
||||
//! (see <https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb>)
|
||||
//!
|
||||
//! Example use:
|
||||
//! ```
|
||||
//! ```bash
|
||||
//! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
|
||||
//! $ grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
|
||||
//! $ firefox out.svg
|
||||
@@ -20,7 +20,7 @@
|
||||
//! or from pageserver log files.
|
||||
//!
|
||||
//! TODO Consider shipping this as a grafana panel plugin:
|
||||
//! https://grafana.com/tutorials/build-a-panel-plugin/
|
||||
//! <https://grafana.com/tutorials/build-a-panel-plugin/>
|
||||
use anyhow::Result;
|
||||
use pageserver::repository::Key;
|
||||
use std::cmp::Ordering;
|
||||
|
||||
@@ -95,7 +95,7 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
|
||||
}
|
||||
|
||||
// Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH"
|
||||
fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
|
||||
async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
|
||||
let file = FileBlockReader::new(VirtualFile::open(path)?);
|
||||
let summary_blk = file.read_blk(0)?;
|
||||
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
|
||||
@@ -129,7 +129,7 @@ fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
|
||||
Ok(holes)
|
||||
}
|
||||
|
||||
pub(crate) fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
|
||||
pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
|
||||
let storage_path = &cmd.path;
|
||||
let max_holes = cmd.max_holes.unwrap_or(DEFAULT_MAX_HOLES);
|
||||
|
||||
@@ -160,7 +160,7 @@ pub(crate) fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
|
||||
parse_filename(&layer.file_name().into_string().unwrap())
|
||||
{
|
||||
if layer_file.is_delta {
|
||||
layer_file.holes = get_holes(&layer.path(), max_holes)?;
|
||||
layer_file.holes = get_holes(&layer.path(), max_holes).await?;
|
||||
n_deltas += 1;
|
||||
}
|
||||
layers.push(layer_file);
|
||||
|
||||
@@ -43,8 +43,7 @@ pub(crate) enum LayerCmd {
|
||||
},
|
||||
}
|
||||
|
||||
fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
|
||||
use pageserver::tenant::blob_io::BlobCursor;
|
||||
async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
|
||||
use pageserver::tenant::block_io::BlockReader;
|
||||
|
||||
let path = path.as_ref();
|
||||
@@ -69,7 +68,7 @@ fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
|
||||
true
|
||||
},
|
||||
)?;
|
||||
let mut cursor = BlockCursor::new(&file);
|
||||
let cursor = BlockCursor::new(&file);
|
||||
for (k, v) in all {
|
||||
let value = cursor.read_blob(v.pos())?;
|
||||
println!("key:{} value_len:{}", k, value.len());
|
||||
@@ -78,7 +77,7 @@ fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
match cmd {
|
||||
LayerCmd::List { path } => {
|
||||
for tenant in fs::read_dir(path.join("tenants"))? {
|
||||
@@ -153,7 +152,7 @@ pub(crate) fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
);
|
||||
|
||||
if layer_file.is_delta {
|
||||
read_delta_file(layer.path())?;
|
||||
read_delta_file(layer.path()).await?;
|
||||
} else {
|
||||
anyhow::bail!("not supported yet :(");
|
||||
}
|
||||
|
||||
@@ -72,12 +72,13 @@ struct AnalyzeLayerMapCmd {
|
||||
max_holes: Option<usize>,
|
||||
}
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let cli = CliOpts::parse();
|
||||
|
||||
match cli.command {
|
||||
Commands::Layer(cmd) => {
|
||||
layers::main(&cmd)?;
|
||||
layers::main(&cmd).await?;
|
||||
}
|
||||
Commands::Metadata(cmd) => {
|
||||
handle_metadata(&cmd)?;
|
||||
@@ -86,7 +87,7 @@ fn main() -> anyhow::Result<()> {
|
||||
draw_timeline_dir::main()?;
|
||||
}
|
||||
Commands::AnalyzeLayerMap(cmd) => {
|
||||
layer_map_analyzer::main(&cmd)?;
|
||||
layer_map_analyzer::main(&cmd).await?;
|
||||
}
|
||||
Commands::PrintLayerFile(cmd) => {
|
||||
if let Err(e) = read_pg_control_file(&cmd.path) {
|
||||
@@ -94,7 +95,7 @@ fn main() -> anyhow::Result<()> {
|
||||
"Failed to read input file as a pg control one: {e:#}\n\
|
||||
Attempting to read it as layer file"
|
||||
);
|
||||
print_layerfile(&cmd.path)?;
|
||||
print_layerfile(&cmd.path).await?;
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -113,12 +114,12 @@ fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn print_layerfile(path: &Path) -> anyhow::Result<()> {
|
||||
async fn print_layerfile(path: &Path) -> anyhow::Result<()> {
|
||||
// Basic initialization of things that don't change after startup
|
||||
virtual_file::init(10);
|
||||
page_cache::init(100);
|
||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||
dump_layerfile_from_path(path, true, &ctx)
|
||||
dump_layerfile_from_path(path, true, &ctx).await
|
||||
}
|
||||
|
||||
fn handle_metadata(
|
||||
|
||||
@@ -19,12 +19,6 @@ use tokio::io;
|
||||
use tokio::io::AsyncWrite;
|
||||
use tracing::*;
|
||||
|
||||
/// NB: This relies on a modified version of tokio_tar that does *not* write the
|
||||
/// end-of-archive marker (1024 zero bytes), when the Builder struct is dropped
|
||||
/// without explicitly calling 'finish' or 'into_inner'!
|
||||
///
|
||||
/// See https://github.com/neondatabase/tokio-tar/pull/1
|
||||
///
|
||||
use tokio_tar::{Builder, EntryType, Header};
|
||||
|
||||
use crate::context::RequestContext;
|
||||
|
||||
@@ -396,8 +396,8 @@ fn start_pageserver(
|
||||
|
||||
let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
|
||||
|
||||
let init_sizes_done = tokio::select! {
|
||||
_ = &mut init_sizes_done => {
|
||||
let init_sizes_done = match tokio::time::timeout(timeout, &mut init_sizes_done).await {
|
||||
Ok(_) => {
|
||||
let now = std::time::Instant::now();
|
||||
tracing::info!(
|
||||
from_init_done_millis = (now - init_done).as_millis(),
|
||||
@@ -406,7 +406,7 @@ fn start_pageserver(
|
||||
);
|
||||
None
|
||||
}
|
||||
_ = tokio::time::sleep(timeout) => {
|
||||
Err(_) => {
|
||||
tracing::info!(
|
||||
timeout_millis = timeout.as_millis(),
|
||||
"Initial logical size timeout elapsed; starting background jobs"
|
||||
|
||||
@@ -33,7 +33,8 @@ use crate::tenant::config::TenantConf;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME};
|
||||
use crate::{
|
||||
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_UNINIT_MARK_SUFFIX,
|
||||
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
|
||||
TIMELINE_UNINIT_MARK_SUFFIX,
|
||||
};
|
||||
|
||||
pub mod defaults {
|
||||
@@ -171,11 +172,13 @@ pub struct PageServerConf {
|
||||
|
||||
pub log_format: LogFormat,
|
||||
|
||||
/// Number of concurrent [`Tenant::gather_size_inputs`] allowed.
|
||||
/// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed.
|
||||
pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
|
||||
/// Limit of concurrent [`Tenant::gather_size_inputs`] issued by module `eviction_task`.
|
||||
/// The number of permits is the same as `concurrent_tenant_size_logical_size_queries`.
|
||||
/// See the comment in `eviction_task` for details.
|
||||
///
|
||||
/// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs
|
||||
pub eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore,
|
||||
|
||||
// How often to collect metrics and send them to the metrics endpoint.
|
||||
@@ -599,6 +602,17 @@ impl PageServerConf {
|
||||
)
|
||||
}
|
||||
|
||||
pub fn timeline_delete_mark_file_path(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> PathBuf {
|
||||
path_with_suffix_extension(
|
||||
self.timeline_path(&tenant_id, &timeline_id),
|
||||
TIMELINE_DELETE_MARK_SUFFIX,
|
||||
)
|
||||
}
|
||||
|
||||
pub fn traces_path(&self) -> PathBuf {
|
||||
self.workdir.join("traces")
|
||||
}
|
||||
@@ -993,6 +1007,8 @@ impl ConfigurableSemaphore {
|
||||
/// Require a non-zero initial permits, because using permits == 0 is a crude way to disable a
|
||||
/// feature such as [`Tenant::gather_size_inputs`]. Otherwise any semaphore using future will
|
||||
/// behave like [`futures::future::pending`], just waiting until new permits are added.
|
||||
///
|
||||
/// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs
|
||||
pub fn new(initial_permits: NonZeroUsize) -> Self {
|
||||
ConfigurableSemaphore {
|
||||
initial_permits,
|
||||
|
||||
@@ -7,27 +7,23 @@ use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
|
||||
use crate::tenant::{mgr, LogicalSizeCalculationCause};
|
||||
use anyhow;
|
||||
use chrono::Utc;
|
||||
use chrono::{DateTime, Utc};
|
||||
use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
|
||||
use pageserver_api::models::TenantState;
|
||||
use reqwest::Url;
|
||||
use serde::Serialize;
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use std::collections::HashMap;
|
||||
use std::time::Duration;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, SystemTime};
|
||||
use tracing::*;
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
|
||||
const WRITTEN_SIZE: &str = "written_size";
|
||||
const SYNTHETIC_STORAGE_SIZE: &str = "synthetic_storage_size";
|
||||
const RESIDENT_SIZE: &str = "resident_size";
|
||||
const REMOTE_STORAGE_SIZE: &str = "remote_storage_size";
|
||||
const TIMELINE_LOGICAL_SIZE: &str = "timeline_logical_size";
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Debug)]
|
||||
#[derive(Serialize, Debug, Clone, Copy)]
|
||||
struct Ids {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
tenant_id: TenantId,
|
||||
@@ -38,10 +34,142 @@ struct Ids {
|
||||
|
||||
/// Key that uniquely identifies the object, this metric describes.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||
pub struct PageserverConsumptionMetricsKey {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: Option<TimelineId>,
|
||||
pub metric: &'static str,
|
||||
struct MetricsKey {
|
||||
tenant_id: TenantId,
|
||||
timeline_id: Option<TimelineId>,
|
||||
metric: &'static str,
|
||||
}
|
||||
|
||||
impl MetricsKey {
|
||||
const fn absolute_values(self) -> AbsoluteValueFactory {
|
||||
AbsoluteValueFactory(self)
|
||||
}
|
||||
const fn incremental_values(self) -> IncrementalValueFactory {
|
||||
IncrementalValueFactory(self)
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper type which each individual metric kind can return to produce only absolute values.
|
||||
struct AbsoluteValueFactory(MetricsKey);
|
||||
|
||||
impl AbsoluteValueFactory {
|
||||
fn at(self, time: DateTime<Utc>, val: u64) -> (MetricsKey, (EventType, u64)) {
|
||||
let key = self.0;
|
||||
(key, (EventType::Absolute { time }, val))
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper type which each individual metric kind can return to produce only incremental values.
|
||||
struct IncrementalValueFactory(MetricsKey);
|
||||
|
||||
impl IncrementalValueFactory {
|
||||
#[allow(clippy::wrong_self_convention)]
|
||||
fn from_previous_up_to(
|
||||
self,
|
||||
prev_end: DateTime<Utc>,
|
||||
up_to: DateTime<Utc>,
|
||||
val: u64,
|
||||
) -> (MetricsKey, (EventType, u64)) {
|
||||
let key = self.0;
|
||||
// cannot assert prev_end < up_to because these are realtime clock based
|
||||
(
|
||||
key,
|
||||
(
|
||||
EventType::Incremental {
|
||||
start_time: prev_end,
|
||||
stop_time: up_to,
|
||||
},
|
||||
val,
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
fn key(&self) -> &MetricsKey {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
// the static part of a MetricsKey
|
||||
impl MetricsKey {
|
||||
/// Absolute value of [`Timeline::get_last_record_lsn`].
|
||||
///
|
||||
/// [`Timeline::get_last_record_lsn`]: crate::tenant::Timeline::get_last_record_lsn
|
||||
const fn written_size(tenant_id: TenantId, timeline_id: TimelineId) -> AbsoluteValueFactory {
|
||||
MetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: Some(timeline_id),
|
||||
metric: "written_size",
|
||||
}
|
||||
.absolute_values()
|
||||
}
|
||||
|
||||
/// Values will be the difference of the latest [`MetricsKey::written_size`] to what we
|
||||
/// previously sent, starting from the previously sent incremental time range ending at the
|
||||
/// latest absolute measurement.
|
||||
const fn written_size_delta(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> IncrementalValueFactory {
|
||||
MetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: Some(timeline_id),
|
||||
// the name here is correctly about data not size, because that is what is wanted by
|
||||
// downstream pipeline
|
||||
metric: "written_data_bytes_delta",
|
||||
}
|
||||
.incremental_values()
|
||||
}
|
||||
|
||||
/// Exact [`Timeline::get_current_logical_size`].
|
||||
///
|
||||
/// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
|
||||
const fn timeline_logical_size(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> AbsoluteValueFactory {
|
||||
MetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: Some(timeline_id),
|
||||
metric: "timeline_logical_size",
|
||||
}
|
||||
.absolute_values()
|
||||
}
|
||||
|
||||
/// [`Tenant::remote_size`]
|
||||
///
|
||||
/// [`Tenant::remote_size`]: crate::tenant::Tenant::remote_size
|
||||
const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory {
|
||||
MetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: None,
|
||||
metric: "remote_storage_size",
|
||||
}
|
||||
.absolute_values()
|
||||
}
|
||||
|
||||
/// Sum of [`Timeline::resident_physical_size`] for each `Tenant`.
|
||||
///
|
||||
/// [`Timeline::resident_physical_size`]: crate::tenant::Timeline::resident_physical_size
|
||||
const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory {
|
||||
MetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: None,
|
||||
metric: "resident_size",
|
||||
}
|
||||
.absolute_values()
|
||||
}
|
||||
|
||||
/// [`Tenant::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`].
|
||||
///
|
||||
/// [`Tenant::cached_synthetic_size`]: crate::tenant::Tenant::cached_synthetic_size
|
||||
const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory {
|
||||
MetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: None,
|
||||
metric: "synthetic_storage_size",
|
||||
}
|
||||
.absolute_values()
|
||||
}
|
||||
}
|
||||
|
||||
/// Main thread that serves metrics collection
|
||||
@@ -79,7 +207,7 @@ pub async fn collect_metrics(
|
||||
.timeout(DEFAULT_HTTP_REPORTING_TIMEOUT)
|
||||
.build()
|
||||
.expect("Failed to create http client with timeout");
|
||||
let mut cached_metrics: HashMap<PageserverConsumptionMetricsKey, u64> = HashMap::new();
|
||||
let mut cached_metrics = HashMap::new();
|
||||
let mut prev_iteration_time: std::time::Instant = std::time::Instant::now();
|
||||
|
||||
loop {
|
||||
@@ -119,15 +247,15 @@ pub async fn collect_metrics(
|
||||
///
|
||||
/// TODO
|
||||
/// - refactor this function (chunking+sending part) to reuse it in proxy module;
|
||||
pub async fn collect_metrics_iteration(
|
||||
async fn collect_metrics_iteration(
|
||||
client: &reqwest::Client,
|
||||
cached_metrics: &mut HashMap<PageserverConsumptionMetricsKey, u64>,
|
||||
cached_metrics: &mut HashMap<MetricsKey, (EventType, u64)>,
|
||||
metric_collection_endpoint: &reqwest::Url,
|
||||
node_id: NodeId,
|
||||
ctx: &RequestContext,
|
||||
send_cached: bool,
|
||||
) {
|
||||
let mut current_metrics: Vec<(PageserverConsumptionMetricsKey, u64)> = Vec::new();
|
||||
let mut current_metrics: Vec<(MetricsKey, (EventType, u64))> = Vec::new();
|
||||
trace!(
|
||||
"starting collect_metrics_iteration. metric_collection_endpoint: {}",
|
||||
metric_collection_endpoint
|
||||
@@ -161,99 +289,65 @@ pub async fn collect_metrics_iteration(
|
||||
let mut tenant_resident_size = 0;
|
||||
|
||||
// iterate through list of timelines in tenant
|
||||
for timeline in tenant.list_timelines().iter() {
|
||||
for timeline in tenant.list_timelines() {
|
||||
// collect per-timeline metrics only for active timelines
|
||||
if timeline.is_active() {
|
||||
let timeline_written_size = u64::from(timeline.get_last_record_lsn());
|
||||
|
||||
current_metrics.push((
|
||||
PageserverConsumptionMetricsKey {
|
||||
let timeline_id = timeline.timeline_id;
|
||||
|
||||
match TimelineSnapshot::collect(&timeline, ctx) {
|
||||
Ok(Some(snap)) => {
|
||||
snap.to_metrics(
|
||||
tenant_id,
|
||||
timeline_id: Some(timeline.timeline_id),
|
||||
metric: WRITTEN_SIZE,
|
||||
},
|
||||
timeline_written_size,
|
||||
));
|
||||
|
||||
let span = info_span!("collect_metrics_iteration", tenant_id = %timeline.tenant_id, timeline_id = %timeline.timeline_id);
|
||||
match span.in_scope(|| timeline.get_current_logical_size(ctx)) {
|
||||
// Only send timeline logical size when it is fully calculated.
|
||||
Ok((size, is_exact)) if is_exact => {
|
||||
current_metrics.push((
|
||||
PageserverConsumptionMetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: Some(timeline.timeline_id),
|
||||
metric: TIMELINE_LOGICAL_SIZE,
|
||||
},
|
||||
size,
|
||||
));
|
||||
}
|
||||
Ok((_, _)) => {}
|
||||
Err(err) => {
|
||||
error!(
|
||||
"failed to get current logical size for timeline {}: {err:?}",
|
||||
timeline.timeline_id
|
||||
);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
timeline_id,
|
||||
Utc::now(),
|
||||
&mut current_metrics,
|
||||
cached_metrics,
|
||||
);
|
||||
}
|
||||
Ok(None) => {}
|
||||
Err(e) => {
|
||||
error!(
|
||||
"failed to get metrics values for tenant {tenant_id} timeline {}: {e:#?}",
|
||||
timeline.timeline_id
|
||||
);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
let timeline_resident_size = timeline.get_resident_physical_size();
|
||||
tenant_resident_size += timeline_resident_size;
|
||||
tenant_resident_size += timeline.resident_physical_size();
|
||||
}
|
||||
|
||||
match tenant.get_remote_size().await {
|
||||
Ok(tenant_remote_size) => {
|
||||
current_metrics.push((
|
||||
PageserverConsumptionMetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: None,
|
||||
metric: REMOTE_STORAGE_SIZE,
|
||||
},
|
||||
tenant_remote_size,
|
||||
));
|
||||
}
|
||||
Err(err) => {
|
||||
error!(
|
||||
"failed to get remote size for tenant {}: {err:?}",
|
||||
tenant_id
|
||||
);
|
||||
}
|
||||
}
|
||||
current_metrics
|
||||
.push(MetricsKey::remote_storage_size(tenant_id).at(Utc::now(), tenant.remote_size()));
|
||||
|
||||
current_metrics.push((
|
||||
PageserverConsumptionMetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: None,
|
||||
metric: RESIDENT_SIZE,
|
||||
},
|
||||
tenant_resident_size,
|
||||
));
|
||||
current_metrics
|
||||
.push(MetricsKey::resident_size(tenant_id).at(Utc::now(), tenant_resident_size));
|
||||
|
||||
// Note that this metric is calculated in a separate bgworker
|
||||
// Here we only use cached value, which may lag behind the real latest one
|
||||
let tenant_synthetic_size = tenant.get_cached_synthetic_size();
|
||||
let synthetic_size = tenant.cached_synthetic_size();
|
||||
|
||||
if tenant_synthetic_size != 0 {
|
||||
if synthetic_size != 0 {
|
||||
// only send non-zeroes because otherwise these show up as errors in logs
|
||||
current_metrics.push((
|
||||
PageserverConsumptionMetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: None,
|
||||
metric: SYNTHETIC_STORAGE_SIZE,
|
||||
},
|
||||
tenant_synthetic_size,
|
||||
));
|
||||
current_metrics
|
||||
.push(MetricsKey::synthetic_size(tenant_id).at(Utc::now(), synthetic_size));
|
||||
}
|
||||
}
|
||||
|
||||
// Filter metrics, unless we want to send all metrics, including cached ones.
|
||||
// See: https://github.com/neondatabase/neon/issues/3485
|
||||
if !send_cached {
|
||||
current_metrics.retain(|(curr_key, curr_val)| match cached_metrics.get(curr_key) {
|
||||
Some(val) => val != curr_val,
|
||||
None => true,
|
||||
current_metrics.retain(|(curr_key, (kind, curr_val))| {
|
||||
if kind.is_incremental() {
|
||||
// incremental values (currently only written_size_delta) should not get any cache
|
||||
// deduplication because they will be used by upstream for "is still alive."
|
||||
true
|
||||
} else {
|
||||
match cached_metrics.get(curr_key) {
|
||||
Some((_, val)) => val != curr_val,
|
||||
None => true,
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@@ -268,14 +362,16 @@ pub async fn collect_metrics_iteration(
|
||||
|
||||
let mut chunk_to_send: Vec<Event<Ids>> = Vec::with_capacity(CHUNK_SIZE);
|
||||
|
||||
let node_id = node_id.to_string();
|
||||
|
||||
for chunk in chunks {
|
||||
chunk_to_send.clear();
|
||||
|
||||
// enrich metrics with type,timestamp and idempotency key before sending
|
||||
chunk_to_send.extend(chunk.iter().map(|(curr_key, curr_val)| Event {
|
||||
kind: EventType::Absolute { time: Utc::now() },
|
||||
chunk_to_send.extend(chunk.iter().map(|(curr_key, (when, curr_val))| Event {
|
||||
kind: *when,
|
||||
metric: curr_key.metric,
|
||||
idempotency_key: idempotency_key(node_id.to_string()),
|
||||
idempotency_key: idempotency_key(&node_id),
|
||||
value: *curr_val,
|
||||
extra: Ids {
|
||||
tenant_id: curr_key.tenant_id,
|
||||
@@ -283,17 +379,14 @@ pub async fn collect_metrics_iteration(
|
||||
},
|
||||
}));
|
||||
|
||||
let chunk_json = serde_json::value::to_raw_value(&EventChunk {
|
||||
events: &chunk_to_send,
|
||||
})
|
||||
.expect("PageserverConsumptionMetric should not fail serialization");
|
||||
|
||||
const MAX_RETRIES: u32 = 3;
|
||||
|
||||
for attempt in 0..MAX_RETRIES {
|
||||
let res = client
|
||||
.post(metric_collection_endpoint.clone())
|
||||
.json(&chunk_json)
|
||||
.json(&EventChunk {
|
||||
events: (&chunk_to_send).into(),
|
||||
})
|
||||
.send()
|
||||
.await;
|
||||
|
||||
@@ -329,6 +422,130 @@ pub async fn collect_metrics_iteration(
|
||||
}
|
||||
}
|
||||
|
||||
/// Internal type to make timeline metric production testable.
|
||||
///
|
||||
/// As this value type contains all of the information needed from a timeline to produce the
|
||||
/// metrics, it can easily be created with different values in test.
|
||||
struct TimelineSnapshot {
|
||||
loaded_at: (Lsn, SystemTime),
|
||||
last_record_lsn: Lsn,
|
||||
current_exact_logical_size: Option<u64>,
|
||||
}
|
||||
|
||||
impl TimelineSnapshot {
|
||||
/// Collect the metrics from an actual timeline.
|
||||
///
|
||||
/// Fails currently only when [`Timeline::get_current_logical_size`] fails.
|
||||
///
|
||||
/// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
|
||||
fn collect(
|
||||
t: &Arc<crate::tenant::Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Option<Self>> {
|
||||
use anyhow::Context;
|
||||
|
||||
if !t.is_active() {
|
||||
// no collection for broken or stopping needed, we will still keep the cached values
|
||||
// though at the caller.
|
||||
Ok(None)
|
||||
} else {
|
||||
let loaded_at = t.loaded_at;
|
||||
let last_record_lsn = t.get_last_record_lsn();
|
||||
|
||||
let current_exact_logical_size = {
|
||||
let span = info_span!("collect_metrics_iteration", tenant_id = %t.tenant_id, timeline_id = %t.timeline_id);
|
||||
let res = span
|
||||
.in_scope(|| t.get_current_logical_size(ctx))
|
||||
.context("get_current_logical_size");
|
||||
match res? {
|
||||
// Only send timeline logical size when it is fully calculated.
|
||||
(size, is_exact) if is_exact => Some(size),
|
||||
(_, _) => None,
|
||||
}
|
||||
};
|
||||
|
||||
Ok(Some(TimelineSnapshot {
|
||||
loaded_at,
|
||||
last_record_lsn,
|
||||
current_exact_logical_size,
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
/// Produce the timeline consumption metrics into the `metrics` argument.
|
||||
fn to_metrics(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
now: DateTime<Utc>,
|
||||
metrics: &mut Vec<(MetricsKey, (EventType, u64))>,
|
||||
cache: &HashMap<MetricsKey, (EventType, u64)>,
|
||||
) {
|
||||
let timeline_written_size = u64::from(self.last_record_lsn);
|
||||
|
||||
let (key, written_size_now) =
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(now, timeline_written_size);
|
||||
|
||||
// last_record_lsn can only go up, right now at least, TODO: #2592 or related
|
||||
// features might change this.
|
||||
|
||||
let written_size_delta_key = MetricsKey::written_size_delta(tenant_id, timeline_id);
|
||||
|
||||
// use this when available, because in a stream of incremental values, it will be
|
||||
// accurate where as when last_record_lsn stops moving, we will only cache the last
|
||||
// one of those.
|
||||
let last_stop_time = cache
|
||||
.get(written_size_delta_key.key())
|
||||
.map(|(until, _val)| {
|
||||
until
|
||||
.incremental_timerange()
|
||||
.expect("never create EventType::Absolute for written_size_delta")
|
||||
.end
|
||||
});
|
||||
|
||||
// by default, use the last sent written_size as the basis for
|
||||
// calculating the delta. if we don't yet have one, use the load time value.
|
||||
let prev = cache
|
||||
.get(&key)
|
||||
.map(|(prev_at, prev)| {
|
||||
// use the prev time from our last incremental update, or default to latest
|
||||
// absolute update on the first round.
|
||||
let prev_at = prev_at
|
||||
.absolute_time()
|
||||
.expect("never create EventType::Incremental for written_size");
|
||||
let prev_at = last_stop_time.unwrap_or(prev_at);
|
||||
(*prev_at, *prev)
|
||||
})
|
||||
.unwrap_or_else(|| {
|
||||
// if we don't have a previous point of comparison, compare to the load time
|
||||
// lsn.
|
||||
let (disk_consistent_lsn, loaded_at) = &self.loaded_at;
|
||||
(DateTime::from(*loaded_at), disk_consistent_lsn.0)
|
||||
});
|
||||
|
||||
// written_size_bytes_delta
|
||||
metrics.extend(
|
||||
if let Some(delta) = written_size_now.1.checked_sub(prev.1) {
|
||||
let up_to = written_size_now
|
||||
.0
|
||||
.absolute_time()
|
||||
.expect("never create EventType::Incremental for written_size");
|
||||
let key_value = written_size_delta_key.from_previous_up_to(prev.0, *up_to, delta);
|
||||
Some(key_value)
|
||||
} else {
|
||||
None
|
||||
},
|
||||
);
|
||||
|
||||
// written_size
|
||||
metrics.push((key, written_size_now));
|
||||
|
||||
if let Some(size) = self.current_exact_logical_size {
|
||||
metrics.push(MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, size));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Caclculate synthetic size for each active tenant
|
||||
pub async fn calculate_synthetic_size_worker(
|
||||
synthetic_size_calculation_interval: Duration,
|
||||
@@ -343,7 +560,7 @@ pub async fn calculate_synthetic_size_worker(
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
return Ok(());
|
||||
},
|
||||
tick_at = ticker.tick() => {
|
||||
tick_at = ticker.tick() => {
|
||||
|
||||
let tenants = match mgr::list_tenants().await {
|
||||
Ok(tenants) => tenants,
|
||||
@@ -379,3 +596,149 @@ pub async fn calculate_synthetic_size_worker(
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::HashMap;
|
||||
|
||||
use std::time::SystemTime;
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use crate::consumption_metrics::MetricsKey;
|
||||
|
||||
use super::TimelineSnapshot;
|
||||
use chrono::{DateTime, Utc};
|
||||
|
||||
#[test]
|
||||
fn startup_collected_timeline_metrics_before_advancing() {
|
||||
let tenant_id = TenantId::generate();
|
||||
let timeline_id = TimelineId::generate();
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
let cache = HashMap::new();
|
||||
|
||||
let initdb_lsn = Lsn(0x10000);
|
||||
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
|
||||
|
||||
let snap = TimelineSnapshot {
|
||||
loaded_at: (disk_consistent_lsn, SystemTime::now()),
|
||||
last_record_lsn: disk_consistent_lsn,
|
||||
current_exact_logical_size: Some(0x42000),
|
||||
};
|
||||
|
||||
let now = DateTime::<Utc>::from(SystemTime::now());
|
||||
|
||||
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
|
||||
|
||||
assert_eq!(
|
||||
metrics,
|
||||
&[
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
|
||||
snap.loaded_at.1.into(),
|
||||
now,
|
||||
0
|
||||
),
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
|
||||
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn startup_collected_timeline_metrics_second_round() {
|
||||
let tenant_id = TenantId::generate();
|
||||
let timeline_id = TimelineId::generate();
|
||||
|
||||
let [now, before, init] = time_backwards();
|
||||
|
||||
let now = DateTime::<Utc>::from(now);
|
||||
let before = DateTime::<Utc>::from(before);
|
||||
|
||||
let initdb_lsn = Lsn(0x10000);
|
||||
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
let cache = HashMap::from([
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0)
|
||||
]);
|
||||
|
||||
let snap = TimelineSnapshot {
|
||||
loaded_at: (disk_consistent_lsn, init),
|
||||
last_record_lsn: disk_consistent_lsn,
|
||||
current_exact_logical_size: Some(0x42000),
|
||||
};
|
||||
|
||||
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
|
||||
|
||||
assert_eq!(
|
||||
metrics,
|
||||
&[
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id)
|
||||
.from_previous_up_to(before, now, 0),
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
|
||||
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
|
||||
let tenant_id = TenantId::generate();
|
||||
let timeline_id = TimelineId::generate();
|
||||
|
||||
let [now, just_before, before, init] = time_backwards();
|
||||
|
||||
let now = DateTime::<Utc>::from(now);
|
||||
let just_before = DateTime::<Utc>::from(just_before);
|
||||
let before = DateTime::<Utc>::from(before);
|
||||
|
||||
let initdb_lsn = Lsn(0x10000);
|
||||
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
let cache = HashMap::from([
|
||||
// at t=before was the last time the last_record_lsn changed
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0),
|
||||
// end time of this event is used for the next ones
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
|
||||
before,
|
||||
just_before,
|
||||
0,
|
||||
),
|
||||
]);
|
||||
|
||||
let snap = TimelineSnapshot {
|
||||
loaded_at: (disk_consistent_lsn, init),
|
||||
last_record_lsn: disk_consistent_lsn,
|
||||
current_exact_logical_size: Some(0x42000),
|
||||
};
|
||||
|
||||
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
|
||||
|
||||
assert_eq!(
|
||||
metrics,
|
||||
&[
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
|
||||
just_before,
|
||||
now,
|
||||
0
|
||||
),
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
|
||||
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
fn time_backwards<const N: usize>() -> [std::time::SystemTime; N] {
|
||||
let mut times = [std::time::SystemTime::UNIX_EPOCH; N];
|
||||
times[0] = std::time::SystemTime::now();
|
||||
for behind in 1..N {
|
||||
times[behind] = times[0] - std::time::Duration::from_secs(behind as u64);
|
||||
}
|
||||
|
||||
times
|
||||
}
|
||||
}
|
||||
|
||||
@@ -179,6 +179,9 @@ impl RequestContext {
|
||||
/// a context and you are unwilling to change all callers to provide one.
|
||||
///
|
||||
/// Before we add cancellation, we should get rid of this method.
|
||||
///
|
||||
/// [`attached_child`]: Self::attached_child
|
||||
/// [`detached_child`]: Self::detached_child
|
||||
pub fn todo_child(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
|
||||
Self::new(task_kind, download_behavior)
|
||||
}
|
||||
|
||||
@@ -60,7 +60,7 @@ use utils::serde_percent::Percent;
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
||||
tenant::{self, storage_layer::PersistentLayer, Timeline},
|
||||
tenant::{self, storage_layer::PersistentLayer, timeline::EvictionError, Timeline},
|
||||
};
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
@@ -166,11 +166,11 @@ async fn disk_usage_eviction_task(
|
||||
.await;
|
||||
|
||||
let sleep_until = start + task_config.period;
|
||||
tokio::select! {
|
||||
_ = tokio::time::sleep_until(sleep_until) => {},
|
||||
_ = cancel.cancelled() => {
|
||||
break
|
||||
}
|
||||
if tokio::time::timeout_at(sleep_until, cancel.cancelled())
|
||||
.await
|
||||
.is_ok()
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -390,13 +390,22 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
assert_eq!(results.len(), batch.len());
|
||||
for (result, layer) in results.into_iter().zip(batch.iter()) {
|
||||
match result {
|
||||
Some(Ok(true)) => {
|
||||
Some(Ok(())) => {
|
||||
usage_assumed.add_available_bytes(layer.file_size());
|
||||
}
|
||||
Some(Ok(false)) => {
|
||||
// this is:
|
||||
// - Replacement::{NotFound, Unexpected}
|
||||
// - it cannot be is_remote_layer, filtered already
|
||||
Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
|
||||
unreachable!("get_local_layers_for_disk_usage_eviction finds only local layers")
|
||||
}
|
||||
Some(Err(EvictionError::FileNotFound)) => {
|
||||
evictions_failed.file_sizes += layer.file_size();
|
||||
evictions_failed.count += 1;
|
||||
}
|
||||
Some(Err(
|
||||
e @ EvictionError::LayerNotFound(_)
|
||||
| e @ EvictionError::StatFailed(_),
|
||||
)) => {
|
||||
let e = utils::error::report_compact_sources(&e);
|
||||
warn!(%layer, "failed to evict layer: {e}");
|
||||
evictions_failed.file_sizes += layer.file_size();
|
||||
evictions_failed.count += 1;
|
||||
}
|
||||
@@ -404,10 +413,6 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
assert!(cancel.is_cancelled());
|
||||
return;
|
||||
}
|
||||
Some(Err(e)) => {
|
||||
// we really shouldn't be getting this, precondition failure
|
||||
error!("failed to evict layer: {:#}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -540,12 +545,12 @@ async fn collect_eviction_candidates(
|
||||
// We could be better here, e.g., sum of all L0 layers + most recent L1 layer.
|
||||
// That's what's typically used by the various background loops.
|
||||
//
|
||||
// The default can be overriden with a fixed value in the tenant conf.
|
||||
// The default can be overridden with a fixed value in the tenant conf.
|
||||
// A default override can be put in the default tenant conf in the pageserver.toml.
|
||||
let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() {
|
||||
debug!(
|
||||
tenant_id=%tenant.tenant_id(),
|
||||
overriden_size=s,
|
||||
overridden_size=s,
|
||||
"using overridden min resident size for tenant"
|
||||
);
|
||||
s
|
||||
|
||||
@@ -994,31 +994,29 @@ async fn timeline_gc_handler(
|
||||
// Run compaction immediately on given timeline.
|
||||
async fn timeline_compact_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let result_receiver = mgr::immediate_compact(tenant_id, timeline_id, &ctx)
|
||||
.await
|
||||
.context("spawn compaction task")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
let result: anyhow::Result<()> = result_receiver
|
||||
.await
|
||||
.context("receive compaction result")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
result.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
async {
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
||||
timeline
|
||||
.compact(&cancel, &ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
.instrument(info_span!("manual_compaction", %tenant_id, %timeline_id))
|
||||
.await
|
||||
}
|
||||
|
||||
// Run checkpoint immediately on given timeline.
|
||||
async fn timeline_checkpoint_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
@@ -1031,13 +1029,13 @@ async fn timeline_checkpoint_handler(
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
timeline
|
||||
.compact(&ctx)
|
||||
.compact(&cancel, &ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
.instrument(info_span!("manual_checkpoint", tenant_id = %tenant_id, timeline_id = %timeline_id))
|
||||
.instrument(info_span!("manual_checkpoint", %tenant_id, %timeline_id))
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -1143,7 +1141,7 @@ async fn disk_usage_eviction_run(
|
||||
let Some(storage) = state.remote_storage.clone() else {
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"remote storage not configured, cannot run eviction iteration"
|
||||
)))
|
||||
)));
|
||||
};
|
||||
|
||||
let state = state.disk_usage_eviction_state.clone();
|
||||
|
||||
@@ -109,6 +109,8 @@ pub const TEMP_FILE_SUFFIX: &str = "___temp";
|
||||
/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>___uninit`.
|
||||
pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
|
||||
|
||||
pub const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";
|
||||
|
||||
/// A marker file to prevent pageserver from loading a certain tenant on restart.
|
||||
/// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding
|
||||
/// `ignore` management API command, that expects the ignored tenant to be properly loaded
|
||||
@@ -123,15 +125,30 @@ pub fn is_temporary(path: &Path) -> bool {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_uninit_mark(path: &Path) -> bool {
|
||||
fn ends_with_suffix(path: &Path, suffix: &str) -> bool {
|
||||
match path.file_name() {
|
||||
Some(name) => name
|
||||
.to_string_lossy()
|
||||
.ends_with(TIMELINE_UNINIT_MARK_SUFFIX),
|
||||
Some(name) => name.to_string_lossy().ends_with(suffix),
|
||||
None => false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_uninit_mark(path: &Path) -> bool {
|
||||
ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
|
||||
}
|
||||
|
||||
pub fn is_delete_mark(path: &Path) -> bool {
|
||||
ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX)
|
||||
}
|
||||
|
||||
fn is_walkdir_io_not_found(e: &walkdir::Error) -> bool {
|
||||
if let Some(e) = e.io_error() {
|
||||
if e.kind() == std::io::ErrorKind::NotFound {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
/// During pageserver startup, we need to order operations not to exhaust tokio worker threads by
|
||||
/// blocking.
|
||||
///
|
||||
|
||||
@@ -6,7 +6,6 @@ use metrics::{
|
||||
IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::models::TenantState;
|
||||
use strum::VariantNames;
|
||||
use strum_macros::{EnumVariantNames, IntoStaticStr};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
@@ -74,7 +73,7 @@ pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
// Buckets for background operations like compaction, GC, size calculation
|
||||
const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0];
|
||||
|
||||
pub static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_storage_operations_seconds_global",
|
||||
"Time spent on storage operations",
|
||||
@@ -84,18 +83,17 @@ pub static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static READ_NUM_FS_LAYERS: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
pub(crate) static READ_NUM_FS_LAYERS: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_read_num_fs_layers",
|
||||
"Number of persistent layers accessed for processing a read request, including those in the cache",
|
||||
&["tenant_id", "timeline_id"],
|
||||
vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
// Metrics collected on operations on the storage repository.
|
||||
pub static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
pub(crate) static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_getpage_reconstruct_seconds",
|
||||
"Time spent in reconstruct_value (reconstruct a page from deltas)",
|
||||
@@ -104,7 +102,7 @@ pub static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
|
||||
pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_materialized_cache_hits_direct_total",
|
||||
"Number of cache hits from materialized page cache without redo",
|
||||
@@ -112,17 +110,16 @@ pub static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static GET_RECONSTRUCT_DATA_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_getpage_get_reconstruct_data_seconds",
|
||||
"Time spent in get_reconstruct_value_data",
|
||||
&["tenant_id", "timeline_id"],
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
|
||||
pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_materialized_cache_hits_total",
|
||||
"Number of cache hits from materialized page cache",
|
||||
@@ -246,11 +243,10 @@ pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheS
|
||||
},
|
||||
});
|
||||
|
||||
static WAIT_LSN_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
pub(crate) static WAIT_LSN_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_wait_lsn_seconds",
|
||||
"Time spent waiting for WAL to arrive",
|
||||
&["tenant_id", "timeline_id"],
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
@@ -284,7 +280,7 @@ static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_remote_ondemand_downloaded_layers_total",
|
||||
"Total on-demand downloaded layers"
|
||||
@@ -292,7 +288,7 @@ pub static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
|
||||
pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_remote_ondemand_downloaded_bytes_total",
|
||||
"Total bytes of layers on-demand downloaded",
|
||||
@@ -309,16 +305,29 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define current logical size metric")
|
||||
});
|
||||
|
||||
pub static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_tenant_states_count",
|
||||
"Count of tenants per state",
|
||||
&["tenant_id", "state"]
|
||||
&["state"]
|
||||
)
|
||||
.expect("Failed to register pageserver_tenant_states_count metric")
|
||||
});
|
||||
|
||||
pub static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
/// A set of broken tenants.
|
||||
///
|
||||
/// These are expected to be so rare that a set is fine. Set as in a new timeseries per each broken
|
||||
/// tenant.
|
||||
pub(crate) static BROKEN_TENANTS_SET: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_broken_tenants_count",
|
||||
"Set of broken tenants",
|
||||
&["tenant_id"]
|
||||
)
|
||||
.expect("Failed to register pageserver_tenant_states_count metric")
|
||||
});
|
||||
|
||||
pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_tenant_synthetic_cached_size_bytes",
|
||||
"Synthetic size of each tenant in bytes",
|
||||
@@ -376,7 +385,7 @@ static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy<IntCounterVec> = Lazy::new(||
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
pub(crate) static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_unexpected_ondemand_downloads_count",
|
||||
"Number of unexpected on-demand downloads. \
|
||||
@@ -385,7 +394,7 @@ pub static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
/// Each [`Timeline`]'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
|
||||
/// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
|
||||
#[derive(Debug)]
|
||||
pub struct EvictionsWithLowResidenceDuration {
|
||||
data_source: &'static str,
|
||||
@@ -499,23 +508,31 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
|
||||
30.000, // 30000 ms
|
||||
];
|
||||
|
||||
const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[
|
||||
"open", "close", "read", "write", "seek", "fsync", "gc", "metadata",
|
||||
];
|
||||
|
||||
const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
|
||||
|
||||
pub static STORAGE_IO_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
/// Tracks time taken by fs operations near VirtualFile.
|
||||
///
|
||||
/// Operations:
|
||||
/// - open ([`std::fs::OpenOptions::open`])
|
||||
/// - close (dropping [`std::fs::File`])
|
||||
/// - close-by-replace (close by replacement algorithm)
|
||||
/// - read (`read_at`)
|
||||
/// - write (`write_at`)
|
||||
/// - seek (modify internal position or file length query)
|
||||
/// - fsync ([`std::fs::File::sync_all`])
|
||||
/// - metadata ([`std::fs::File::metadata`])
|
||||
pub(crate) static STORAGE_IO_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_io_operations_seconds",
|
||||
"Time spent in IO operations",
|
||||
&["operation", "tenant_id", "timeline_id"],
|
||||
&["operation"],
|
||||
STORAGE_IO_TIME_BUCKETS.into()
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
|
||||
|
||||
// Needed for the https://neonprod.grafana.net/d/5uK9tHL4k/picking-tenant-for-relocation?orgId=1
|
||||
pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
"pageserver_io_operations_bytes_total",
|
||||
"Total amount of bytes read/written in IO operations",
|
||||
@@ -541,6 +558,17 @@ pub static SMGR_QUERY_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
// keep in sync with control plane Go code so that we can validate
|
||||
// compute's basebackup_ms metric with our perspective in the context of SLI/SLO.
|
||||
static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
|
||||
// Go code uses milliseconds. Variable is called `computeStartupBuckets`
|
||||
[
|
||||
5, 10, 20, 30, 50, 70, 100, 120, 150, 200, 250, 300, 350, 400, 450, 500, 600, 800, 1000,
|
||||
1500, 2000, 2500, 3000, 5000, 10000, 20000, 40000, 60000,
|
||||
]
|
||||
.map(|ms| (ms as f64) / 1000.0)
|
||||
});
|
||||
|
||||
pub struct BasebackupQueryTime(HistogramVec);
|
||||
pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
|
||||
BasebackupQueryTime({
|
||||
@@ -548,7 +576,7 @@ pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
|
||||
"pageserver_basebackup_query_seconds",
|
||||
"Histogram of basebackup queries durations, by result type",
|
||||
&["result"],
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
COMPUTE_STARTUP_BUCKETS.to_vec(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
})
|
||||
@@ -594,7 +622,7 @@ static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new
|
||||
at a given instant. It gives you a better idea of the queue depth \
|
||||
than plotting the gauge directly, since operations may complete faster \
|
||||
than the sampling interval.",
|
||||
&["tenant_id", "timeline_id", "file_kind", "op_kind"],
|
||||
&["file_kind", "op_kind"],
|
||||
// The calls_unfinished gauge is an integer gauge, hence we have integer buckets.
|
||||
vec![0.0, 1.0, 2.0, 4.0, 6.0, 8.0, 10.0, 15.0, 20.0, 40.0, 60.0, 80.0, 100.0, 500.0],
|
||||
)
|
||||
@@ -651,18 +679,18 @@ impl RemoteOpFileKind {
|
||||
}
|
||||
}
|
||||
|
||||
pub static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
pub(crate) static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_remote_operation_seconds",
|
||||
"Time spent on remote storage operations. \
|
||||
Grouped by tenant, timeline, operation_kind and status. \
|
||||
Does not account for time spent waiting in remote timeline client's queues.",
|
||||
&["tenant_id", "timeline_id", "file_kind", "op_kind", "status"]
|
||||
&["file_kind", "op_kind", "status"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_tenant_task_events",
|
||||
"Number of task start/stop/fail events.",
|
||||
@@ -671,7 +699,7 @@ pub static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
.expect("Failed to register tenant_task_events metric")
|
||||
});
|
||||
|
||||
pub static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_background_loop_period_overrun_count",
|
||||
"Incremented whenever warn_when_period_overrun() logs a warning.",
|
||||
@@ -682,7 +710,7 @@ pub static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new
|
||||
|
||||
// walreceiver metrics
|
||||
|
||||
pub static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
pub(crate) static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_walreceiver_started_connections_total",
|
||||
"Number of started walreceiver connections"
|
||||
@@ -690,7 +718,7 @@ pub static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
|
||||
pub(crate) static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
|
||||
register_int_gauge!(
|
||||
"pageserver_walreceiver_active_managers",
|
||||
"Number of active walreceiver managers"
|
||||
@@ -698,7 +726,7 @@ pub static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
pub(crate) static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_walreceiver_switches_total",
|
||||
"Number of walreceiver manager change_connection calls",
|
||||
@@ -707,7 +735,7 @@ pub static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
|
||||
pub(crate) static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_walreceiver_broker_updates_total",
|
||||
"Number of received broker updates in walreceiver"
|
||||
@@ -715,7 +743,7 @@ pub static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
pub(crate) static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_walreceiver_candidates_events_total",
|
||||
"Number of walreceiver candidate events",
|
||||
@@ -724,10 +752,10 @@ pub static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
|
||||
pub(crate) static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
|
||||
Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["add"]));
|
||||
|
||||
pub static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
|
||||
pub(crate) static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
|
||||
Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"]));
|
||||
|
||||
// Metrics collected on WAL redo operations
|
||||
@@ -774,7 +802,7 @@ macro_rules! redo_bytes_histogram_count_buckets {
|
||||
};
|
||||
}
|
||||
|
||||
pub static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_wal_redo_seconds",
|
||||
"Time spent on WAL redo",
|
||||
@@ -783,7 +811,7 @@ pub static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
pub(crate) static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_wal_redo_wait_seconds",
|
||||
"Time spent waiting for access to the Postgres WAL redo process",
|
||||
@@ -792,7 +820,7 @@ pub static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
||||
pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_wal_redo_records_histogram",
|
||||
"Histogram of number of records replayed per redo in the Postgres WAL redo process",
|
||||
@@ -801,7 +829,7 @@ pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
||||
pub(crate) static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_wal_redo_bytes_histogram",
|
||||
"Histogram of number of records replayed per redo sent to Postgres",
|
||||
@@ -810,7 +838,8 @@ pub static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
|
||||
// FIXME: isn't this already included by WAL_REDO_RECORDS_HISTOGRAM which has _count?
|
||||
pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_replayed_wal_records_total",
|
||||
"Number of WAL records replayed in WAL redo process"
|
||||
@@ -818,7 +847,7 @@ pub static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
/// Similar to [`prometheus::HistogramTimer`] but does not record on drop.
|
||||
/// Similar to `prometheus::HistogramTimer` but does not record on drop.
|
||||
pub struct StorageTimeMetricsTimer {
|
||||
metrics: StorageTimeMetrics,
|
||||
start: Instant,
|
||||
@@ -876,7 +905,7 @@ impl StorageTimeMetrics {
|
||||
|
||||
/// Starts timing a new operation.
|
||||
///
|
||||
/// Note: unlike [`prometheus::HistogramTimer`] the returned timer does not record on drop.
|
||||
/// Note: unlike `prometheus::HistogramTimer` the returned timer does not record on drop.
|
||||
pub fn start_timer(&self) -> StorageTimeMetricsTimer {
|
||||
StorageTimeMetricsTimer::new(self.clone())
|
||||
}
|
||||
@@ -886,7 +915,6 @@ impl StorageTimeMetrics {
|
||||
pub struct TimelineMetrics {
|
||||
tenant_id: String,
|
||||
timeline_id: String,
|
||||
pub get_reconstruct_data_time_histo: Histogram,
|
||||
pub flush_time_histo: StorageTimeMetrics,
|
||||
pub compact_time_histo: StorageTimeMetrics,
|
||||
pub create_images_time_histo: StorageTimeMetrics,
|
||||
@@ -895,9 +923,7 @@ pub struct TimelineMetrics {
|
||||
pub load_layer_map_histo: StorageTimeMetrics,
|
||||
pub garbage_collect_histo: StorageTimeMetrics,
|
||||
pub last_record_gauge: IntGauge,
|
||||
pub wait_lsn_time_histo: Histogram,
|
||||
pub resident_physical_size_gauge: UIntGauge,
|
||||
pub read_num_fs_layers: Histogram,
|
||||
/// copy of LayeredTimeline.current_logical_size
|
||||
pub current_logical_size_gauge: UIntGauge,
|
||||
pub num_persistent_files_created: IntCounter,
|
||||
@@ -914,9 +940,6 @@ impl TimelineMetrics {
|
||||
) -> Self {
|
||||
let tenant_id = tenant_id.to_string();
|
||||
let timeline_id = timeline_id.to_string();
|
||||
let get_reconstruct_data_time_histo = GET_RECONSTRUCT_DATA_TIME
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let flush_time_histo =
|
||||
StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
|
||||
let compact_time_histo =
|
||||
@@ -937,9 +960,6 @@ impl TimelineMetrics {
|
||||
let last_record_gauge = LAST_RECORD_LSN
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let wait_lsn_time_histo = WAIT_LSN_TIME
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
@@ -955,16 +975,12 @@ impl TimelineMetrics {
|
||||
let evictions = EVICTIONS
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let read_num_fs_layers = READ_NUM_FS_LAYERS
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let evictions_with_low_residence_duration =
|
||||
evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);
|
||||
|
||||
TimelineMetrics {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
get_reconstruct_data_time_histo,
|
||||
flush_time_histo,
|
||||
compact_time_histo,
|
||||
create_images_time_histo,
|
||||
@@ -973,7 +989,6 @@ impl TimelineMetrics {
|
||||
garbage_collect_histo,
|
||||
load_layer_map_histo,
|
||||
last_record_gauge,
|
||||
wait_lsn_time_histo,
|
||||
resident_physical_size_gauge,
|
||||
current_logical_size_gauge,
|
||||
num_persistent_files_created,
|
||||
@@ -982,7 +997,6 @@ impl TimelineMetrics {
|
||||
evictions_with_low_residence_duration: std::sync::RwLock::new(
|
||||
evictions_with_low_residence_duration,
|
||||
),
|
||||
read_num_fs_layers,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -991,15 +1005,12 @@ impl Drop for TimelineMetrics {
|
||||
fn drop(&mut self) {
|
||||
let tenant_id = &self.tenant_id;
|
||||
let timeline_id = &self.timeline_id;
|
||||
let _ = GET_RECONSTRUCT_DATA_TIME.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = READ_NUM_FS_LAYERS.remove_label_values(&[tenant_id, timeline_id]);
|
||||
|
||||
self.evictions_with_low_residence_duration
|
||||
.write()
|
||||
@@ -1011,9 +1022,6 @@ impl Drop for TimelineMetrics {
|
||||
let _ =
|
||||
STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
|
||||
}
|
||||
for op in STORAGE_IO_TIME_OPERATIONS {
|
||||
let _ = STORAGE_IO_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
|
||||
}
|
||||
|
||||
for op in STORAGE_IO_SIZE_OPERATIONS {
|
||||
let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, timeline_id]);
|
||||
@@ -1028,9 +1036,7 @@ impl Drop for TimelineMetrics {
|
||||
pub fn remove_tenant_metrics(tenant_id: &TenantId) {
|
||||
let tid = tenant_id.to_string();
|
||||
let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
|
||||
for state in TenantState::VARIANTS {
|
||||
let _ = TENANT_STATE_METRIC.remove_label_values(&[&tid, state]);
|
||||
}
|
||||
// we leave the BROKEN_TENANTS_SET entry if any
|
||||
}
|
||||
|
||||
use futures::Future;
|
||||
@@ -1045,9 +1051,7 @@ pub struct RemoteTimelineClientMetrics {
|
||||
tenant_id: String,
|
||||
timeline_id: String,
|
||||
remote_physical_size_gauge: Mutex<Option<UIntGauge>>,
|
||||
remote_operation_time: Mutex<HashMap<(&'static str, &'static str, &'static str), Histogram>>,
|
||||
calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
|
||||
calls_started_hist: Mutex<HashMap<(&'static str, &'static str), Histogram>>,
|
||||
bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
|
||||
bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
|
||||
}
|
||||
@@ -1057,14 +1061,13 @@ impl RemoteTimelineClientMetrics {
|
||||
RemoteTimelineClientMetrics {
|
||||
tenant_id: tenant_id.to_string(),
|
||||
timeline_id: timeline_id.to_string(),
|
||||
remote_operation_time: Mutex::new(HashMap::default()),
|
||||
calls_unfinished_gauge: Mutex::new(HashMap::default()),
|
||||
calls_started_hist: Mutex::new(HashMap::default()),
|
||||
bytes_started_counter: Mutex::new(HashMap::default()),
|
||||
bytes_finished_counter: Mutex::new(HashMap::default()),
|
||||
remote_physical_size_gauge: Mutex::new(None),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn remote_physical_size_gauge(&self) -> UIntGauge {
|
||||
let mut guard = self.remote_physical_size_gauge.lock().unwrap();
|
||||
guard
|
||||
@@ -1078,26 +1081,17 @@ impl RemoteTimelineClientMetrics {
|
||||
})
|
||||
.clone()
|
||||
}
|
||||
|
||||
pub fn remote_operation_time(
|
||||
&self,
|
||||
file_kind: &RemoteOpFileKind,
|
||||
op_kind: &RemoteOpKind,
|
||||
status: &'static str,
|
||||
) -> Histogram {
|
||||
let mut guard = self.remote_operation_time.lock().unwrap();
|
||||
let key = (file_kind.as_str(), op_kind.as_str(), status);
|
||||
let metric = guard.entry(key).or_insert_with(move || {
|
||||
REMOTE_OPERATION_TIME
|
||||
.get_metric_with_label_values(&[
|
||||
&self.tenant_id.to_string(),
|
||||
&self.timeline_id.to_string(),
|
||||
key.0,
|
||||
key.1,
|
||||
key.2,
|
||||
])
|
||||
.unwrap()
|
||||
});
|
||||
metric.clone()
|
||||
REMOTE_OPERATION_TIME
|
||||
.get_metric_with_label_values(&[key.0, key.1, key.2])
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn calls_unfinished_gauge(
|
||||
@@ -1125,19 +1119,10 @@ impl RemoteTimelineClientMetrics {
|
||||
file_kind: &RemoteOpFileKind,
|
||||
op_kind: &RemoteOpKind,
|
||||
) -> Histogram {
|
||||
let mut guard = self.calls_started_hist.lock().unwrap();
|
||||
let key = (file_kind.as_str(), op_kind.as_str());
|
||||
let metric = guard.entry(key).or_insert_with(move || {
|
||||
REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST
|
||||
.get_metric_with_label_values(&[
|
||||
&self.tenant_id.to_string(),
|
||||
&self.timeline_id.to_string(),
|
||||
key.0,
|
||||
key.1,
|
||||
])
|
||||
.unwrap()
|
||||
});
|
||||
metric.clone()
|
||||
REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST
|
||||
.get_metric_with_label_values(&[key.0, key.1])
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn bytes_started_counter(
|
||||
@@ -1256,7 +1241,7 @@ impl RemoteTimelineClientMetrics {
|
||||
/// Update the metrics that change when a call to the remote timeline client instance starts.
|
||||
///
|
||||
/// Drop the returned guard object once the operation is finished to updates corresponding metrics that track completions.
|
||||
/// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`] if that
|
||||
/// Or, use [`RemoteTimelineClientCallMetricGuard::will_decrement_manually`] and [`call_end`](Self::call_end) if that
|
||||
/// is more suitable.
|
||||
/// Never do both.
|
||||
pub(crate) fn call_begin(
|
||||
@@ -1289,7 +1274,7 @@ impl RemoteTimelineClientMetrics {
|
||||
|
||||
/// Manually udpate the metrics that track completions, instead of using the guard object.
|
||||
/// Using the guard object is generally preferable.
|
||||
/// See [`call_begin`] for more context.
|
||||
/// See [`call_begin`](Self::call_begin) for more context.
|
||||
pub(crate) fn call_end(
|
||||
&self,
|
||||
file_kind: &RemoteOpFileKind,
|
||||
@@ -1317,15 +1302,10 @@ impl Drop for RemoteTimelineClientMetrics {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
remote_physical_size_gauge,
|
||||
remote_operation_time,
|
||||
calls_unfinished_gauge,
|
||||
calls_started_hist,
|
||||
bytes_started_counter,
|
||||
bytes_finished_counter,
|
||||
} = self;
|
||||
for ((a, b, c), _) in remote_operation_time.get_mut().unwrap().drain() {
|
||||
let _ = REMOTE_OPERATION_TIME.remove_label_values(&[tenant_id, timeline_id, a, b, c]);
|
||||
}
|
||||
for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() {
|
||||
let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[
|
||||
tenant_id,
|
||||
@@ -1334,14 +1314,6 @@ impl Drop for RemoteTimelineClientMetrics {
|
||||
b,
|
||||
]);
|
||||
}
|
||||
for ((a, b), _) in calls_started_hist.get_mut().unwrap().drain() {
|
||||
let _ = REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST.remove_label_values(&[
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
a,
|
||||
b,
|
||||
]);
|
||||
}
|
||||
for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() {
|
||||
let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[
|
||||
tenant_id,
|
||||
@@ -1423,15 +1395,51 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
|
||||
}
|
||||
|
||||
pub fn preinitialize_metrics() {
|
||||
// We want to alert on this metric increasing.
|
||||
// Initialize it eagerly, so that our alert rule can distinguish absence of the metric from metric value 0.
|
||||
assert_eq!(UNEXPECTED_ONDEMAND_DOWNLOADS.get(), 0);
|
||||
UNEXPECTED_ONDEMAND_DOWNLOADS.reset();
|
||||
// Python tests need these and on some we do alerting.
|
||||
//
|
||||
// FIXME(4813): make it so that we have no top level metrics as this fn will easily fall out of
|
||||
// order:
|
||||
// - global metrics reside in a Lazy<PageserverMetrics>
|
||||
// - access via crate::metrics::PS_METRICS.materialized_page_cache_hit.inc()
|
||||
// - could move the statics into TimelineMetrics::new()?
|
||||
|
||||
// Same as above for this metric, but, it's a Vec-type metric for which we don't know all the labels.
|
||||
BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT.reset();
|
||||
// counters
|
||||
[
|
||||
&MATERIALIZED_PAGE_CACHE_HIT,
|
||||
&MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
|
||||
&UNEXPECTED_ONDEMAND_DOWNLOADS,
|
||||
&WALRECEIVER_STARTED_CONNECTIONS,
|
||||
&WALRECEIVER_BROKER_UPDATES,
|
||||
&WALRECEIVER_CANDIDATES_ADDED,
|
||||
&WALRECEIVER_CANDIDATES_REMOVED,
|
||||
]
|
||||
.into_iter()
|
||||
.for_each(|c| {
|
||||
Lazy::force(c);
|
||||
});
|
||||
|
||||
// Python tests need these.
|
||||
MATERIALIZED_PAGE_CACHE_HIT_DIRECT.get();
|
||||
MATERIALIZED_PAGE_CACHE_HIT.get();
|
||||
// countervecs
|
||||
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
|
||||
.into_iter()
|
||||
.for_each(|c| {
|
||||
Lazy::force(c);
|
||||
});
|
||||
|
||||
// gauges
|
||||
WALRECEIVER_ACTIVE_MANAGERS.get();
|
||||
|
||||
// histograms
|
||||
[
|
||||
&READ_NUM_FS_LAYERS,
|
||||
&RECONSTRUCT_TIME,
|
||||
&WAIT_LSN_TIME,
|
||||
&WAL_REDO_TIME,
|
||||
&WAL_REDO_WAIT_TIME,
|
||||
&WAL_REDO_RECORDS_HISTOGRAM,
|
||||
&WAL_REDO_BYTES_HISTOGRAM,
|
||||
]
|
||||
.into_iter()
|
||||
.for_each(|h| {
|
||||
Lazy::force(h);
|
||||
});
|
||||
}
|
||||
|
||||
@@ -1131,7 +1131,7 @@ impl<'a> DatadirModification<'a> {
|
||||
/// context, breaking the atomicity is OK. If the import is interrupted, the
|
||||
/// whole import fails and the timeline will be deleted anyway.
|
||||
/// (Or to be precise, it will be left behind for debugging purposes and
|
||||
/// ignored, see https://github.com/neondatabase/neon/pull/1809)
|
||||
/// ignored, see <https://github.com/neondatabase/neon/pull/1809>)
|
||||
///
|
||||
/// Note: A consequence of flushing the pending operations is that they
|
||||
/// won't be visible to subsequent operations until `commit`. The function
|
||||
|
||||
@@ -130,11 +130,25 @@ pub static WALRECEIVER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
|
||||
pub static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
|
||||
tokio::runtime::Builder::new_multi_thread()
|
||||
.thread_name("background op worker")
|
||||
// if you change the number of worker threads please change the constant below
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("Failed to create background op runtime")
|
||||
});
|
||||
|
||||
pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy<usize> = Lazy::new(|| {
|
||||
// force init and thus panics
|
||||
let _ = BACKGROUND_RUNTIME.handle();
|
||||
// replicates tokio-1.28.1::loom::sys::num_cpus which is not available publicly
|
||||
// tokio would had already panicked for parsing errors or NotUnicode
|
||||
//
|
||||
// this will be wrong if any of the runtimes gets their worker threads configured to something
|
||||
// else, but that has not been needed in a long time.
|
||||
std::env::var("TOKIO_WORKER_THREADS")
|
||||
.map(|s| s.parse::<usize>().unwrap())
|
||||
.unwrap_or_else(|_e| usize::max(1, num_cpus::get()))
|
||||
});
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct PageserverTaskId(u64);
|
||||
|
||||
@@ -205,7 +219,7 @@ pub enum TaskKind {
|
||||
///
|
||||
/// Walreceiver uses its own abstraction called `TaskHandle` to represent the activity of establishing and handling a connection.
|
||||
/// That abstraction doesn't use `task_mgr`.
|
||||
/// The [`WalReceiverManager`] task ensures that this `TaskHandle` task does not outlive the [`WalReceiverManager`] task.
|
||||
/// The `WalReceiverManager` task ensures that this `TaskHandle` task does not outlive the `WalReceiverManager` task.
|
||||
/// For the `RequestContext` that we hand to the TaskHandle, we use the [`WalReceiverConnectionHandler`] task kind.
|
||||
///
|
||||
/// Once the connection is established, the `TaskHandle` task creates a
|
||||
@@ -213,16 +227,21 @@ pub enum TaskKind {
|
||||
/// the `Connection` object.
|
||||
/// A `CancellationToken` created by the `TaskHandle` task ensures
|
||||
/// that the [`WalReceiverConnectionPoller`] task will cancel soon after as the `TaskHandle` is dropped.
|
||||
///
|
||||
/// [`WalReceiverConnectionHandler`]: Self::WalReceiverConnectionHandler
|
||||
/// [`WalReceiverConnectionPoller`]: Self::WalReceiverConnectionPoller
|
||||
WalReceiverManager,
|
||||
|
||||
/// The `TaskHandle` task that executes [`walreceiver_connection::handle_walreceiver_connection`].
|
||||
/// The `TaskHandle` task that executes `handle_walreceiver_connection`.
|
||||
/// Not a `task_mgr` task, but we use this `TaskKind` for its `RequestContext`.
|
||||
/// See the comment on [`WalReceiverManager`].
|
||||
///
|
||||
/// [`WalReceiverManager`]: Self::WalReceiverManager
|
||||
WalReceiverConnectionHandler,
|
||||
|
||||
/// The task that polls the `tokio-postgres::Connection` object.
|
||||
/// Spawned by task [`WalReceiverConnectionHandler`].
|
||||
/// See the comment on [`WalReceiverManager`].
|
||||
/// Spawned by task [`WalReceiverConnectionHandler`](Self::WalReceiverConnectionHandler).
|
||||
/// See the comment on [`WalReceiverManager`](Self::WalReceiverManager).
|
||||
WalReceiverConnectionPoller,
|
||||
|
||||
// Garbage collection worker. One per tenant
|
||||
@@ -506,17 +525,13 @@ pub async fn shutdown_tasks(
|
||||
warn!(name = task.name, tenant_id = ?tenant_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
|
||||
}
|
||||
}
|
||||
let join_handle = tokio::select! {
|
||||
biased;
|
||||
_ = &mut join_handle => { None },
|
||||
_ = tokio::time::sleep(std::time::Duration::from_secs(1)) => {
|
||||
// allow some time to elapse before logging to cut down the number of log
|
||||
// lines.
|
||||
info!("waiting for {} to shut down", task.name);
|
||||
Some(join_handle)
|
||||
}
|
||||
};
|
||||
if let Some(join_handle) = join_handle {
|
||||
if tokio::time::timeout(std::time::Duration::from_secs(1), &mut join_handle)
|
||||
.await
|
||||
.is_err()
|
||||
{
|
||||
// allow some time to elapse before logging to cut down the number of log
|
||||
// lines.
|
||||
info!("waiting for {} to shut down", task.name);
|
||||
// we never handled this return value, but:
|
||||
// - we don't deschedule which would lead to is_cancelled
|
||||
// - panics are already logged (is_panicked)
|
||||
@@ -544,7 +559,7 @@ pub fn current_task_id() -> Option<PageserverTaskId> {
|
||||
pub async fn shutdown_watcher() {
|
||||
let token = SHUTDOWN_TOKEN
|
||||
.try_with(|t| t.clone())
|
||||
.expect("shutdown_requested() called in an unexpected task or thread");
|
||||
.expect("shutdown_watcher() called in an unexpected task or thread");
|
||||
|
||||
token.cancelled().await;
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -16,30 +16,20 @@ use crate::tenant::block_io::{BlockCursor, BlockReader};
|
||||
use std::cmp::min;
|
||||
use std::io::{Error, ErrorKind};
|
||||
|
||||
/// For reading
|
||||
pub trait BlobCursor {
|
||||
impl<R> BlockCursor<R>
|
||||
where
|
||||
R: BlockReader,
|
||||
{
|
||||
/// Read a blob into a new buffer.
|
||||
fn read_blob(&mut self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
|
||||
pub fn read_blob(&self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
|
||||
let mut buf = Vec::new();
|
||||
self.read_blob_into_buf(offset, &mut buf)?;
|
||||
Ok(buf)
|
||||
}
|
||||
|
||||
/// Read blob into the given buffer. Any previous contents in the buffer
|
||||
/// are overwritten.
|
||||
fn read_blob_into_buf(
|
||||
&mut self,
|
||||
offset: u64,
|
||||
dstbuf: &mut Vec<u8>,
|
||||
) -> Result<(), std::io::Error>;
|
||||
}
|
||||
|
||||
impl<R> BlobCursor for BlockCursor<R>
|
||||
where
|
||||
R: BlockReader,
|
||||
{
|
||||
fn read_blob_into_buf(
|
||||
&mut self,
|
||||
pub fn read_blob_into_buf(
|
||||
&self,
|
||||
offset: u64,
|
||||
dstbuf: &mut Vec<u8>,
|
||||
) -> Result<(), std::io::Error> {
|
||||
|
||||
@@ -80,7 +80,7 @@ where
|
||||
BlockCursor { reader }
|
||||
}
|
||||
|
||||
pub fn read_blk(&mut self, blknum: u32) -> Result<R::BlockLease, std::io::Error> {
|
||||
pub fn read_blk(&self, blknum: u32) -> Result<R::BlockLease, std::io::Error> {
|
||||
self.reader.read_blk(blknum)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -390,39 +390,42 @@ where
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub fn dump(&self) -> Result<()> {
|
||||
self.dump_recurse(self.root_blk, &[], 0)
|
||||
}
|
||||
pub async fn dump(&self) -> Result<()> {
|
||||
let mut stack = Vec::new();
|
||||
|
||||
fn dump_recurse(&self, blknum: u32, path: &[u8], depth: usize) -> Result<()> {
|
||||
let blk = self.reader.read_blk(self.start_blk + blknum)?;
|
||||
let buf: &[u8] = blk.as_ref();
|
||||
stack.push((self.root_blk, String::new(), 0, 0, 0));
|
||||
|
||||
let node = OnDiskNode::<L>::deparse(buf)?;
|
||||
while let Some((blknum, path, depth, child_idx, key_off)) = stack.pop() {
|
||||
let blk = self.reader.read_blk(self.start_blk + blknum)?;
|
||||
let buf: &[u8] = blk.as_ref();
|
||||
let node = OnDiskNode::<L>::deparse(buf)?;
|
||||
|
||||
print!("{:indent$}", "", indent = depth * 2);
|
||||
println!(
|
||||
"blk #{}: path {}: prefix {}, suffix_len {}",
|
||||
blknum,
|
||||
hex::encode(path),
|
||||
hex::encode(node.prefix),
|
||||
node.suffix_len
|
||||
);
|
||||
if child_idx == 0 {
|
||||
print!("{:indent$}", "", indent = depth * 2);
|
||||
let path_prefix = stack
|
||||
.iter()
|
||||
.map(|(_blknum, path, ..)| path.as_str())
|
||||
.collect::<String>();
|
||||
println!(
|
||||
"blk #{blknum}: path {path_prefix}{path}: prefix {}, suffix_len {}",
|
||||
hex::encode(node.prefix),
|
||||
node.suffix_len
|
||||
);
|
||||
}
|
||||
|
||||
let mut idx = 0;
|
||||
let mut key_off = 0;
|
||||
while idx < node.num_children {
|
||||
if child_idx + 1 < node.num_children {
|
||||
let key_off = key_off + node.suffix_len as usize;
|
||||
stack.push((blknum, path.clone(), depth, child_idx + 1, key_off));
|
||||
}
|
||||
let key = &node.keys[key_off..key_off + node.suffix_len as usize];
|
||||
let val = node.value(idx as usize);
|
||||
let val = node.value(child_idx as usize);
|
||||
|
||||
print!("{:indent$}", "", indent = depth * 2 + 2);
|
||||
println!("{}: {}", hex::encode(key), hex::encode(val.0));
|
||||
|
||||
if node.level > 0 {
|
||||
let child_path = [path, node.prefix].concat();
|
||||
self.dump_recurse(val.to_blknum(), &child_path, depth + 1)?;
|
||||
stack.push((val.to_blknum(), hex::encode(node.prefix), depth + 1, 0, 0));
|
||||
}
|
||||
idx += 1;
|
||||
key_off += node.suffix_len as usize;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -442,7 +445,7 @@ where
|
||||
writer: W,
|
||||
|
||||
///
|
||||
/// stack[0] is the current root page, stack.last() is the leaf.
|
||||
/// `stack[0]` is the current root page, `stack.last()` is the leaf.
|
||||
///
|
||||
/// We maintain the length of the stack to be always greater than zero.
|
||||
/// Two exceptions are:
|
||||
@@ -754,8 +757,8 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn basic() -> Result<()> {
|
||||
#[tokio::test]
|
||||
async fn basic() -> Result<()> {
|
||||
let mut disk = TestDisk::new();
|
||||
let mut writer = DiskBtreeBuilder::<_, 6>::new(&mut disk);
|
||||
|
||||
@@ -775,7 +778,7 @@ mod tests {
|
||||
|
||||
let reader = DiskBtreeReader::new(0, root_offset, disk);
|
||||
|
||||
reader.dump()?;
|
||||
reader.dump().await?;
|
||||
|
||||
// Test the `get` function on all the keys.
|
||||
for (key, val) in all_data.iter() {
|
||||
@@ -835,8 +838,8 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn lots_of_keys() -> Result<()> {
|
||||
#[tokio::test]
|
||||
async fn lots_of_keys() -> Result<()> {
|
||||
let mut disk = TestDisk::new();
|
||||
let mut writer = DiskBtreeBuilder::<_, 8>::new(&mut disk);
|
||||
|
||||
@@ -856,7 +859,7 @@ mod tests {
|
||||
|
||||
let reader = DiskBtreeReader::new(0, root_offset, disk);
|
||||
|
||||
reader.dump()?;
|
||||
reader.dump().await?;
|
||||
|
||||
use std::sync::Mutex;
|
||||
|
||||
@@ -994,8 +997,8 @@ mod tests {
|
||||
///
|
||||
/// This test contains a particular data set, see disk_btree_test_data.rs
|
||||
///
|
||||
#[test]
|
||||
fn particular_data() -> Result<()> {
|
||||
#[tokio::test]
|
||||
async fn particular_data() -> Result<()> {
|
||||
// Build a tree from it
|
||||
let mut disk = TestDisk::new();
|
||||
let mut writer = DiskBtreeBuilder::<_, 26>::new(&mut disk);
|
||||
@@ -1022,7 +1025,7 @@ mod tests {
|
||||
})?;
|
||||
assert_eq!(count, disk_btree_test_data::TEST_DATA.len());
|
||||
|
||||
reader.dump()?;
|
||||
reader.dump().await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -328,7 +328,7 @@ fn to_io_error(e: anyhow::Error, context: &str) -> io::Error {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::tenant::blob_io::{BlobCursor, BlobWriter};
|
||||
use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::block_io::BlockCursor;
|
||||
use rand::{seq::SliceRandom, thread_rng, RngCore};
|
||||
use std::fs;
|
||||
@@ -420,7 +420,7 @@ mod tests {
|
||||
blobs.push((pos, data));
|
||||
}
|
||||
|
||||
let mut cursor = BlockCursor::new(&file);
|
||||
let cursor = BlockCursor::new(&file);
|
||||
for (pos, expected) in blobs {
|
||||
let actual = cursor.read_blob(pos)?;
|
||||
assert_eq!(actual, expected);
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
//! Other read methods are less critical but still impact performance of background tasks.
|
||||
//!
|
||||
//! This data structure relies on a persistent/immutable binary search tree. See the
|
||||
//! following lecture for an introduction https://www.youtube.com/watch?v=WqCWghETNDc&t=581s
|
||||
//! following lecture for an introduction <https://www.youtube.com/watch?v=WqCWghETNDc&t=581s>
|
||||
//! Summary: A persistent/immutable BST (and persistent data structures in general) allows
|
||||
//! you to modify the tree in such a way that each modification creates a new "version"
|
||||
//! of the tree. When you modify it, you get a new version, but all previous versions are
|
||||
@@ -40,7 +40,7 @@
|
||||
//! afterwards. We can add layers as long as they have larger LSNs than any previous layer in
|
||||
//! the map, but if we need to remove a layer, or insert anything with an older LSN, we need
|
||||
//! to throw away most of the persistent BST and build a new one, starting from the oldest
|
||||
//! LSN. See `LayerMap::flush_updates()`.
|
||||
//! LSN. See [`LayerMap::flush_updates()`].
|
||||
//!
|
||||
|
||||
mod historic_layer_coverage;
|
||||
@@ -626,17 +626,17 @@ impl LayerMap {
|
||||
|
||||
/// debugging function to print out the contents of the layer map
|
||||
#[allow(unused)]
|
||||
pub fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
println!("Begin dump LayerMap");
|
||||
|
||||
println!("open_layer:");
|
||||
if let Some(open_layer) = &self.open_layer {
|
||||
open_layer.dump(verbose, ctx)?;
|
||||
open_layer.dump(verbose, ctx).await?;
|
||||
}
|
||||
|
||||
println!("frozen_layers:");
|
||||
for frozen_layer in self.frozen_layers.iter() {
|
||||
frozen_layer.dump(verbose, ctx)?;
|
||||
frozen_layer.dump(verbose, ctx).await?;
|
||||
}
|
||||
|
||||
println!("historic_layers:");
|
||||
|
||||
@@ -122,8 +122,7 @@ impl<Value: Clone> HistoricLayerCoverage<Value> {
|
||||
self.head = self
|
||||
.historic
|
||||
.iter()
|
||||
.rev()
|
||||
.next()
|
||||
.next_back()
|
||||
.map(|(_, v)| v.clone())
|
||||
.unwrap_or_default();
|
||||
}
|
||||
@@ -412,7 +411,7 @@ fn test_persistent_overlapping() {
|
||||
/// still be more critical.
|
||||
///
|
||||
/// See this for more on persistent and retroactive techniques:
|
||||
/// https://www.youtube.com/watch?v=WqCWghETNDc&t=581s
|
||||
/// <https://www.youtube.com/watch?v=WqCWghETNDc&t=581s>
|
||||
pub struct BufferedHistoricLayerCoverage<Value> {
|
||||
/// A persistent layer map that we rebuild when we need to retroactively update
|
||||
historic_coverage: HistoricLayerCoverage<Value>,
|
||||
|
||||
@@ -2,7 +2,7 @@ use std::ops::Range;
|
||||
|
||||
// NOTE the `im` crate has 20x more downloads and also has
|
||||
// persistent/immutable BTree. But it's bugged so rpds is a
|
||||
// better choice https://github.com/neondatabase/neon/issues/3395
|
||||
// better choice <https://github.com/neondatabase/neon/issues/3395>
|
||||
use rpds::RedBlackTreeMapSync;
|
||||
|
||||
/// Data structure that can efficiently:
|
||||
@@ -11,7 +11,7 @@ use rpds::RedBlackTreeMapSync;
|
||||
/// - insert layers in non-decreasing lsn.start order
|
||||
///
|
||||
/// For a detailed explanation and justification of this approach, see:
|
||||
/// https://neon.tech/blog/persistent-structures-in-neons-wal-indexing
|
||||
/// <https://neon.tech/blog/persistent-structures-in-neons-wal-indexing>
|
||||
///
|
||||
/// NOTE The struct is parameterized over Value for easier
|
||||
/// testing, but in practice it's some sort of layer.
|
||||
@@ -113,8 +113,7 @@ impl<Value: Clone> LayerCoverage<Value> {
|
||||
pub fn query(&self, key: i128) -> Option<Value> {
|
||||
self.nodes
|
||||
.range(..=key)
|
||||
.rev()
|
||||
.next()?
|
||||
.next_back()?
|
||||
.1
|
||||
.as_ref()
|
||||
.map(|(_, v)| v.clone())
|
||||
|
||||
@@ -24,7 +24,7 @@
|
||||
//! Currently, this is not used in the system. Future refactors will ensure
|
||||
//! the storage state will be recorded in this file, and the system can be
|
||||
//! recovered from this file. This is tracked in
|
||||
//! https://github.com/neondatabase/neon/issues/4418
|
||||
//! <https://github.com/neondatabase/neon/issues/4418>
|
||||
|
||||
use std::io::{self, Read, Write};
|
||||
|
||||
|
||||
@@ -1,16 +1,19 @@
|
||||
//! Every image of a certain timeline from [`crate::tenant::Tenant`]
|
||||
//! has a metadata that needs to be stored persistently.
|
||||
//!
|
||||
//! Later, the file gets is used in [`crate::remote_storage::storage_sync`] as a part of
|
||||
//! Later, the file gets used in [`remote_timeline_client`] as a part of
|
||||
//! external storage import and export operations.
|
||||
//!
|
||||
//! The module contains all structs and related helper methods related to timeline metadata.
|
||||
//!
|
||||
//! [`remote_timeline_client`]: super::remote_timeline_client
|
||||
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::io::Write;
|
||||
use std::io::{self, Write};
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use thiserror::Error;
|
||||
use tracing::info_span;
|
||||
use utils::bin_ser::SerializeError;
|
||||
use utils::{
|
||||
@@ -265,24 +268,24 @@ pub fn save_metadata(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum LoadMetadataError {
|
||||
#[error(transparent)]
|
||||
Read(#[from] io::Error),
|
||||
|
||||
#[error(transparent)]
|
||||
Decode(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
pub fn load_metadata(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
) -> anyhow::Result<TimelineMetadata> {
|
||||
) -> Result<TimelineMetadata, LoadMetadataError> {
|
||||
let metadata_path = conf.metadata_path(tenant_id, timeline_id);
|
||||
let metadata_bytes = std::fs::read(&metadata_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to read metadata bytes from path {}",
|
||||
metadata_path.display()
|
||||
)
|
||||
})?;
|
||||
TimelineMetadata::from_bytes(&metadata_bytes).with_context(|| {
|
||||
format!(
|
||||
"Failed to parse metadata bytes from path {}",
|
||||
metadata_path.display()
|
||||
)
|
||||
})
|
||||
let metadata_bytes = std::fs::read(metadata_path)?;
|
||||
|
||||
Ok(TimelineMetadata::from_bytes(&metadata_bytes)?)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -26,6 +26,8 @@ use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};
|
||||
use utils::fs_ext::PathExt;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use super::timeline::delete::DeleteTimelineFlow;
|
||||
|
||||
/// The tenants known to the pageserver.
|
||||
/// The enum variants are used to distinguish the different states that the pageserver can be in.
|
||||
enum TenantsMap {
|
||||
@@ -233,11 +235,17 @@ pub fn schedule_local_tenant_processing(
|
||||
/// That could be easily misinterpreted by control plane, the consumer of the
|
||||
/// management API. For example, it could attach the tenant on a different pageserver.
|
||||
/// We would then be in split-brain once this pageserver restarts.
|
||||
#[instrument]
|
||||
#[instrument(skip_all)]
|
||||
pub async fn shutdown_all_tenants() {
|
||||
shutdown_all_tenants0(&TENANTS).await
|
||||
}
|
||||
|
||||
async fn shutdown_all_tenants0(tenants: &tokio::sync::RwLock<TenantsMap>) {
|
||||
use utils::completion;
|
||||
|
||||
// Prevent new tenants from being created.
|
||||
let tenants_to_shut_down = {
|
||||
let mut m = TENANTS.write().await;
|
||||
let mut m = tenants.write().await;
|
||||
match &mut *m {
|
||||
TenantsMap::Initializing => {
|
||||
*m = TenantsMap::ShuttingDown(HashMap::default());
|
||||
@@ -262,14 +270,41 @@ pub async fn shutdown_all_tenants() {
|
||||
for (tenant_id, tenant) in tenants_to_shut_down {
|
||||
join_set.spawn(
|
||||
async move {
|
||||
let freeze_and_flush = true;
|
||||
// ordering shouldn't matter for this, either we store true right away or never
|
||||
let ordering = std::sync::atomic::Ordering::Relaxed;
|
||||
let joined_other = std::sync::atomic::AtomicBool::new(false);
|
||||
|
||||
match tenant.shutdown(freeze_and_flush).await {
|
||||
Ok(()) => debug!("tenant successfully stopped"),
|
||||
Err(super::ShutdownError::AlreadyStopping) => {
|
||||
warn!("tenant was already shutting down")
|
||||
let mut shutdown = std::pin::pin!(async {
|
||||
let freeze_and_flush = true;
|
||||
|
||||
let res = {
|
||||
let (_guard, shutdown_progress) = completion::channel();
|
||||
tenant.shutdown(shutdown_progress, freeze_and_flush).await
|
||||
};
|
||||
|
||||
if let Err(other_progress) = res {
|
||||
// join the another shutdown in progress
|
||||
joined_other.store(true, ordering);
|
||||
other_progress.wait().await;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// in practice we might not have a lot time to go, since systemd is going to
|
||||
// SIGKILL us at 10s, but we can try. delete tenant might take a while, so put out
|
||||
// a warning.
|
||||
let warning = std::time::Duration::from_secs(5);
|
||||
let mut warning = std::pin::pin!(tokio::time::sleep(warning));
|
||||
|
||||
tokio::select! {
|
||||
_ = &mut shutdown => {},
|
||||
_ = &mut warning => {
|
||||
let joined_other = joined_other.load(ordering);
|
||||
warn!(%joined_other, "waiting for the shutdown to complete");
|
||||
shutdown.await;
|
||||
}
|
||||
};
|
||||
|
||||
debug!("tenant successfully stopped");
|
||||
}
|
||||
.instrument(info_span!("shutdown", %tenant_id)),
|
||||
);
|
||||
@@ -388,12 +423,10 @@ pub enum DeleteTimelineError {
|
||||
pub async fn delete_timeline(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
ctx: &RequestContext,
|
||||
_ctx: &RequestContext,
|
||||
) -> Result<(), DeleteTimelineError> {
|
||||
let tenant = get_tenant(tenant_id, true).await?;
|
||||
tenant
|
||||
.prepare_and_schedule_delete_timeline(timeline_id, ctx)
|
||||
.await?;
|
||||
DeleteTimelineFlow::run(&tenant, timeline_id).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -413,6 +446,15 @@ pub async fn detach_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
detach_ignored: bool,
|
||||
) -> Result<(), TenantStateError> {
|
||||
detach_tenant0(conf, &TENANTS, tenant_id, detach_ignored).await
|
||||
}
|
||||
|
||||
async fn detach_tenant0(
|
||||
conf: &'static PageServerConf,
|
||||
tenants: &tokio::sync::RwLock<TenantsMap>,
|
||||
tenant_id: TenantId,
|
||||
detach_ignored: bool,
|
||||
) -> Result<(), TenantStateError> {
|
||||
let local_files_cleanup_operation = |tenant_id_to_clean| async move {
|
||||
let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
|
||||
@@ -425,7 +467,8 @@ pub async fn detach_tenant(
|
||||
};
|
||||
|
||||
let removal_result =
|
||||
remove_tenant_from_memory(tenant_id, local_files_cleanup_operation(tenant_id)).await;
|
||||
remove_tenant_from_memory(tenants, tenant_id, local_files_cleanup_operation(tenant_id))
|
||||
.await;
|
||||
|
||||
// Ignored tenants are not present in memory and will bail the removal from memory operation.
|
||||
// Before returning the error, check for ignored tenant removal case — we only need to clean its local files then.
|
||||
@@ -472,7 +515,15 @@ pub async fn ignore_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<(), TenantStateError> {
|
||||
remove_tenant_from_memory(tenant_id, async {
|
||||
ignore_tenant0(conf, &TENANTS, tenant_id).await
|
||||
}
|
||||
|
||||
async fn ignore_tenant0(
|
||||
conf: &'static PageServerConf,
|
||||
tenants: &tokio::sync::RwLock<TenantsMap>,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<(), TenantStateError> {
|
||||
remove_tenant_from_memory(tenants, tenant_id, async {
|
||||
let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_id);
|
||||
fs::File::create(&ignore_mark_file)
|
||||
.await
|
||||
@@ -597,18 +648,21 @@ where
|
||||
/// If the cleanup fails, tenant will stay in memory in [`TenantState::Broken`] state, and another removal
|
||||
/// operation would be needed to remove it.
|
||||
async fn remove_tenant_from_memory<V, F>(
|
||||
tenants: &tokio::sync::RwLock<TenantsMap>,
|
||||
tenant_id: TenantId,
|
||||
tenant_cleanup: F,
|
||||
) -> Result<V, TenantStateError>
|
||||
where
|
||||
F: std::future::Future<Output = anyhow::Result<V>>,
|
||||
{
|
||||
use utils::completion;
|
||||
|
||||
// It's important to keep the tenant in memory after the final cleanup, to avoid cleanup races.
|
||||
// The exclusive lock here ensures we don't miss the tenant state updates before trying another removal.
|
||||
// tenant-wde cleanup operations may take some time (removing the entire tenant directory), we want to
|
||||
// avoid holding the lock for the entire process.
|
||||
let tenant = {
|
||||
TENANTS
|
||||
tenants
|
||||
.write()
|
||||
.await
|
||||
.get(&tenant_id)
|
||||
@@ -616,14 +670,20 @@ where
|
||||
.ok_or(TenantStateError::NotFound(tenant_id))?
|
||||
};
|
||||
|
||||
// allow pageserver shutdown to await for our completion
|
||||
let (_guard, progress) = completion::channel();
|
||||
|
||||
// whenever we remove a tenant from memory, we don't want to flush and wait for upload
|
||||
let freeze_and_flush = false;
|
||||
|
||||
// shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
|
||||
// that we can continue safely to cleanup.
|
||||
match tenant.shutdown(freeze_and_flush).await {
|
||||
match tenant.shutdown(progress, freeze_and_flush).await {
|
||||
Ok(()) => {}
|
||||
Err(super::ShutdownError::AlreadyStopping) => {
|
||||
return Err(TenantStateError::IsStopping(tenant_id))
|
||||
Err(_other) => {
|
||||
// if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
|
||||
// wait for it but return an error right away because these are distinct requests.
|
||||
return Err(TenantStateError::IsStopping(tenant_id));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -632,14 +692,14 @@ where
|
||||
.with_context(|| format!("Failed to run cleanup for tenant {tenant_id}"))
|
||||
{
|
||||
Ok(hook_value) => {
|
||||
let mut tenants_accessor = TENANTS.write().await;
|
||||
let mut tenants_accessor = tenants.write().await;
|
||||
if tenants_accessor.remove(&tenant_id).is_none() {
|
||||
warn!("Tenant {tenant_id} got removed from memory before operation finished");
|
||||
}
|
||||
Ok(hook_value)
|
||||
}
|
||||
Err(e) => {
|
||||
let tenants_accessor = TENANTS.read().await;
|
||||
let tenants_accessor = tenants.read().await;
|
||||
match tenants_accessor.get(&tenant_id) {
|
||||
Some(tenant) => {
|
||||
tenant.set_broken(e.to_string()).await;
|
||||
@@ -708,51 +768,108 @@ pub async fn immediate_gc(
|
||||
Ok(wait_task_done)
|
||||
}
|
||||
|
||||
pub async fn immediate_compact(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<tokio::sync::oneshot::Receiver<anyhow::Result<()>>, ApiError> {
|
||||
let guard = TENANTS.read().await;
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use tracing::{info_span, Instrument};
|
||||
|
||||
let tenant = guard
|
||||
.get(&tenant_id)
|
||||
.map(Arc::clone)
|
||||
.with_context(|| format!("tenant {tenant_id}"))
|
||||
.map_err(|e| ApiError::NotFound(e.into()))?;
|
||||
use super::{super::harness::TenantHarness, TenantsMap};
|
||||
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(|e| ApiError::NotFound(e.into()))?;
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn shutdown_joins_remove_tenant_from_memory() {
|
||||
// the test is a bit ugly with the lockstep together with spawned tasks. the aim is to make
|
||||
// sure `shutdown_all_tenants0` per-tenant processing joins in any active
|
||||
// remove_tenant_from_memory calls, which is enforced by making the operation last until
|
||||
// we've ran `shutdown_all_tenants0` for a long time.
|
||||
|
||||
// Run in task_mgr to avoid race with tenant_detach operation
|
||||
let ctx = ctx.detached_child(TaskKind::Compaction, DownloadBehavior::Download);
|
||||
let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
|
||||
task_mgr::spawn(
|
||||
&tokio::runtime::Handle::current(),
|
||||
TaskKind::Compaction,
|
||||
Some(tenant_id),
|
||||
Some(timeline_id),
|
||||
&format!(
|
||||
"timeline_compact_handler compaction run for tenant {tenant_id} timeline {timeline_id}"
|
||||
),
|
||||
false,
|
||||
async move {
|
||||
let result = timeline
|
||||
.compact(&ctx)
|
||||
.instrument(info_span!("manual_compact", %tenant_id, %timeline_id))
|
||||
.await;
|
||||
let (t, _ctx) = TenantHarness::create("shutdown_joins_detach")
|
||||
.unwrap()
|
||||
.load()
|
||||
.await;
|
||||
|
||||
match task_done.send(result) {
|
||||
Ok(_) => (),
|
||||
Err(result) => error!("failed to send compaction result: {result:?}"),
|
||||
}
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
// harness loads it to active, which is forced and nothing is running on the tenant
|
||||
|
||||
// drop the guard until after we've spawned the task so that timeline shutdown will wait for the task
|
||||
drop(guard);
|
||||
let id = t.tenant_id();
|
||||
|
||||
Ok(wait_task_done)
|
||||
// tenant harness configures the logging and we cannot escape it
|
||||
let _e = info_span!("testing", tenant_id = %id).entered();
|
||||
|
||||
let tenants = HashMap::from([(id, t.clone())]);
|
||||
let tenants = Arc::new(tokio::sync::RwLock::new(TenantsMap::Open(tenants)));
|
||||
|
||||
let (until_cleanup_completed, can_complete_cleanup) = utils::completion::channel();
|
||||
let (until_cleanup_started, cleanup_started) = utils::completion::channel();
|
||||
|
||||
// start a "detaching operation", which will take a while, until can_complete_cleanup
|
||||
let cleanup_task = {
|
||||
let jh = tokio::spawn({
|
||||
let tenants = tenants.clone();
|
||||
async move {
|
||||
let cleanup = async move {
|
||||
drop(until_cleanup_started);
|
||||
can_complete_cleanup.wait().await;
|
||||
anyhow::Ok(())
|
||||
};
|
||||
super::remove_tenant_from_memory(&tenants, id, cleanup).await
|
||||
}
|
||||
.instrument(info_span!("foobar", tenant_id = %id))
|
||||
});
|
||||
|
||||
// now the long cleanup should be in place, with the stopping state
|
||||
cleanup_started.wait().await;
|
||||
jh
|
||||
};
|
||||
|
||||
let mut cleanup_progress = std::pin::pin!(t
|
||||
.shutdown(utils::completion::Barrier::default(), false)
|
||||
.await
|
||||
.unwrap_err()
|
||||
.wait());
|
||||
|
||||
let mut shutdown_task = {
|
||||
let (until_shutdown_started, shutdown_started) = utils::completion::channel();
|
||||
|
||||
let shutdown_task = tokio::spawn(async move {
|
||||
drop(until_shutdown_started);
|
||||
super::shutdown_all_tenants0(&tenants).await;
|
||||
});
|
||||
|
||||
shutdown_started.wait().await;
|
||||
shutdown_task
|
||||
};
|
||||
|
||||
// if the joining in is removed from shutdown_all_tenants0, the shutdown_task should always
|
||||
// get to complete within timeout and fail the test. it is expected to continue awaiting
|
||||
// until completion or SIGKILL during normal shutdown.
|
||||
//
|
||||
// the timeout is long to cover anything that shutdown_task could be doing, but it is
|
||||
// handled instantly because we use tokio's time pausing in this test. 100s is much more than
|
||||
// what we get from systemd on shutdown (10s).
|
||||
let long_time = std::time::Duration::from_secs(100);
|
||||
tokio::select! {
|
||||
_ = &mut shutdown_task => unreachable!("shutdown must continue, until_cleanup_completed is not dropped"),
|
||||
_ = &mut cleanup_progress => unreachable!("cleanup progress must continue, until_cleanup_completed is not dropped"),
|
||||
_ = tokio::time::sleep(long_time) => {},
|
||||
}
|
||||
|
||||
// allow the remove_tenant_from_memory and thus eventually the shutdown to continue
|
||||
drop(until_cleanup_completed);
|
||||
|
||||
let (je, ()) = tokio::join!(shutdown_task, cleanup_progress);
|
||||
je.expect("Tenant::shutdown shutdown not have panicked");
|
||||
cleanup_task
|
||||
.await
|
||||
.expect("no panicking")
|
||||
.expect("remove_tenant_from_memory failed");
|
||||
|
||||
futures::future::poll_immediate(
|
||||
t.shutdown(utils::completion::Barrier::default(), false)
|
||||
.await
|
||||
.unwrap_err()
|
||||
.wait(),
|
||||
)
|
||||
.await
|
||||
.expect("the stopping progress must still be complete");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -135,7 +135,7 @@
|
||||
//! - Initiate upload queue with that [`IndexPart`].
|
||||
//! - Reschedule all lost operations by comparing the local filesystem state
|
||||
//! and remote state as per [`IndexPart`]. This is done in
|
||||
//! [`Timeline::timeline_init_and_sync`] and [`Timeline::reconcile_with_remote`].
|
||||
//! [`Tenant::timeline_init_and_sync`] and [`Timeline::reconcile_with_remote`].
|
||||
//!
|
||||
//! Note that if we crash during file deletion between the index update
|
||||
//! that removes the file from the list of files, and deleting the remote file,
|
||||
@@ -163,8 +163,8 @@
|
||||
//! - download their remote [`IndexPart`]s
|
||||
//! - create `Timeline` struct and a `RemoteTimelineClient`
|
||||
//! - initialize the client's upload queue with its `IndexPart`
|
||||
//! - create [`RemoteLayer`] instances for layers that are referenced by `IndexPart`
|
||||
//! but not present locally
|
||||
//! - create [`RemoteLayer`](super::storage_layer::RemoteLayer) instances
|
||||
//! for layers that are referenced by `IndexPart` but not present locally
|
||||
//! - schedule uploads for layers that are only present locally.
|
||||
//! - if the remote `IndexPart`'s metadata was newer than the metadata in
|
||||
//! the local filesystem, write the remote metadata to the local filesystem
|
||||
@@ -198,6 +198,8 @@
|
||||
//! in remote storage.
|
||||
//! But note that we don't test any of this right now.
|
||||
//!
|
||||
//! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
|
||||
//! [`Timeline::reconcile_with_remote`]: super::Timeline::reconcile_with_remote
|
||||
|
||||
mod delete;
|
||||
mod download;
|
||||
@@ -512,7 +514,7 @@ impl RemoteTimelineClient {
|
||||
/// updated metadata.
|
||||
///
|
||||
/// The upload will be added to the queue immediately, but it
|
||||
/// won't be performed until all previosuly scheduled layer file
|
||||
/// won't be performed until all previously scheduled layer file
|
||||
/// upload operations have completed successfully. This is to
|
||||
/// ensure that when the index file claims that layers X, Y and Z
|
||||
/// exist in remote storage, they really do. To wait for the upload
|
||||
@@ -623,7 +625,7 @@ impl RemoteTimelineClient {
|
||||
/// Note: This schedules an index file upload before the deletions. The
|
||||
/// deletion won't actually be performed, until any previously scheduled
|
||||
/// upload operations, and the index file upload, have completed
|
||||
/// succesfully.
|
||||
/// successfully.
|
||||
pub fn schedule_layer_file_deletion(
|
||||
self: &Arc<Self>,
|
||||
names: &[LayerFileName],
|
||||
@@ -825,7 +827,7 @@ impl RemoteTimelineClient {
|
||||
)
|
||||
};
|
||||
|
||||
receiver.changed().await?;
|
||||
receiver.changed().await.context("upload queue shut down")?;
|
||||
|
||||
// Do not delete index part yet, it is needed for possible retry. If we remove it first
|
||||
// and retry will arrive to different pageserver there wont be any traces of it on remote storage
|
||||
@@ -840,23 +842,37 @@ impl RemoteTimelineClient {
|
||||
let remaining: Vec<RemotePath> = remaining
|
||||
.into_iter()
|
||||
.filter(|p| p.object_name() != Some(IndexPart::FILE_NAME))
|
||||
.inspect(|path| {
|
||||
if let Some(name) = path.object_name() {
|
||||
info!(%name, "deleting a file not referenced from index_part.json");
|
||||
} else {
|
||||
warn!(%path, "deleting a nameless or non-utf8 object not referenced from index_part.json");
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
if !remaining.is_empty() {
|
||||
warn!(
|
||||
"Found {} files not bound to index_file.json, proceeding with their deletion",
|
||||
remaining.len()
|
||||
);
|
||||
warn!("About to remove {} files", remaining.len());
|
||||
self.storage_impl.delete_objects(&remaining).await?;
|
||||
}
|
||||
|
||||
fail::fail_point!("timeline-delete-before-index-delete", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: timeline-delete-before-index-delete"
|
||||
))?
|
||||
});
|
||||
|
||||
let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));
|
||||
|
||||
debug!("deleting index part");
|
||||
self.storage_impl.delete(&index_file_path).await?;
|
||||
|
||||
info!(deletions_queued, "done deleting, including index_part.json");
|
||||
fail::fail_point!("timeline-delete-after-index-delete", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: timeline-delete-after-index-delete"
|
||||
))?
|
||||
});
|
||||
|
||||
info!(prefix=%timeline_storage_path, referenced=deletions_queued, not_referenced=%remaining.len(), "done deleting in timeline prefix, including index_part.json");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1101,7 +1117,7 @@ impl RemoteTimelineClient {
|
||||
debug!("remote task {} completed successfully", task.op);
|
||||
}
|
||||
|
||||
// The task has completed succesfully. Remove it from the in-progress list.
|
||||
// The task has completed successfully. Remove it from the in-progress list.
|
||||
{
|
||||
let mut upload_queue_guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = match upload_queue_guard.deref_mut() {
|
||||
|
||||
@@ -223,6 +223,45 @@ mod tests {
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn v2_indexpart_is_parsed_with_deleted_at() {
|
||||
let example = r#"{
|
||||
"version":2,
|
||||
"timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
|
||||
"missing_layers":["This shouldn't fail deserialization"],
|
||||
"layer_metadata":{
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
|
||||
},
|
||||
"disk_consistent_lsn":"0/16960E8",
|
||||
"metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
|
||||
"deleted_at": "2023-07-31T09:00:00.123"
|
||||
}"#;
|
||||
|
||||
let expected = IndexPart {
|
||||
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
|
||||
version: 2,
|
||||
timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||
file_size: 25600000,
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: 9007199254741001,
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
|
||||
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
|
||||
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
|
||||
};
|
||||
|
||||
let part = serde_json::from_str::<IndexPart>(example).unwrap();
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn empty_layers_are_parsed() {
|
||||
let empty_layers_json = r#"{
|
||||
|
||||
@@ -62,12 +62,11 @@ pub(super) async fn upload_timeline_layer<'a>(
|
||||
let source_file = match source_file_res {
|
||||
Ok(source_file) => source_file,
|
||||
Err(e) if e.kind() == ErrorKind::NotFound => {
|
||||
// In some situations we might run into the underlying file being deleted by
|
||||
// e.g. compaction before the uploader gets to it. In that instance, we don't
|
||||
// want to retry the error: a deleted file won't come back. In theory, the
|
||||
// file might not have been written in the first place, which also indicates
|
||||
// a bug. Still log the situation so that we can keep an eye on it.
|
||||
// See https://github.com/neondatabase/neon/issues/4526
|
||||
// If we encounter this arm, it wasn't intended, but it's also not
|
||||
// a big problem, if it's because the file was deleted before an
|
||||
// upload. However, a nonexistent file can also be indicative of
|
||||
// something worse, like when a file is scheduled for upload before
|
||||
// it has been written to disk yet.
|
||||
info!(path = %source_path.display(), "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more.");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
@@ -110,11 +110,11 @@ pub struct TimelineInputs {
|
||||
///
|
||||
/// Tenant size does not consider the latest state, but only the state until next_gc_cutoff, which
|
||||
/// is updated on-demand, during the start of this calculation and separate from the
|
||||
/// [`Timeline::latest_gc_cutoff`].
|
||||
/// [`TimelineInputs::latest_gc_cutoff`].
|
||||
///
|
||||
/// For timelines in general:
|
||||
///
|
||||
/// ```ignore
|
||||
/// ```text
|
||||
/// 0-----|---------|----|------------| · · · · · |·> lsn
|
||||
/// initdb_lsn branchpoints* next_gc_cutoff latest
|
||||
/// ```
|
||||
|
||||
@@ -11,10 +11,7 @@ pub(crate) static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy<MultiNameExtractor<
|
||||
#[cfg(debug_assertions)]
|
||||
#[track_caller]
|
||||
pub(crate) fn debug_assert_current_span_has_tenant_id() {
|
||||
if let Err(missing) = check_fields_present([&*TENANT_ID_EXTRACTOR]) {
|
||||
panic!(
|
||||
"missing extractors: {:?}",
|
||||
missing.into_iter().map(|e| e.name()).collect::<Vec<_>>()
|
||||
)
|
||||
if let Err(missing) = check_fields_present!([&*TENANT_ID_EXTRACTOR]) {
|
||||
panic!("missing extractors: {missing:?}")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,7 +9,7 @@ mod remote_layer;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::repository::Key;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::Result;
|
||||
@@ -34,7 +34,7 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
pub use delta_layer::{DeltaLayer, DeltaLayerWriter};
|
||||
pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef};
|
||||
pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
|
||||
pub use image_layer::{ImageLayer, ImageLayerWriter};
|
||||
pub use inmemory_layer::InMemoryLayer;
|
||||
@@ -162,6 +162,9 @@ impl LayerAccessStats {
|
||||
/// The caller is responsible for recording a residence event
|
||||
/// using [`record_residence_event`] before calling `latest_activity`.
|
||||
/// If they don't, [`latest_activity`] will return `None`.
|
||||
///
|
||||
/// [`record_residence_event`]: Self::record_residence_event
|
||||
/// [`latest_activity`]: Self::latest_activity
|
||||
pub(crate) fn empty_will_record_residence_event_later() -> Self {
|
||||
LayerAccessStats(Mutex::default())
|
||||
}
|
||||
@@ -169,6 +172,9 @@ impl LayerAccessStats {
|
||||
/// Create an empty stats object and record a [`LayerLoad`] event with the given residence status.
|
||||
///
|
||||
/// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
|
||||
///
|
||||
/// [`LayerLoad`]: LayerResidenceEventReason::LayerLoad
|
||||
/// [`record_residence_event`]: Self::record_residence_event
|
||||
pub(crate) fn for_loading_layer(
|
||||
layer_map_lock_held_witness: &LayerManager,
|
||||
status: LayerResidenceStatus,
|
||||
@@ -187,6 +193,8 @@ impl LayerAccessStats {
|
||||
/// The `new_status` is not recorded in `self`.
|
||||
///
|
||||
/// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
|
||||
///
|
||||
/// [`record_residence_event`]: Self::record_residence_event
|
||||
pub(crate) fn clone_for_residence_change(
|
||||
&self,
|
||||
layer_map_lock_held_witness: &LayerManager,
|
||||
@@ -294,11 +302,13 @@ impl LayerAccessStats {
|
||||
/// implementation error. This function logs a rate-limited warning in that case.
|
||||
///
|
||||
/// TODO: use type system to avoid the need for `fallback`.
|
||||
/// The approach in https://github.com/neondatabase/neon/pull/3775
|
||||
/// The approach in <https://github.com/neondatabase/neon/pull/3775>
|
||||
/// could be used to enforce that a residence event is recorded
|
||||
/// before a layer is added to the layer map. We could also have
|
||||
/// a layer wrapper type that holds the LayerAccessStats, and ensure
|
||||
/// that that type can only be produced by inserting into the layer map.
|
||||
///
|
||||
/// [`record_residence_event`]: Self::record_residence_event
|
||||
pub(crate) fn latest_activity(&self) -> Option<SystemTime> {
|
||||
let locked = self.0.lock().unwrap();
|
||||
let inner = &locked.for_eviction_policy;
|
||||
@@ -323,12 +333,13 @@ impl LayerAccessStats {
|
||||
}
|
||||
|
||||
/// Supertrait of the [`Layer`] trait that captures the bare minimum interface
|
||||
/// required by [`LayerMap`].
|
||||
/// required by [`LayerMap`](super::layer_map::LayerMap).
|
||||
///
|
||||
/// All layers should implement a minimal `std::fmt::Debug` without tenant or
|
||||
/// timeline names, because those are known in the context of which the layers
|
||||
/// are used in (timeline).
|
||||
pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync {
|
||||
#[async_trait::async_trait]
|
||||
pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
|
||||
/// Range of keys that this layer covers
|
||||
fn get_key_range(&self) -> Range<Key>;
|
||||
|
||||
@@ -358,7 +369,7 @@ pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync {
|
||||
/// is available. If this returns ValueReconstructResult::Continue, look up
|
||||
/// the predecessor layer and call again with the same 'reconstruct_data' to
|
||||
/// collect more data.
|
||||
fn get_value_reconstruct_data(
|
||||
async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
@@ -367,15 +378,9 @@ pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync {
|
||||
) -> Result<ValueReconstructResult>;
|
||||
|
||||
/// Dump summary of the contents of the layer to stdout
|
||||
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
|
||||
async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
|
||||
}
|
||||
|
||||
/// Returned by [`Layer::iter`]
|
||||
pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i + Send>;
|
||||
|
||||
/// Returned by [`Layer::key_iter`]
|
||||
pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i + Send>;
|
||||
|
||||
/// Get a layer descriptor from a layer.
|
||||
pub trait AsLayerDesc {
|
||||
/// Get the layer descriptor.
|
||||
@@ -416,15 +421,6 @@ pub trait PersistentLayer: Layer + AsLayerDesc {
|
||||
// `None` for `RemoteLayer`.
|
||||
fn local_path(&self) -> Option<PathBuf>;
|
||||
|
||||
/// Iterate through all keys and values stored in the layer
|
||||
fn iter(&self, ctx: &RequestContext) -> Result<LayerIter<'_>>;
|
||||
|
||||
/// Iterate through all keys stored in the layer. Returns key, lsn and value size
|
||||
/// It is used only for compaction and so is currently implemented only for DeltaLayer
|
||||
fn key_iter(&self, _ctx: &RequestContext) -> Result<LayerKeyIter<'_>> {
|
||||
panic!("Not implemented")
|
||||
}
|
||||
|
||||
/// Permanently remove this layer from disk.
|
||||
fn delete_resident_layer_file(&self) -> Result<()>;
|
||||
|
||||
@@ -432,6 +428,10 @@ pub trait PersistentLayer: Layer + AsLayerDesc {
|
||||
None
|
||||
}
|
||||
|
||||
fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
|
||||
None
|
||||
}
|
||||
|
||||
fn is_remote_layer(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
@@ -7,14 +7,18 @@
|
||||
//! must be page images or WAL records with the 'will_init' flag set, so that
|
||||
//! they can be replayed without referring to an older page version.
|
||||
//!
|
||||
//! The delta files are stored in timelines/<timeline_id> directory. Currently,
|
||||
//! The delta files are stored in `timelines/<timeline_id>` directory. Currently,
|
||||
//! there are no subdirectories, and each delta file is named like this:
|
||||
//!
|
||||
//! ```text
|
||||
//! <key start>-<key end>__<start LSN>-<end LSN>
|
||||
//! ```
|
||||
//!
|
||||
//! For example:
|
||||
//!
|
||||
//! ```text
|
||||
//! 000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051
|
||||
//! ```
|
||||
//!
|
||||
//! Every delta file consists of three parts: "summary", "index", and
|
||||
//! "values". The summary is a fixed size header at the beginning of the file,
|
||||
@@ -27,7 +31,7 @@ use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::page_cache::{PageReadGuard, PAGE_SZ};
|
||||
use crate::repository::{Key, Value, KEY_SIZE};
|
||||
use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
|
||||
use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
|
||||
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::storage_layer::{
|
||||
@@ -47,6 +51,7 @@ use std::io::{Seek, SeekFrom};
|
||||
use std::ops::Range;
|
||||
use std::os::unix::fs::FileExt;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::Arc;
|
||||
use tracing::*;
|
||||
|
||||
use utils::{
|
||||
@@ -56,8 +61,8 @@ use utils::{
|
||||
};
|
||||
|
||||
use super::{
|
||||
AsLayerDesc, DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerIter,
|
||||
LayerKeyIter, PathOrConf, PersistentLayerDesc,
|
||||
AsLayerDesc, DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, PathOrConf,
|
||||
PersistentLayerDesc,
|
||||
};
|
||||
|
||||
///
|
||||
@@ -184,7 +189,7 @@ pub struct DeltaLayer {
|
||||
|
||||
access_stats: LayerAccessStats,
|
||||
|
||||
inner: OnceCell<DeltaLayerInner>,
|
||||
inner: OnceCell<Arc<DeltaLayerInner>>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for DeltaLayer {
|
||||
@@ -218,9 +223,10 @@ impl std::fmt::Debug for DeltaLayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for DeltaLayer {
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
println!(
|
||||
"----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} size {} ----",
|
||||
self.desc.tenant_id,
|
||||
@@ -250,12 +256,12 @@ impl Layer for DeltaLayer {
|
||||
file,
|
||||
);
|
||||
|
||||
tree_reader.dump()?;
|
||||
tree_reader.dump().await?;
|
||||
|
||||
let mut cursor = file.block_cursor();
|
||||
let cursor = file.block_cursor();
|
||||
|
||||
// A subroutine to dump a single blob
|
||||
let mut dump_blob = |blob_ref: BlobRef| -> anyhow::Result<String> {
|
||||
let dump_blob = |blob_ref: BlobRef| -> anyhow::Result<String> {
|
||||
let buf = cursor.read_blob(blob_ref.pos())?;
|
||||
let val = Value::des(&buf)?;
|
||||
let desc = match val {
|
||||
@@ -295,7 +301,7 @@ impl Layer for DeltaLayer {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn get_value_reconstruct_data(
|
||||
async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
@@ -337,7 +343,7 @@ impl Layer for DeltaLayer {
|
||||
})?;
|
||||
|
||||
// Ok, 'offsets' now contains the offsets of all the entries we need to read
|
||||
let mut cursor = file.block_cursor();
|
||||
let cursor = file.block_cursor();
|
||||
let mut buf = Vec::new();
|
||||
for (entry_lsn, pos) in offsets {
|
||||
cursor.read_blob_into_buf(pos, &mut buf).with_context(|| {
|
||||
@@ -410,27 +416,14 @@ impl AsLayerDesc for DeltaLayer {
|
||||
}
|
||||
|
||||
impl PersistentLayer for DeltaLayer {
|
||||
fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
|
||||
Some(self)
|
||||
}
|
||||
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
Some(self.path())
|
||||
}
|
||||
|
||||
fn iter(&self, ctx: &RequestContext) -> Result<LayerIter<'_>> {
|
||||
let inner = self
|
||||
.load(LayerAccessKind::KeyIter, ctx)
|
||||
.context("load delta layer")?;
|
||||
Ok(match DeltaValueIter::new(inner) {
|
||||
Ok(iter) => Box::new(iter),
|
||||
Err(err) => Box::new(std::iter::once(Err(err))),
|
||||
})
|
||||
}
|
||||
|
||||
fn key_iter(&self, ctx: &RequestContext) -> Result<LayerKeyIter<'_>> {
|
||||
let inner = self.load(LayerAccessKind::KeyIter, ctx)?;
|
||||
Ok(Box::new(
|
||||
DeltaKeyIter::new(inner).context("Layer index is corrupted")?,
|
||||
))
|
||||
}
|
||||
|
||||
fn delete_resident_layer_file(&self) -> Result<()> {
|
||||
// delete underlying file
|
||||
fs::remove_file(self.path())?;
|
||||
@@ -500,7 +493,11 @@ impl DeltaLayer {
|
||||
/// Open the underlying file and read the metadata into memory, if it's
|
||||
/// not loaded already.
|
||||
///
|
||||
fn load(&self, access_kind: LayerAccessKind, ctx: &RequestContext) -> Result<&DeltaLayerInner> {
|
||||
fn load(
|
||||
&self,
|
||||
access_kind: LayerAccessKind,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<&Arc<DeltaLayerInner>> {
|
||||
self.access_stats
|
||||
.record_access(access_kind, ctx.task_kind());
|
||||
// Quick exit if already loaded
|
||||
@@ -509,7 +506,7 @@ impl DeltaLayer {
|
||||
.with_context(|| format!("Failed to load delta layer {}", self.path().display()))
|
||||
}
|
||||
|
||||
fn load_inner(&self) -> Result<DeltaLayerInner> {
|
||||
fn load_inner(&self) -> Result<Arc<DeltaLayerInner>> {
|
||||
let path = self.path();
|
||||
|
||||
let file = VirtualFile::open(&path)
|
||||
@@ -544,11 +541,11 @@ impl DeltaLayer {
|
||||
|
||||
debug!("loaded from {}", &path.display());
|
||||
|
||||
Ok(DeltaLayerInner {
|
||||
Ok(Arc::new(DeltaLayerInner {
|
||||
file,
|
||||
index_start_blk: actual_summary.index_start_blk,
|
||||
index_root_blk: actual_summary.index_root_blk,
|
||||
})
|
||||
}))
|
||||
}
|
||||
|
||||
/// Create a DeltaLayer struct representing an existing file on disk.
|
||||
@@ -613,6 +610,24 @@ impl DeltaLayer {
|
||||
&self.layer_name(),
|
||||
)
|
||||
}
|
||||
|
||||
/// Obtains all keys and value references stored in the layer
|
||||
///
|
||||
/// The value can be obtained via the [`ValueRef::load`] function.
|
||||
pub fn load_val_refs(&self, ctx: &RequestContext) -> Result<Vec<(Key, Lsn, ValueRef)>> {
|
||||
let inner = self
|
||||
.load(LayerAccessKind::KeyIter, ctx)
|
||||
.context("load delta layer")?;
|
||||
DeltaLayerInner::load_val_refs(inner).context("Layer index is corrupted")
|
||||
}
|
||||
|
||||
/// Loads all keys stored in the layer. Returns key, lsn and value size.
|
||||
pub fn load_keys(&self, ctx: &RequestContext) -> Result<Vec<(Key, Lsn, u64)>> {
|
||||
let inner = self
|
||||
.load(LayerAccessKind::KeyIter, ctx)
|
||||
.context("load delta layer keys")?;
|
||||
inner.load_keys().context("Layer index is corrupted")
|
||||
}
|
||||
}
|
||||
|
||||
/// A builder object for constructing a new delta layer.
|
||||
@@ -800,7 +815,7 @@ impl DeltaLayerWriterInner {
|
||||
///
|
||||
/// # Note
|
||||
///
|
||||
/// As described in https://github.com/neondatabase/neon/issues/2650, it's
|
||||
/// As described in <https://github.com/neondatabase/neon/issues/2650>, it's
|
||||
/// possible for the writer to drop before `finish` is actually called. So this
|
||||
/// could lead to odd temporary files in the directory, exhausting file system.
|
||||
/// This structure wraps `DeltaLayerWriterInner` and also contains `Drop`
|
||||
@@ -883,121 +898,41 @@ impl Drop for DeltaLayerWriter {
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Iterator over all key-value pairse stored in a delta layer
|
||||
///
|
||||
/// FIXME: This creates a Vector to hold the offsets of all key value pairs.
|
||||
/// That takes up quite a lot of memory. Should do this in a more streaming
|
||||
/// fashion.
|
||||
///
|
||||
struct DeltaValueIter<'a> {
|
||||
all_offsets: Vec<(DeltaKey, BlobRef)>,
|
||||
next_idx: usize,
|
||||
reader: BlockCursor<Adapter<'a>>,
|
||||
}
|
||||
|
||||
struct Adapter<'a>(&'a DeltaLayerInner);
|
||||
|
||||
impl<'a> BlockReader for Adapter<'a> {
|
||||
type BlockLease = PageReadGuard<'static>;
|
||||
|
||||
fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
|
||||
self.0.file.read_blk(blknum)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for DeltaValueIter<'a> {
|
||||
type Item = Result<(Key, Lsn, Value)>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.next_res().transpose()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> DeltaValueIter<'a> {
|
||||
fn new(inner: &'a DeltaLayerInner) -> Result<Self> {
|
||||
let file = &inner.file;
|
||||
impl DeltaLayerInner {
|
||||
fn load_val_refs(this: &Arc<DeltaLayerInner>) -> Result<Vec<(Key, Lsn, ValueRef)>> {
|
||||
let file = &this.file;
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
inner.index_start_blk,
|
||||
inner.index_root_blk,
|
||||
this.index_start_blk,
|
||||
this.index_root_blk,
|
||||
file,
|
||||
);
|
||||
|
||||
let mut all_offsets: Vec<(DeltaKey, BlobRef)> = Vec::new();
|
||||
let mut all_offsets = Vec::<(Key, Lsn, ValueRef)>::new();
|
||||
tree_reader.visit(
|
||||
&[0u8; DELTA_KEY_SIZE],
|
||||
VisitDirection::Forwards,
|
||||
|key, value| {
|
||||
all_offsets.push((DeltaKey::from_slice(key), BlobRef(value)));
|
||||
let delta_key = DeltaKey::from_slice(key);
|
||||
let val_ref = ValueRef {
|
||||
blob_ref: BlobRef(value),
|
||||
reader: BlockCursor::new(Adapter(this.clone())),
|
||||
};
|
||||
all_offsets.push((delta_key.key(), delta_key.lsn(), val_ref));
|
||||
true
|
||||
},
|
||||
)?;
|
||||
|
||||
let iter = DeltaValueIter {
|
||||
all_offsets,
|
||||
next_idx: 0,
|
||||
reader: BlockCursor::new(Adapter(inner)),
|
||||
};
|
||||
|
||||
Ok(iter)
|
||||
Ok(all_offsets)
|
||||
}
|
||||
|
||||
fn next_res(&mut self) -> Result<Option<(Key, Lsn, Value)>> {
|
||||
if self.next_idx < self.all_offsets.len() {
|
||||
let (delta_key, blob_ref) = &self.all_offsets[self.next_idx];
|
||||
|
||||
let key = delta_key.key();
|
||||
let lsn = delta_key.lsn();
|
||||
|
||||
let buf = self.reader.read_blob(blob_ref.pos())?;
|
||||
let val = Value::des(&buf)?;
|
||||
self.next_idx += 1;
|
||||
Ok(Some((key, lsn, val)))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
}
|
||||
///
|
||||
/// Iterator over all keys stored in a delta layer
|
||||
///
|
||||
/// FIXME: This creates a Vector to hold all keys.
|
||||
/// That takes up quite a lot of memory. Should do this in a more streaming
|
||||
/// fashion.
|
||||
///
|
||||
struct DeltaKeyIter {
|
||||
all_keys: Vec<(DeltaKey, u64)>,
|
||||
next_idx: usize,
|
||||
}
|
||||
|
||||
impl Iterator for DeltaKeyIter {
|
||||
type Item = (Key, Lsn, u64);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
if self.next_idx < self.all_keys.len() {
|
||||
let (delta_key, size) = &self.all_keys[self.next_idx];
|
||||
|
||||
let key = delta_key.key();
|
||||
let lsn = delta_key.lsn();
|
||||
|
||||
self.next_idx += 1;
|
||||
Some((key, lsn, *size))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> DeltaKeyIter {
|
||||
fn new(inner: &'a DeltaLayerInner) -> Result<Self> {
|
||||
let file = &inner.file;
|
||||
fn load_keys(&self) -> Result<Vec<(Key, Lsn, u64)>> {
|
||||
let file = &self.file;
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
inner.index_start_blk,
|
||||
inner.index_root_blk,
|
||||
self.index_start_blk,
|
||||
self.index_root_blk,
|
||||
file,
|
||||
);
|
||||
|
||||
let mut all_keys: Vec<(DeltaKey, u64)> = Vec::new();
|
||||
let mut all_keys: Vec<(Key, Lsn, u64)> = Vec::new();
|
||||
tree_reader.visit(
|
||||
&[0u8; DELTA_KEY_SIZE],
|
||||
VisitDirection::Forwards,
|
||||
@@ -1005,46 +940,48 @@ impl<'a> DeltaKeyIter {
|
||||
let delta_key = DeltaKey::from_slice(key);
|
||||
let pos = BlobRef(value).pos();
|
||||
if let Some(last) = all_keys.last_mut() {
|
||||
if last.0.key() == delta_key.key() {
|
||||
if last.0 == delta_key.key() {
|
||||
return true;
|
||||
} else {
|
||||
// subtract offset of new key BLOB and first blob of this key
|
||||
// to get total size if values associated with this key
|
||||
let first_pos = last.1;
|
||||
last.1 = pos - first_pos;
|
||||
let first_pos = last.2;
|
||||
last.2 = pos - first_pos;
|
||||
}
|
||||
}
|
||||
all_keys.push((delta_key, pos));
|
||||
all_keys.push((delta_key.key(), delta_key.lsn(), pos));
|
||||
true
|
||||
},
|
||||
)?;
|
||||
if let Some(last) = all_keys.last_mut() {
|
||||
// Last key occupies all space till end of layer
|
||||
last.1 = std::fs::metadata(&file.file.path)?.len() - last.1;
|
||||
last.2 = std::fs::metadata(&file.file.path)?.len() - last.2;
|
||||
}
|
||||
let iter = DeltaKeyIter {
|
||||
all_keys,
|
||||
next_idx: 0,
|
||||
};
|
||||
|
||||
Ok(iter)
|
||||
Ok(all_keys)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::DeltaKeyIter;
|
||||
use super::DeltaLayer;
|
||||
use super::DeltaValueIter;
|
||||
/// Reference to an on-disk value
|
||||
pub struct ValueRef {
|
||||
blob_ref: BlobRef,
|
||||
reader: BlockCursor<Adapter>,
|
||||
}
|
||||
|
||||
// We will soon need the iters to be send in the compaction code.
|
||||
// Cf https://github.com/neondatabase/neon/pull/4462#issuecomment-1587398883
|
||||
// Cf https://github.com/neondatabase/neon/issues/4471
|
||||
#[test]
|
||||
fn is_send() {
|
||||
fn assert_send<T: Send>() {}
|
||||
assert_send::<DeltaLayer>();
|
||||
assert_send::<DeltaValueIter>();
|
||||
assert_send::<DeltaKeyIter>();
|
||||
impl ValueRef {
|
||||
/// Loads the value from disk
|
||||
pub fn load(&self) -> Result<Value> {
|
||||
let buf = self.reader.read_blob(self.blob_ref.pos())?;
|
||||
let val = Value::des(&buf)?;
|
||||
Ok(val)
|
||||
}
|
||||
}
|
||||
|
||||
struct Adapter(Arc<DeltaLayerInner>);
|
||||
|
||||
impl BlockReader for Adapter {
|
||||
type BlockLease = PageReadGuard<'static>;
|
||||
|
||||
fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
|
||||
self.0.file.read_blk(blknum)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -57,8 +57,9 @@ impl Ord for DeltaFileName {
|
||||
|
||||
/// Represents the filename of a DeltaLayer
|
||||
///
|
||||
/// ```text
|
||||
/// <key start>-<key end>__<LSN start>-<LSN end>
|
||||
///
|
||||
/// ```
|
||||
impl DeltaFileName {
|
||||
///
|
||||
/// Parse a string as a delta file name. Returns None if the filename does not
|
||||
@@ -162,7 +163,9 @@ impl ImageFileName {
|
||||
///
|
||||
/// Represents the filename of an ImageLayer
|
||||
///
|
||||
/// ```text
|
||||
/// <key start>-<key end>__<LSN>
|
||||
/// ```
|
||||
impl ImageFileName {
|
||||
///
|
||||
/// Parse a string as an image file name. Returns None if the filename does not
|
||||
|
||||
@@ -7,11 +7,15 @@
|
||||
//! timelines/<timeline_id> directory. Currently, there are no
|
||||
//! subdirectories, and each image layer file is named like this:
|
||||
//!
|
||||
//! ```text
|
||||
//! <key start>-<key end>__<LSN>
|
||||
//! ```
|
||||
//!
|
||||
//! For example:
|
||||
//!
|
||||
//! ```text
|
||||
//! 000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568
|
||||
//! ```
|
||||
//!
|
||||
//! Every image layer file consists of three parts: "summary",
|
||||
//! "index", and "values". The summary is a fixed size header at the
|
||||
@@ -23,7 +27,7 @@ use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use crate::repository::{Key, KEY_SIZE};
|
||||
use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
|
||||
use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
|
||||
use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::storage_layer::{
|
||||
@@ -34,6 +38,7 @@ use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use bytes::Bytes;
|
||||
use hex;
|
||||
use once_cell::sync::OnceCell;
|
||||
use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -43,7 +48,6 @@ use std::io::{Seek, SeekFrom};
|
||||
use std::ops::Range;
|
||||
use std::os::unix::prelude::FileExt;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{RwLock, RwLockReadGuard};
|
||||
use tracing::*;
|
||||
|
||||
use utils::{
|
||||
@@ -53,9 +57,7 @@ use utils::{
|
||||
};
|
||||
|
||||
use super::filename::ImageFileName;
|
||||
use super::{
|
||||
AsLayerDesc, Layer, LayerAccessStatsReset, LayerIter, PathOrConf, PersistentLayerDesc,
|
||||
};
|
||||
use super::{AsLayerDesc, Layer, LayerAccessStatsReset, PathOrConf, PersistentLayerDesc};
|
||||
|
||||
///
|
||||
/// Header stored in the beginning of the file
|
||||
@@ -113,7 +115,7 @@ pub struct ImageLayer {
|
||||
|
||||
access_stats: LayerAccessStats,
|
||||
|
||||
inner: RwLock<ImageLayerInner>,
|
||||
inner: OnceCell<ImageLayerInner>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for ImageLayer {
|
||||
@@ -130,30 +132,27 @@ impl std::fmt::Debug for ImageLayer {
|
||||
}
|
||||
|
||||
pub struct ImageLayerInner {
|
||||
/// If false, the 'index' has not been loaded into memory yet.
|
||||
loaded: bool,
|
||||
|
||||
// values copied from summary
|
||||
index_start_blk: u32,
|
||||
index_root_blk: u32,
|
||||
|
||||
/// Reader object for reading blocks from the file. (None if not loaded yet)
|
||||
file: Option<FileBlockReader<VirtualFile>>,
|
||||
/// Reader object for reading blocks from the file.
|
||||
file: FileBlockReader<VirtualFile>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for ImageLayerInner {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("ImageLayerInner")
|
||||
.field("loaded", &self.loaded)
|
||||
.field("index_start_blk", &self.index_start_blk)
|
||||
.field("index_root_blk", &self.index_root_blk)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for ImageLayer {
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
println!(
|
||||
"----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
|
||||
self.desc.tenant_id,
|
||||
@@ -170,11 +169,11 @@ impl Layer for ImageLayer {
|
||||
}
|
||||
|
||||
let inner = self.load(LayerAccessKind::Dump, ctx)?;
|
||||
let file = inner.file.as_ref().unwrap();
|
||||
let file = &inner.file;
|
||||
let tree_reader =
|
||||
DiskBtreeReader::<_, KEY_SIZE>::new(inner.index_start_blk, inner.index_root_blk, file);
|
||||
|
||||
tree_reader.dump()?;
|
||||
tree_reader.dump().await?;
|
||||
|
||||
tree_reader.visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| {
|
||||
println!("key: {} offset {}", hex::encode(key), value);
|
||||
@@ -185,7 +184,7 @@ impl Layer for ImageLayer {
|
||||
}
|
||||
|
||||
/// Look up given page in the file
|
||||
fn get_value_reconstruct_data(
|
||||
async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
@@ -198,7 +197,7 @@ impl Layer for ImageLayer {
|
||||
|
||||
let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;
|
||||
|
||||
let file = inner.file.as_ref().unwrap();
|
||||
let file = &inner.file;
|
||||
let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file);
|
||||
|
||||
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
|
||||
@@ -254,10 +253,6 @@ impl PersistentLayer for ImageLayer {
|
||||
Some(self.path())
|
||||
}
|
||||
|
||||
fn iter(&self, _ctx: &RequestContext) -> Result<LayerIter<'_>> {
|
||||
unimplemented!();
|
||||
}
|
||||
|
||||
fn delete_resident_layer_file(&self) -> Result<()> {
|
||||
// delete underlying file
|
||||
fs::remove_file(self.path())?;
|
||||
@@ -317,52 +312,26 @@ impl ImageLayer {
|
||||
/// Open the underlying file and read the metadata into memory, if it's
|
||||
/// not loaded already.
|
||||
///
|
||||
fn load(
|
||||
&self,
|
||||
access_kind: LayerAccessKind,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<RwLockReadGuard<ImageLayerInner>> {
|
||||
fn load(&self, access_kind: LayerAccessKind, ctx: &RequestContext) -> Result<&ImageLayerInner> {
|
||||
self.access_stats
|
||||
.record_access(access_kind, ctx.task_kind());
|
||||
loop {
|
||||
// Quick exit if already loaded
|
||||
let inner = self.inner.read().unwrap();
|
||||
if inner.loaded {
|
||||
if let Some(inner) = self.inner.get() {
|
||||
return Ok(inner);
|
||||
}
|
||||
|
||||
// Need to open the file and load the metadata. Upgrade our lock to
|
||||
// a write lock. (Or rather, release and re-lock in write mode.)
|
||||
drop(inner);
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
if !inner.loaded {
|
||||
self.load_inner(&mut inner).with_context(|| {
|
||||
format!("Failed to load image layer {}", self.path().display())
|
||||
})?
|
||||
} else {
|
||||
// Another thread loaded it while we were not holding the lock.
|
||||
}
|
||||
|
||||
// We now have the file open and loaded. There's no function to do
|
||||
// that in the std library RwLock, so we have to release and re-lock
|
||||
// in read mode. (To be precise, the lock guard was moved in the
|
||||
// above call to `load_inner`, so it's already been released). And
|
||||
// while we do that, another thread could unload again, so we have
|
||||
// to re-check and retry if that happens.
|
||||
drop(inner);
|
||||
self.inner
|
||||
.get_or_try_init(|| self.load_inner())
|
||||
.with_context(|| format!("Failed to load image layer {}", self.path().display()))?;
|
||||
}
|
||||
}
|
||||
|
||||
fn load_inner(&self, inner: &mut ImageLayerInner) -> Result<()> {
|
||||
fn load_inner(&self) -> Result<ImageLayerInner> {
|
||||
let path = self.path();
|
||||
|
||||
// Open the file if it's not open already.
|
||||
if inner.file.is_none() {
|
||||
let file = VirtualFile::open(&path)
|
||||
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
|
||||
inner.file = Some(FileBlockReader::new(file));
|
||||
}
|
||||
let file = inner.file.as_mut().unwrap();
|
||||
let file = VirtualFile::open(&path)
|
||||
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
|
||||
let file = FileBlockReader::new(file);
|
||||
let summary_blk = file.read_blk(0)?;
|
||||
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
|
||||
|
||||
@@ -390,10 +359,11 @@ impl ImageLayer {
|
||||
}
|
||||
}
|
||||
|
||||
inner.index_start_blk = actual_summary.index_start_blk;
|
||||
inner.index_root_blk = actual_summary.index_root_blk;
|
||||
inner.loaded = true;
|
||||
Ok(())
|
||||
Ok(ImageLayerInner {
|
||||
index_start_blk: actual_summary.index_start_blk,
|
||||
index_root_blk: actual_summary.index_root_blk,
|
||||
file,
|
||||
})
|
||||
}
|
||||
|
||||
/// Create an ImageLayer struct representing an existing file on disk
|
||||
@@ -417,12 +387,7 @@ impl ImageLayer {
|
||||
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
|
||||
lsn: filename.lsn,
|
||||
access_stats,
|
||||
inner: RwLock::new(ImageLayerInner {
|
||||
loaded: false,
|
||||
file: None,
|
||||
index_start_blk: 0,
|
||||
index_root_blk: 0,
|
||||
}),
|
||||
inner: OnceCell::new(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -449,12 +414,7 @@ impl ImageLayer {
|
||||
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
|
||||
lsn: summary.lsn,
|
||||
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
|
||||
inner: RwLock::new(ImageLayerInner {
|
||||
file: None,
|
||||
loaded: false,
|
||||
index_start_blk: 0,
|
||||
index_root_blk: 0,
|
||||
}),
|
||||
inner: OnceCell::new(),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -615,12 +575,7 @@ impl ImageLayerWriterInner {
|
||||
desc,
|
||||
lsn: self.lsn,
|
||||
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
|
||||
inner: RwLock::new(ImageLayerInner {
|
||||
loaded: false,
|
||||
file: None,
|
||||
index_start_blk,
|
||||
index_root_blk,
|
||||
}),
|
||||
inner: OnceCell::new(),
|
||||
};
|
||||
|
||||
// fsync the file
|
||||
@@ -660,7 +615,7 @@ impl ImageLayerWriterInner {
|
||||
///
|
||||
/// # Note
|
||||
///
|
||||
/// As described in https://github.com/neondatabase/neon/issues/2650, it's
|
||||
/// As described in <https://github.com/neondatabase/neon/issues/2650>, it's
|
||||
/// possible for the writer to drop before `finish` is actually called. So this
|
||||
/// could lead to odd temporary files in the directory, exhausting file system.
|
||||
/// This structure wraps `ImageLayerWriterInner` and also contains `Drop`
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::tenant::blob_io::{BlobCursor, BlobWriter};
|
||||
use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::block_io::BlockReader;
|
||||
use crate::tenant::ephemeral_file::EphemeralFile;
|
||||
use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
|
||||
@@ -110,6 +110,7 @@ impl InMemoryLayer {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for InMemoryLayer {
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
Key::MIN..Key::MAX
|
||||
@@ -132,7 +133,7 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
||||
async fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
let end_str = inner
|
||||
@@ -150,7 +151,7 @@ impl Layer for InMemoryLayer {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let mut cursor = inner.file.block_cursor();
|
||||
let cursor = inner.file.block_cursor();
|
||||
let mut buf = Vec::new();
|
||||
for (key, vec_map) in inner.index.iter() {
|
||||
for (lsn, pos) in vec_map.as_slice() {
|
||||
@@ -183,7 +184,7 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
|
||||
/// Look up given value in the layer.
|
||||
fn get_value_reconstruct_data(
|
||||
async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
@@ -195,7 +196,7 @@ impl Layer for InMemoryLayer {
|
||||
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
let mut reader = inner.file.block_cursor();
|
||||
let reader = inner.file.block_cursor();
|
||||
|
||||
// Scan the page versions backwards, starting from `lsn`.
|
||||
if let Some(vec_map) = inner.index.get(&key) {
|
||||
@@ -353,7 +354,7 @@ impl InMemoryLayer {
|
||||
|
||||
let mut buf = Vec::new();
|
||||
|
||||
let mut cursor = inner.file.block_cursor();
|
||||
let cursor = inner.file.block_cursor();
|
||||
|
||||
let mut keys: Vec<(&Key, &VecMap<Lsn, u64>)> = inner.index.iter().collect();
|
||||
keys.sort_by_key(|k| k.0);
|
||||
|
||||
@@ -20,12 +20,12 @@ use utils::{
|
||||
|
||||
use super::filename::{DeltaFileName, ImageFileName};
|
||||
use super::{
|
||||
AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter,
|
||||
LayerKeyIter, LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
|
||||
AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset,
|
||||
LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
|
||||
};
|
||||
|
||||
/// RemoteLayer is a not yet downloaded [`ImageLayer`] or
|
||||
/// [`crate::storage_layer::DeltaLayer`].
|
||||
/// [`DeltaLayer`](super::DeltaLayer).
|
||||
///
|
||||
/// RemoteLayer might be downloaded on-demand during operations which are
|
||||
/// allowed download remote layers and during which, it gets replaced with a
|
||||
@@ -50,6 +50,8 @@ pub struct RemoteLayer {
|
||||
/// It is very unlikely to accumulate these in the Timeline's LayerMap, but having this avoids
|
||||
/// a possible fast loop between `Timeline::get_reconstruct_data` and
|
||||
/// `Timeline::download_remote_layer`, which also logs.
|
||||
///
|
||||
/// [`ongoing_download`]: Self::ongoing_download
|
||||
pub(crate) download_replacement_failure: std::sync::atomic::AtomicBool,
|
||||
}
|
||||
|
||||
@@ -63,8 +65,9 @@ impl std::fmt::Debug for RemoteLayer {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for RemoteLayer {
|
||||
fn get_value_reconstruct_data(
|
||||
async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
_key: Key,
|
||||
_lsn_range: Range<Lsn>,
|
||||
@@ -75,7 +78,7 @@ impl Layer for RemoteLayer {
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
||||
async fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
||||
println!(
|
||||
"----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} is_delta {} is_incremental {} size {} ----",
|
||||
self.desc.tenant_id,
|
||||
@@ -126,14 +129,6 @@ impl PersistentLayer for RemoteLayer {
|
||||
None
|
||||
}
|
||||
|
||||
fn iter(&self, _ctx: &RequestContext) -> Result<LayerIter<'_>> {
|
||||
bail!("cannot iterate a remote layer");
|
||||
}
|
||||
|
||||
fn key_iter(&self, _ctx: &RequestContext) -> Result<LayerKeyIter<'_>> {
|
||||
bail!("cannot iterate a remote layer");
|
||||
}
|
||||
|
||||
fn delete_resident_layer_file(&self) -> Result<()> {
|
||||
bail!("remote layer has no layer file");
|
||||
}
|
||||
|
||||
@@ -111,7 +111,7 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
Duration::from_secs(10)
|
||||
} else {
|
||||
// Run compaction
|
||||
if let Err(e) = tenant.compaction_iteration(&ctx).await {
|
||||
if let Err(e) = tenant.compaction_iteration(&cancel, &ctx).await {
|
||||
error!("Compaction failed, retrying in {:?}: {e:?}", wait_duration);
|
||||
wait_duration
|
||||
} else {
|
||||
@@ -122,12 +122,12 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
warn_when_period_overrun(started_at.elapsed(), period, "compaction");
|
||||
|
||||
// Sleep
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => {
|
||||
info!("received cancellation request during idling");
|
||||
break;
|
||||
},
|
||||
_ = tokio::time::sleep(sleep_duration) => {},
|
||||
if tokio::time::timeout(sleep_duration, cancel.cancelled())
|
||||
.await
|
||||
.is_ok()
|
||||
{
|
||||
info!("received cancellation request during idling");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -196,12 +196,12 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
warn_when_period_overrun(started_at.elapsed(), period, "gc");
|
||||
|
||||
// Sleep
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => {
|
||||
info!("received cancellation request during idling");
|
||||
break;
|
||||
},
|
||||
_ = tokio::time::sleep(sleep_duration) => {},
|
||||
if tokio::time::timeout(sleep_duration, cancel.cancelled())
|
||||
.await
|
||||
.is_ok()
|
||||
{
|
||||
info!("received cancellation request during idling");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -263,9 +263,9 @@ pub(crate) async fn random_init_delay(
|
||||
rng.gen_range(Duration::ZERO..=period)
|
||||
};
|
||||
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => Err(Cancelled),
|
||||
_ = tokio::time::sleep(d) => Ok(()),
|
||||
match tokio::time::timeout(d, cancel.cancelled()).await {
|
||||
Ok(_) => Err(Cancelled),
|
||||
Err(_) => Ok(()),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
576
pageserver/src/tenant/timeline/delete.rs
Normal file
576
pageserver/src/tenant/timeline/delete.rs
Normal file
@@ -0,0 +1,576 @@
|
||||
use std::{
|
||||
ops::{Deref, DerefMut},
|
||||
sync::Arc,
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use pageserver_api::models::TimelineState;
|
||||
use tokio::sync::OwnedMutexGuard;
|
||||
use tracing::{debug, error, info, instrument, warn, Instrument, Span};
|
||||
use utils::{
|
||||
crashsafe, fs_ext,
|
||||
id::{TenantId, TimelineId},
|
||||
};
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
task_mgr::{self, TaskKind},
|
||||
tenant::{
|
||||
metadata::TimelineMetadata,
|
||||
remote_timeline_client::{
|
||||
self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient,
|
||||
},
|
||||
CreateTimelineCause, DeleteTimelineError, Tenant,
|
||||
},
|
||||
InitializationOrder,
|
||||
};
|
||||
|
||||
use super::Timeline;
|
||||
|
||||
/// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
|
||||
async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
|
||||
// Stop the walreceiver first.
|
||||
debug!("waiting for wal receiver to shutdown");
|
||||
let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() };
|
||||
if let Some(walreceiver) = maybe_started_walreceiver {
|
||||
walreceiver.stop().await;
|
||||
}
|
||||
debug!("wal receiver shutdown confirmed");
|
||||
|
||||
// Prevent new uploads from starting.
|
||||
if let Some(remote_client) = timeline.remote_client.as_ref() {
|
||||
let res = remote_client.stop();
|
||||
match res {
|
||||
Ok(()) => {}
|
||||
Err(e) => match e {
|
||||
remote_timeline_client::StopError::QueueUninitialized => {
|
||||
// This case shouldn't happen currently because the
|
||||
// load and attach code bails out if _any_ of the timeline fails to fetch its IndexPart.
|
||||
// That is, before we declare the Tenant as Active.
|
||||
// But we only allow calls to delete_timeline on Active tenants.
|
||||
return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs")));
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// Stop & wait for the remaining timeline tasks, including upload tasks.
|
||||
// NB: This and other delete_timeline calls do not run as a task_mgr task,
|
||||
// so, they are not affected by this shutdown_tasks() call.
|
||||
info!("waiting for timeline tasks to shutdown");
|
||||
task_mgr::shutdown_tasks(None, Some(timeline.tenant_id), Some(timeline.timeline_id)).await;
|
||||
|
||||
fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: timeline-delete-before-index-deleted-at"
|
||||
))?
|
||||
});
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Mark timeline as deleted in S3 so we won't pick it up next time
|
||||
/// during attach or pageserver restart.
|
||||
/// See comment in persist_index_part_with_deleted_flag.
|
||||
async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
|
||||
if let Some(remote_client) = timeline.remote_client.as_ref() {
|
||||
match remote_client.persist_index_part_with_deleted_flag().await {
|
||||
// If we (now, or already) marked it successfully as deleted, we can proceed
|
||||
Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
|
||||
// Bail out otherwise
|
||||
//
|
||||
// AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents
|
||||
// two tasks from performing the deletion at the same time. The first task
|
||||
// that starts deletion should run it to completion.
|
||||
Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
|
||||
| Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
|
||||
return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// We delete local files first, so if pageserver restarts after local files deletion then remote deletion is not continued.
|
||||
// This can be solved with inversion of these steps. But even if these steps are inverted then, when index_part.json
|
||||
// gets deleted there is no way to distinguish between "this timeline is good, we just didnt upload it to remote"
|
||||
// and "this timeline is deleted we should continue with removal of local state". So to avoid the ambiguity we use a mark file.
|
||||
// After index part is deleted presence of this mark file indentifies that it was a deletion intention.
|
||||
// So we can just remove the mark file.
|
||||
async fn create_delete_mark(
|
||||
conf: &PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<(), DeleteTimelineError> {
|
||||
fail::fail_point!("timeline-delete-before-delete-mark", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: timeline-delete-before-delete-mark"
|
||||
))?
|
||||
});
|
||||
let marker_path = conf.timeline_delete_mark_file_path(tenant_id, timeline_id);
|
||||
|
||||
// Note: we're ok to replace existing file.
|
||||
let _ = std::fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.create(true)
|
||||
.open(&marker_path)
|
||||
.with_context(|| format!("could not create delete marker file {marker_path:?}"))?;
|
||||
|
||||
crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Grab the layer_removal_cs lock, and actually perform the deletion.
|
||||
///
|
||||
/// This lock prevents prevents GC or compaction from running at the same time.
|
||||
/// The GC task doesn't register itself with the timeline it's operating on,
|
||||
/// so it might still be running even though we called `shutdown_tasks`.
|
||||
///
|
||||
/// Note that there are still other race conditions between
|
||||
/// GC, compaction and timeline deletion. See
|
||||
/// <https://github.com/neondatabase/neon/issues/2671>
|
||||
///
|
||||
/// No timeout here, GC & Compaction should be responsive to the
|
||||
/// `TimelineState::Stopping` change.
|
||||
async fn delete_local_layer_files(
|
||||
conf: &PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
timeline: &Timeline,
|
||||
) -> anyhow::Result<()> {
|
||||
info!("waiting for layer_removal_cs.lock()");
|
||||
let layer_removal_guard = timeline.layer_removal_cs.lock().await;
|
||||
info!("got layer_removal_cs.lock(), deleting layer files");
|
||||
|
||||
// NB: storage_sync upload tasks that reference these layers have been cancelled
|
||||
// by the caller.
|
||||
|
||||
let local_timeline_directory = conf.timeline_path(&tenant_id, &timeline.timeline_id);
|
||||
|
||||
fail::fail_point!("timeline-delete-before-rm", |_| {
|
||||
Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
|
||||
});
|
||||
|
||||
// NB: This need not be atomic because the deleted flag in the IndexPart
|
||||
// will be observed during tenant/timeline load. The deletion will be resumed there.
|
||||
//
|
||||
// For configurations without remote storage, we guarantee crash-safety by persising delete mark file.
|
||||
//
|
||||
// Note that here we do not bail out on std::io::ErrorKind::NotFound.
|
||||
// This can happen if we're called a second time, e.g.,
|
||||
// because of a previous failure/cancellation at/after
|
||||
// failpoint timeline-delete-after-rm.
|
||||
//
|
||||
// It can also happen if we race with tenant detach, because,
|
||||
// it doesn't grab the layer_removal_cs lock.
|
||||
//
|
||||
// For now, log and continue.
|
||||
// warn! level is technically not appropriate for the
|
||||
// first case because we should expect retries to happen.
|
||||
// But the error is so rare, it seems better to get attention if it happens.
|
||||
//
|
||||
// Note that metadata removal is skipped, this is not technically needed,
|
||||
// but allows to reuse timeline loading code during resumed deletion.
|
||||
// (we always expect that metadata is in place when timeline is being loaded)
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
let mut counter = 0;
|
||||
|
||||
// Timeline directory may not exist if we failed to delete mark file and request was retried.
|
||||
if !local_timeline_directory.exists() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let metadata_path = conf.metadata_path(&tenant_id, &timeline.timeline_id);
|
||||
|
||||
for entry in walkdir::WalkDir::new(&local_timeline_directory).contents_first(true) {
|
||||
#[cfg(feature = "testing")]
|
||||
{
|
||||
counter += 1;
|
||||
if counter == 2 {
|
||||
fail::fail_point!("timeline-delete-during-rm", |_| {
|
||||
Err(anyhow::anyhow!("failpoint: timeline-delete-during-rm"))?
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
let entry = entry?;
|
||||
if entry.path() == metadata_path {
|
||||
debug!("found metadata, skipping");
|
||||
continue;
|
||||
}
|
||||
|
||||
if entry.path() == local_timeline_directory {
|
||||
// Keeping directory because metedata file is still there
|
||||
debug!("found timeline dir itself, skipping");
|
||||
continue;
|
||||
}
|
||||
|
||||
let metadata = match entry.metadata() {
|
||||
Ok(metadata) => metadata,
|
||||
Err(e) => {
|
||||
if crate::is_walkdir_io_not_found(&e) {
|
||||
warn!(
|
||||
timeline_dir=?local_timeline_directory,
|
||||
path=?entry.path().display(),
|
||||
"got not found err while removing timeline dir, proceeding anyway"
|
||||
);
|
||||
continue;
|
||||
}
|
||||
anyhow::bail!(e);
|
||||
}
|
||||
};
|
||||
|
||||
let r = if metadata.is_dir() {
|
||||
// There shouldnt be any directories inside timeline dir as of current layout.
|
||||
tokio::fs::remove_dir(entry.path()).await
|
||||
} else {
|
||||
tokio::fs::remove_file(entry.path()).await
|
||||
};
|
||||
|
||||
if let Err(e) = r {
|
||||
if e.kind() == std::io::ErrorKind::NotFound {
|
||||
warn!(
|
||||
timeline_dir=?local_timeline_directory,
|
||||
path=?entry.path().display(),
|
||||
"got not found err while removing timeline dir, proceeding anyway"
|
||||
);
|
||||
continue;
|
||||
}
|
||||
anyhow::bail!(anyhow::anyhow!(
|
||||
"Failed to remove: {}. Error: {e}",
|
||||
entry.path().display()
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
info!("finished deleting layer files, releasing layer_removal_cs.lock()");
|
||||
drop(layer_removal_guard);
|
||||
|
||||
fail::fail_point!("timeline-delete-after-rm", |_| {
|
||||
Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Removes remote layers and an index file after them.
|
||||
async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<()> {
|
||||
if let Some(remote_client) = &timeline.remote_client {
|
||||
remote_client.delete_all().await.context("delete_all")?
|
||||
};
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// This function removs remaining traces of a timeline on disk.
|
||||
// Namely: metadata file, timeline directory, delete mark.
|
||||
// Note: io::ErrorKind::NotFound are ignored for metadata and timeline dir.
|
||||
// delete mark should be present because it is the last step during deletion.
|
||||
// (nothing can fail after its deletion)
|
||||
async fn cleanup_remaining_timeline_fs_traces(
|
||||
conf: &PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> anyhow::Result<()> {
|
||||
// Remove local metadata
|
||||
tokio::fs::remove_file(conf.metadata_path(&tenant_id, &timeline_id))
|
||||
.await
|
||||
.or_else(fs_ext::ignore_not_found)
|
||||
.context("remove metadata")?;
|
||||
|
||||
fail::fail_point!("timeline-delete-after-rm-metadata", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: timeline-delete-after-rm-metadata"
|
||||
))?
|
||||
});
|
||||
|
||||
// Remove timeline dir
|
||||
tokio::fs::remove_dir(conf.timeline_path(&tenant_id, &timeline_id))
|
||||
.await
|
||||
.or_else(fs_ext::ignore_not_found)
|
||||
.context("timeline dir")?;
|
||||
|
||||
fail::fail_point!("timeline-delete-after-rm-dir", |_| {
|
||||
Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm-dir"))?
|
||||
});
|
||||
|
||||
// Remove delete mark
|
||||
tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_id, timeline_id))
|
||||
.await
|
||||
.context("remove delete mark")
|
||||
}
|
||||
|
||||
/// It is important that this gets called when DeletionGuard is being held.
|
||||
/// For more context see comments in [`DeleteTimelineFlow::prepare`]
|
||||
async fn remove_timeline_from_tenant(
|
||||
tenant: &Tenant,
|
||||
timeline_id: TimelineId,
|
||||
_: &DeletionGuard, // using it as a witness
|
||||
) -> anyhow::Result<()> {
|
||||
// Remove the timeline from the map.
|
||||
let mut timelines = tenant.timelines.lock().unwrap();
|
||||
let children_exist = timelines
|
||||
.iter()
|
||||
.any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
|
||||
// XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
|
||||
// We already deleted the layer files, so it's probably best to panic.
|
||||
// (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
|
||||
if children_exist {
|
||||
panic!("Timeline grew children while we removed layer files");
|
||||
}
|
||||
|
||||
timelines
|
||||
.remove(&timeline_id)
|
||||
.expect("timeline that we were deleting was concurrently removed from 'timelines' map");
|
||||
|
||||
drop(timelines);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Orchestrates timeline shut down of all timeline tasks, removes its in-memory structures,
|
||||
/// and deletes its data from both disk and s3.
|
||||
/// The sequence of steps:
|
||||
/// 1. Set deleted_at in remote index part.
|
||||
/// 2. Create local mark file.
|
||||
/// 3. Delete local files except metadata (it is simpler this way, to be able to reuse timeline initialization code that expects metadata)
|
||||
/// 4. Delete remote layers
|
||||
/// 5. Delete index part
|
||||
/// 6. Delete meta, timeline directory
|
||||
/// 7. Delete mark file
|
||||
/// It is resumable from any step in case a crash/restart occurs.
|
||||
/// There are three entrypoints to the process:
|
||||
/// 1. [`DeleteTimelineFlow::run`] this is the main one called by a management api handler.
|
||||
/// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present
|
||||
/// and we possibly neeed to continue deletion of remote files.
|
||||
/// 3. [`DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`] is used when we deleted remote
|
||||
/// index but still have local metadata, timeline directory and delete mark.
|
||||
/// Note the only other place that messes around timeline delete mark is the logic that scans directory with timelines during tenant load.
|
||||
#[derive(Default)]
|
||||
pub enum DeleteTimelineFlow {
|
||||
#[default]
|
||||
NotStarted,
|
||||
InProgress,
|
||||
Finished,
|
||||
}
|
||||
|
||||
impl DeleteTimelineFlow {
|
||||
// These steps are run in the context of management api request handler.
|
||||
// Long running steps are continued to run in the background.
|
||||
// NB: If this fails half-way through, and is retried, the retry will go through
|
||||
// all the same steps again. Make sure the code here is idempotent, and don't
|
||||
// error out if some of the shutdown tasks have already been completed!
|
||||
#[instrument(skip_all, fields(tenant_id=%tenant.tenant_id, %timeline_id))]
|
||||
pub async fn run(
|
||||
tenant: &Arc<Tenant>,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<(), DeleteTimelineError> {
|
||||
let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?;
|
||||
|
||||
guard.mark_in_progress()?;
|
||||
|
||||
stop_tasks(&timeline).await?;
|
||||
|
||||
set_deleted_in_remote_index(&timeline).await?;
|
||||
|
||||
create_delete_mark(tenant.conf, timeline.tenant_id, timeline.timeline_id).await?;
|
||||
|
||||
fail::fail_point!("timeline-delete-before-schedule", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: timeline-delete-before-schedule"
|
||||
))?
|
||||
});
|
||||
|
||||
Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn mark_in_progress(&mut self) -> anyhow::Result<()> {
|
||||
match self {
|
||||
Self::Finished => anyhow::bail!("Bug. Is in finished state"),
|
||||
Self::InProgress { .. } => { /* We're in a retry */ }
|
||||
Self::NotStarted => { /* Fresh start */ }
|
||||
}
|
||||
|
||||
*self = Self::InProgress;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Shortcut to create Timeline in stopping state and spawn deletion task.
|
||||
pub async fn resume_deletion(
|
||||
tenant: Arc<Tenant>,
|
||||
timeline_id: TimelineId,
|
||||
local_metadata: &TimelineMetadata,
|
||||
remote_client: Option<RemoteTimelineClient>,
|
||||
init_order: Option<&InitializationOrder>,
|
||||
) -> anyhow::Result<()> {
|
||||
// Note: here we even skip populating layer map. Timeline is essentially uninitialized.
|
||||
// RemoteTimelineClient is the only functioning part.
|
||||
let timeline = tenant
|
||||
.create_timeline_struct(
|
||||
timeline_id,
|
||||
local_metadata,
|
||||
None, // Ancestor is not needed for deletion.
|
||||
remote_client,
|
||||
init_order,
|
||||
// Important. We dont pass ancestor above because it can be missing.
|
||||
// Thus we need to skip the validation here.
|
||||
CreateTimelineCause::Delete,
|
||||
)
|
||||
.context("create_timeline_struct")?;
|
||||
|
||||
let mut guard = DeletionGuard(
|
||||
Arc::clone(&timeline.delete_progress)
|
||||
.try_lock_owned()
|
||||
.expect("cannot happen because we're the only owner"),
|
||||
);
|
||||
|
||||
// We meed to do this because when console retries delete request we shouldnt answer with 404
|
||||
// because 404 means successful deletion.
|
||||
{
|
||||
let mut locked = tenant.timelines.lock().unwrap();
|
||||
locked.insert(timeline_id, Arc::clone(&timeline));
|
||||
}
|
||||
|
||||
guard.mark_in_progress()?;
|
||||
|
||||
// Note that delete mark can be missing on resume
|
||||
// because we create delete mark after we set deleted_at in the index part.
|
||||
create_delete_mark(tenant.conf, tenant.tenant_id, timeline_id).await?;
|
||||
|
||||
Self::schedule_background(guard, tenant.conf, tenant, timeline);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn cleanup_remaining_timeline_fs_traces(
|
||||
tenant: &Tenant,
|
||||
timeline_id: TimelineId,
|
||||
) -> anyhow::Result<()> {
|
||||
cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_id, timeline_id).await
|
||||
}
|
||||
|
||||
fn prepare(
|
||||
tenant: &Tenant,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<(Arc<Timeline>, DeletionGuard), DeleteTimelineError> {
|
||||
// Note the interaction between this guard and deletion guard.
|
||||
// Here we attempt to lock deletion guard when we're holding a lock on timelines.
|
||||
// This is important because when you take into account `remove_timeline_from_tenant`
|
||||
// we remove timeline from memory when we still hold the deletion guard.
|
||||
// So here when timeline deletion is finished timeline wont be present in timelines map at all
|
||||
// which makes the following sequence impossible:
|
||||
// T1: get preempted right before the try_lock on `Timeline::delete_progress`
|
||||
// T2: do a full deletion, acquire and drop `Timeline::delete_progress`
|
||||
// T1: acquire deletion lock, do another `DeleteTimelineFlow::run`
|
||||
// For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346`
|
||||
let timelines = tenant.timelines.lock().unwrap();
|
||||
|
||||
let timeline = match timelines.get(&timeline_id) {
|
||||
Some(t) => t,
|
||||
None => return Err(DeleteTimelineError::NotFound),
|
||||
};
|
||||
|
||||
// Ensure that there are no child timelines **attached to that pageserver**,
|
||||
// because detach removes files, which will break child branches
|
||||
let children: Vec<TimelineId> = timelines
|
||||
.iter()
|
||||
.filter_map(|(id, entry)| {
|
||||
if entry.get_ancestor_timeline_id() == Some(timeline_id) {
|
||||
Some(*id)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
if !children.is_empty() {
|
||||
return Err(DeleteTimelineError::HasChildren(children));
|
||||
}
|
||||
|
||||
// Note that using try_lock here is important to avoid a deadlock.
|
||||
// Here we take lock on timelines and then the deletion guard.
|
||||
// At the end of the operation we're holding the guard and need to lock timelines map
|
||||
// to remove the timeline from it.
|
||||
// Always if you have two locks that are taken in different order this can result in a deadlock.
|
||||
let delete_lock_guard = DeletionGuard(
|
||||
Arc::clone(&timeline.delete_progress)
|
||||
.try_lock_owned()
|
||||
.map_err(|_| DeleteTimelineError::AlreadyInProgress)?,
|
||||
);
|
||||
|
||||
timeline.set_state(TimelineState::Stopping);
|
||||
|
||||
Ok((Arc::clone(timeline), delete_lock_guard))
|
||||
}
|
||||
|
||||
fn schedule_background(
|
||||
guard: DeletionGuard,
|
||||
conf: &'static PageServerConf,
|
||||
tenant: Arc<Tenant>,
|
||||
timeline: Arc<Timeline>,
|
||||
) {
|
||||
let tenant_id = timeline.tenant_id;
|
||||
let timeline_id = timeline.timeline_id;
|
||||
|
||||
task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::TimelineDeletionWorker,
|
||||
Some(tenant_id),
|
||||
Some(timeline_id),
|
||||
"timeline_delete",
|
||||
false,
|
||||
async move {
|
||||
if let Err(err) = Self::background(guard, conf, &tenant, &timeline).await {
|
||||
error!("Error: {err:#}");
|
||||
timeline.set_broken(format!("{err:#}"))
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
.instrument({
|
||||
let span =
|
||||
tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_id, timeline_id=%timeline_id);
|
||||
span.follows_from(Span::current());
|
||||
span
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
async fn background(
|
||||
mut guard: DeletionGuard,
|
||||
conf: &PageServerConf,
|
||||
tenant: &Tenant,
|
||||
timeline: &Timeline,
|
||||
) -> Result<(), DeleteTimelineError> {
|
||||
delete_local_layer_files(conf, tenant.tenant_id, timeline).await?;
|
||||
|
||||
delete_remote_layers_and_index(timeline).await?;
|
||||
|
||||
pausable_failpoint!("in_progress_delete");
|
||||
|
||||
cleanup_remaining_timeline_fs_traces(conf, tenant.tenant_id, timeline.timeline_id).await?;
|
||||
|
||||
remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;
|
||||
|
||||
*guard.0 = Self::Finished;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
struct DeletionGuard(OwnedMutexGuard<DeleteTimelineFlow>);
|
||||
|
||||
impl Deref for DeletionGuard {
|
||||
type Target = DeleteTimelineFlow;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl DerefMut for DeletionGuard {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
&mut self.0
|
||||
}
|
||||
}
|
||||
@@ -30,6 +30,7 @@ use crate::{
|
||||
tenant::{
|
||||
config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
|
||||
storage_layer::PersistentLayer,
|
||||
timeline::EvictionError,
|
||||
LogicalSizeCalculationCause, Tenant,
|
||||
},
|
||||
};
|
||||
@@ -100,11 +101,11 @@ impl Timeline {
|
||||
match cf {
|
||||
ControlFlow::Break(()) => break,
|
||||
ControlFlow::Continue(sleep_until) => {
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => {
|
||||
break;
|
||||
}
|
||||
_ = tokio::time::sleep_until(sleep_until) => { }
|
||||
if tokio::time::timeout_at(sleep_until, cancel.cancelled())
|
||||
.await
|
||||
.is_ok()
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -270,20 +271,22 @@ impl Timeline {
|
||||
None => {
|
||||
stats.skipped_for_shutdown += 1;
|
||||
}
|
||||
Some(Ok(true)) => {
|
||||
debug!("evicted layer {l:?}");
|
||||
Some(Ok(())) => {
|
||||
stats.evicted += 1;
|
||||
}
|
||||
Some(Ok(false)) => {
|
||||
debug!("layer is not evictable: {l:?}");
|
||||
Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
|
||||
stats.not_evictable += 1;
|
||||
}
|
||||
Some(Err(e)) => {
|
||||
// This variant is the case where an unexpected error happened during eviction.
|
||||
// Expected errors that result in non-eviction are `Some(Ok(false))`.
|
||||
// So, dump Debug here to gather as much info as possible in this rare case.
|
||||
warn!("failed to evict layer {l:?}: {e:?}");
|
||||
stats.errors += 1;
|
||||
Some(Err(EvictionError::FileNotFound)) => {
|
||||
// compaction/gc removed the file while we were waiting on layer_removal_cs
|
||||
stats.not_evictable += 1;
|
||||
}
|
||||
Some(Err(
|
||||
e @ EvictionError::LayerNotFound(_) | e @ EvictionError::StatFailed(_),
|
||||
)) => {
|
||||
let e = utils::error::report_compact_sources(&e);
|
||||
warn!(layer = %l, "failed to evict layer: {e}");
|
||||
stats.not_evictable += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -305,8 +308,13 @@ impl Timeline {
|
||||
ctx: &RequestContext,
|
||||
) -> ControlFlow<()> {
|
||||
let mut state = self.eviction_task_timeline_state.lock().await;
|
||||
|
||||
// Only do the imitate_layer accesses approximately as often as the threshold. A little
|
||||
// more frequently, to avoid this period racing with the threshold/period-th eviction iteration.
|
||||
let inter_imitate_period = p.threshold.checked_sub(p.period).unwrap_or(p.threshold);
|
||||
|
||||
match state.last_layer_access_imitation {
|
||||
Some(ts) if ts.elapsed() < p.threshold => { /* no need to run */ }
|
||||
Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
|
||||
_ => {
|
||||
self.imitate_timeline_cached_layer_accesses(cancel, ctx)
|
||||
.await;
|
||||
@@ -329,7 +337,7 @@ impl Timeline {
|
||||
};
|
||||
let mut state = tenant.eviction_task_tenant_state.lock().await;
|
||||
match state.last_layer_access_imitation {
|
||||
Some(ts) if ts.elapsed() < p.threshold => { /* no need to run */ }
|
||||
Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
|
||||
_ => {
|
||||
self.imitate_synthetic_size_calculation_worker(&tenant, ctx, cancel)
|
||||
.await;
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user