Compare commits

..

3 Commits

Author SHA1 Message Date
Alex Chi Z
e76b2061bd fix comments
Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-05-08 10:50:30 -04:00
Alex Chi Z
4a27dd743b Update pageserver/src/aux_file.rs
Co-authored-by: Joonas Koivunen <joonas@neon.tech>
2024-05-08 10:46:58 -04:00
Alex Chi Z
5629f65841 chore(pageserver): use sha256 for aux file encoding
Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-05-07 15:42:16 -04:00
218 changed files with 3708 additions and 7154 deletions

View File

@@ -1,2 +1,2 @@
[profile.default]
slow-timeout = { period = "60s", terminate-after = 3 }
slow-timeout = { period = "20s", terminate-after = 3 }

View File

@@ -1,11 +1,12 @@
self-hosted-runner:
labels:
- arm64
- dev
- gen3
- large
- large-arm64
# Remove `macos-14` from the list after https://github.com/rhysd/actionlint/pull/392 is merged.
- macos-14
- small
- small-arm64
- us-east-2
config-variables:
- REMOTE_STORAGE_AZURE_CONTAINER

View File

@@ -39,7 +39,7 @@ jobs:
matrix:
arch: [ x64, arm64 ]
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
runs-on: ${{ fromJson(format('["self-hosted", "dev", "{0}"]', matrix.arch)) }}
env:
IMAGE_TAG: ${{ inputs.image-tag }}

View File

@@ -341,9 +341,6 @@ jobs:
env:
NEXTEST_RETRIES: 3
run: |
#nextest does not yet support running doctests
cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
for io_engine in std-fs tokio-epoll-uring ; do
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
done
@@ -546,27 +543,9 @@ jobs:
# XXX: no coverage data handling here, since benchmarks are run on release builds,
# while coverage is currently collected for the debug ones
report-benchmarks-failures:
needs: [ benchmarks, create-test-report ]
if: github.ref_name == 'main' && needs.benchmarks.result == 'failure'
runs-on: ubuntu-latest
steps:
- uses: slackapi/slack-github-action@v1
with:
channel-id: C060CNA47S9 # on-call-staging-storage-stream
slack-message: |
Benchmarks failed on main: ${{ github.event.head_commit.url }}
Allure report: ${{ needs.create-test-report.outputs.report-url }}
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
create-test-report:
needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-build-tools-image ]
if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
outputs:
report-url: ${{ steps.create-allure-report.outputs.report-url }}
runs-on: [ self-hosted, gen3, small ]
container:

View File

@@ -136,7 +136,7 @@ jobs:
check-linux-arm-build:
needs: [ check-permissions, build-build-tools-image ]
timeout-minutes: 90
runs-on: [ self-hosted, small-arm64 ]
runs-on: [ self-hosted, dev, arm64 ]
env:
# Use release build only, to have less debug info around
@@ -232,20 +232,20 @@ jobs:
- name: Run cargo build
run: |
mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests -j$(nproc)
mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests
- name: Run cargo test
env:
NEXTEST_RETRIES: 3
run: |
cargo nextest run $CARGO_FEATURES -j$(nproc)
cargo nextest run $CARGO_FEATURES
# Run separate tests for real S3
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
export REMOTE_STORAGE_S3_REGION=eu-central-1
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
cargo nextest run --package remote_storage --test test_real_s3 -j$(nproc)
cargo nextest run --package remote_storage --test test_real_s3
# Run separate tests for real Azure Blob Storage
# XXX: replace region with `eu-central-1`-like region
@@ -255,12 +255,12 @@ jobs:
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
cargo nextest run --package remote_storage --test test_real_azure -j$(nproc)
cargo nextest run --package remote_storage --test test_real_azure
check-codestyle-rust-arm:
needs: [ check-permissions, build-build-tools-image ]
timeout-minutes: 90
runs-on: [ self-hosted, small-arm64 ]
runs-on: [ self-hosted, dev, arm64 ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
@@ -269,11 +269,6 @@ jobs:
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
strategy:
fail-fast: false
matrix:
build_type: [ debug, release ]
steps:
- name: Fix git ownership
run: |
@@ -310,35 +305,31 @@ jobs:
exit 1
fi
echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
- name: Run cargo clippy (debug)
if: matrix.build_type == 'debug'
run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
- name: Run cargo clippy (release)
if: matrix.build_type == 'release'
run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
- name: Check documentation generation
if: matrix.build_type == 'release'
run: cargo doc --workspace --no-deps --document-private-items -j$(nproc)
run: cargo doc --workspace --no-deps --document-private-items
env:
RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
# Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
- name: Check formatting
if: ${{ !cancelled() && matrix.build_type == 'release' }}
if: ${{ !cancelled() }}
run: cargo fmt --all -- --check
# https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
- name: Check rust dependencies
if: ${{ !cancelled() && matrix.build_type == 'release' }}
if: ${{ !cancelled() }}
run: |
cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date
cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack
# https://github.com/EmbarkStudios/cargo-deny
- name: Check rust licenses/bans/advisories/sources
if: ${{ !cancelled() && matrix.build_type == 'release' }}
if: ${{ !cancelled() }}
run: cargo deny check
gather-rust-build-stats:
@@ -347,7 +338,7 @@ jobs:
contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
github.ref_name == 'main'
runs-on: [ self-hosted, large ]
runs-on: [ self-hosted, gen3, large ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
@@ -378,7 +369,7 @@ jobs:
run: make walproposer-lib -j$(nproc)
- name: Produce the build stats
run: cargo build --all --release --timings -j$(nproc)
run: cargo build --all --release --timings
- name: Upload the build stats
id: upload-stats

229
Cargo.lock generated
View File

@@ -25,9 +25,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
[[package]]
name = "ahash"
version = "0.8.11"
version = "0.8.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011"
checksum = "d713b3834d76b85304d4d525563c1276e2e30dc97cc67bfb4585a4a29fc2c89f"
dependencies = [
"cfg-if",
"const-random",
@@ -284,9 +284,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
[[package]]
name = "aws-config"
version = "1.3.0"
version = "1.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baaa0be6ee7d90b775ae6ccb6d2ba182b91219ec2001f92338773a094246af1d"
checksum = "8b30c39ebe61f75d1b3785362b1586b41991873c9ab3e317a9181c246fb71d82"
dependencies = [
"aws-credential-types",
"aws-runtime",
@@ -309,15 +309,14 @@ dependencies = [
"time",
"tokio",
"tracing",
"url",
"zeroize",
]
[[package]]
name = "aws-credential-types"
version = "1.2.0"
version = "1.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e16838e6c9e12125face1c1eff1343c75e3ff540de98ff7ebd61874a89bcfeb9"
checksum = "fa8587ae17c8e967e4b05a62d495be2fb7701bec52a97f7acfe8a29f938384c8"
dependencies = [
"aws-smithy-async",
"aws-smithy-runtime-api",
@@ -327,9 +326,9 @@ dependencies = [
[[package]]
name = "aws-runtime"
version = "1.2.1"
version = "1.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "785da4a15e7b166b505fd577e4560c7a7cd8fbdf842eb1336cbcbf8944ce56f1"
checksum = "b13dc54b4b49f8288532334bba8f87386a40571c47c37b1304979b556dc613c8"
dependencies = [
"aws-credential-types",
"aws-sigv4",
@@ -374,11 +373,10 @@ dependencies = [
[[package]]
name = "aws-sdk-s3"
version = "1.26.0"
version = "1.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7bc5ce518d4b8d16e0408de7bdf1b3097cec61a7daa979750a208f8d9934386d"
checksum = "951f7730f51a2155c711c85c79f337fbc02a577fa99d2a0a8059acfce5392113"
dependencies = [
"ahash",
"aws-credential-types",
"aws-runtime",
"aws-sigv4",
@@ -393,25 +391,20 @@ dependencies = [
"aws-smithy-xml",
"aws-types",
"bytes",
"fastrand 2.0.0",
"hex",
"hmac",
"http 0.2.9",
"http-body 0.4.5",
"lru",
"once_cell",
"percent-encoding",
"regex-lite",
"sha2",
"tracing",
"url",
]
[[package]]
name = "aws-sdk-sso"
version = "1.22.0"
version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca3d6c4cba4e009391b72b0fcf12aff04ea3c9c3aa2ecaafa330326a8bd7e601"
checksum = "f486420a66caad72635bc2ce0ff6581646e0d32df02aa39dc983bfe794955a5b"
dependencies = [
"aws-credential-types",
"aws-runtime",
@@ -431,9 +424,9 @@ dependencies = [
[[package]]
name = "aws-sdk-ssooidc"
version = "1.22.0"
version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73400dc239d14f63d932f4ca7b55af5e9ef1f857f7d70655249ccc287adb2570"
checksum = "39ddccf01d82fce9b4a15c8ae8608211ee7db8ed13a70b514bbfe41df3d24841"
dependencies = [
"aws-credential-types",
"aws-runtime",
@@ -453,9 +446,9 @@ dependencies = [
[[package]]
name = "aws-sdk-sts"
version = "1.22.0"
version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "10f8858308af76fba3e5ffcf1bb56af5471574d2bdfaf0159470c25bc2f760e5"
checksum = "1a591f8c7e6a621a501b2b5d2e88e1697fcb6274264523a6ad4d5959889a41ce"
dependencies = [
"aws-credential-types",
"aws-runtime",
@@ -476,9 +469,9 @@ dependencies = [
[[package]]
name = "aws-sigv4"
version = "1.2.1"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "58b56f1cbe6fd4d0c2573df72868f20ab1c125ca9c9dbce17927a463433a2e57"
checksum = "11d6f29688a4be9895c0ba8bef861ad0c0dac5c15e9618b9b7a6c233990fc263"
dependencies = [
"aws-credential-types",
"aws-smithy-eventstream",
@@ -505,9 +498,9 @@ dependencies = [
[[package]]
name = "aws-smithy-async"
version = "1.2.1"
version = "1.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62220bc6e97f946ddd51b5f1361f78996e704677afc518a4ff66b7a72ea1378c"
checksum = "d26ea8fa03025b2face2b3038a63525a10891e3d8829901d502e5384a0d8cd46"
dependencies = [
"futures-util",
"pin-project-lite",
@@ -516,9 +509,9 @@ dependencies = [
[[package]]
name = "aws-smithy-checksums"
version = "0.60.7"
version = "0.60.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83fa43bc04a6b2441968faeab56e68da3812f978a670a5db32accbdcafddd12f"
checksum = "be2acd1b9c6ae5859999250ed5a62423aedc5cf69045b844432de15fa2f31f2b"
dependencies = [
"aws-smithy-http",
"aws-smithy-types",
@@ -548,9 +541,9 @@ dependencies = [
[[package]]
name = "aws-smithy-http"
version = "0.60.8"
version = "0.60.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a7de001a1b9a25601016d8057ea16e31a45fdca3751304c8edf4ad72e706c08"
checksum = "3f10fa66956f01540051b0aa7ad54574640f748f9839e843442d99b970d3aff9"
dependencies = [
"aws-smithy-eventstream",
"aws-smithy-runtime-api",
@@ -588,9 +581,9 @@ dependencies = [
[[package]]
name = "aws-smithy-runtime"
version = "1.5.0"
version = "1.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c9ac79e9f3a4d576f3cd4a470a0275b138d9e7b11b1cd514a6858ae0a79dd5bb"
checksum = "ec81002d883e5a7fd2bb063d6fb51c4999eb55d404f4fff3dd878bf4733b9f01"
dependencies = [
"aws-smithy-async",
"aws-smithy-http",
@@ -601,7 +594,6 @@ dependencies = [
"h2 0.3.26",
"http 0.2.9",
"http-body 0.4.5",
"http-body 1.0.0",
"hyper 0.14.26",
"hyper-rustls 0.24.0",
"once_cell",
@@ -614,9 +606,9 @@ dependencies = [
[[package]]
name = "aws-smithy-runtime-api"
version = "1.6.0"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04ec42c2f5c0e7796a2848dde4d9f3bf8ce12ccbb3d5aa40c52fa0cdd61a1c47"
checksum = "9acb931e0adaf5132de878f1398d83f8677f90ba70f01f65ff87f6d7244be1c5"
dependencies = [
"aws-smithy-async",
"aws-smithy-types",
@@ -631,19 +623,16 @@ dependencies = [
[[package]]
name = "aws-smithy-types"
version = "1.1.9"
version = "1.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf98d97bba6ddaba180f1b1147e202d8fe04940403a95a3f826c790f931bbd1"
checksum = "abe14dceea1e70101d38fbf2a99e6a34159477c0fb95e68e05c66bd7ae4c3729"
dependencies = [
"base64-simd",
"bytes",
"bytes-utils",
"futures-core",
"http 0.2.9",
"http 1.1.0",
"http-body 0.4.5",
"http-body 1.0.0",
"http-body-util",
"itoa",
"num-integer",
"pin-project-lite",
@@ -657,18 +646,18 @@ dependencies = [
[[package]]
name = "aws-smithy-xml"
version = "0.60.8"
version = "0.60.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d123fbc2a4adc3c301652ba8e149bf4bc1d1725affb9784eb20c953ace06bf55"
checksum = "872c68cf019c0e4afc5de7753c4f7288ce4b71663212771bf5e4542eb9346ca9"
dependencies = [
"xmlparser",
]
[[package]]
name = "aws-types"
version = "1.2.0"
version = "1.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a43b56df2c529fe44cb4d92bd64d0479883fb9608ff62daede4df5405381814"
checksum = "0dbf2f3da841a8930f159163175cf6a3d16ddde517c1b0fba7aa776822800f40"
dependencies = [
"aws-credential-types",
"aws-smithy-async",
@@ -708,7 +697,7 @@ dependencies = [
"sha1",
"sync_wrapper",
"tokio",
"tokio-tungstenite",
"tokio-tungstenite 0.20.0",
"tower",
"tower-layer",
"tower-service",
@@ -979,12 +968,6 @@ version = "3.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1"
[[package]]
name = "bytemuck"
version = "1.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78834c15cb5d5efe3452d58b1e8ba890dd62d21907f867f383358198e56ebca5"
[[package]]
name = "byteorder"
version = "1.4.3"
@@ -1072,9 +1055,9 @@ dependencies = [
[[package]]
name = "chrono"
version = "0.4.38"
version = "0.4.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401"
checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38"
dependencies = [
"android-tzdata",
"iana-time-zone",
@@ -1082,7 +1065,7 @@ dependencies = [
"num-traits",
"serde",
"wasm-bindgen",
"windows-targets 0.52.4",
"windows-targets 0.48.0",
]
[[package]]
@@ -1109,7 +1092,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "defaa24ecc093c77630e6c15e17c51f5e187bf35ee514f4e2d67baaa96dae22b"
dependencies = [
"ciborium-io",
"half 1.8.2",
"half",
]
[[package]]
@@ -1239,10 +1222,8 @@ dependencies = [
"serde_json",
"signal-hook",
"tar",
"thiserror",
"tokio",
"tokio-postgres",
"tokio-stream",
"tokio-util",
"toml_edit",
"tracing",
@@ -1604,7 +1585,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6943ae99c34386c84a470c499d3414f66502a41340aa895406e0d2e4a207b91d"
dependencies = [
"cfg-if",
"hashbrown 0.14.5",
"hashbrown 0.14.0",
"lock_api",
"once_cell",
"parking_lot_core 0.9.8",
@@ -2005,27 +1986,6 @@ dependencies = [
"percent-encoding",
]
[[package]]
name = "framed-websockets"
version = "0.1.0"
source = "git+https://github.com/neondatabase/framed-websockets#34eff3d6f8cfccbc5f35e4f65314ff7328621127"
dependencies = [
"base64 0.21.1",
"bytemuck",
"bytes",
"futures-core",
"futures-sink",
"http-body-util",
"hyper 1.2.0",
"hyper-util",
"pin-project",
"rand 0.8.5",
"sha1",
"thiserror",
"tokio",
"tokio-util",
]
[[package]]
name = "fs2"
version = "0.4.3"
@@ -2278,17 +2238,6 @@ version = "1.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7"
[[package]]
name = "half"
version = "2.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888"
dependencies = [
"cfg-if",
"crunchy",
"num-traits",
]
[[package]]
name = "hash32"
version = "0.3.1"
@@ -2315,9 +2264,9 @@ dependencies = [
[[package]]
name = "hashbrown"
version = "0.14.5"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a"
dependencies = [
"ahash",
"allocator-api2",
@@ -2325,11 +2274,11 @@ dependencies = [
[[package]]
name = "hashlink"
version = "0.9.1"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ba4ff7128dee98c7dc9794b6a411377e1404dba1c97deb8d1a55297bd25d8af"
checksum = "e8094feaf31ff591f651a2664fb9cfd92bba7a60ce3197265e9482ebe753c8f7"
dependencies = [
"hashbrown 0.14.5",
"hashbrown 0.14.0",
]
[[package]]
@@ -2638,6 +2587,21 @@ dependencies = [
"tokio-native-tls",
]
[[package]]
name = "hyper-tungstenite"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a343d17fe7885302ed7252767dc7bb83609a874b6ff581142241ec4b73957ad"
dependencies = [
"http-body-util",
"hyper 1.2.0",
"hyper-util",
"pin-project-lite",
"tokio",
"tokio-tungstenite 0.21.0",
"tungstenite 0.21.0",
]
[[package]]
name = "hyper-util"
version = "0.1.3"
@@ -2715,7 +2679,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ad227c3af19d4914570ad36d30409928b75967c298feb9ea1969db3a610bb14e"
dependencies = [
"equivalent",
"hashbrown 0.14.5",
"hashbrown 0.14.0",
]
[[package]]
@@ -2971,15 +2935,6 @@ version = "0.4.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
[[package]]
name = "lru"
version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3262e75e648fce39813cb56ac41f3c3e3f65217ebf3844d818d1f9398cfb0dc"
dependencies = [
"hashbrown 0.14.5",
]
[[package]]
name = "match_cfg"
version = "0.1.0"
@@ -3030,7 +2985,7 @@ checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5"
dependencies = [
"bytes",
"crossbeam-utils",
"hashbrown 0.14.5",
"hashbrown 0.14.0",
"itoa",
"lasso",
"measured-derive",
@@ -3592,7 +3547,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49203cdcae0030493bad186b28da2fa25645fa276a51b6fec8010d281e02ef79"
dependencies = [
"dlv-list",
"hashbrown 0.14.5",
"hashbrown 0.14.0",
]
[[package]]
@@ -3737,6 +3692,7 @@ dependencies = [
"serde_json",
"serde_path_to_error",
"serde_with",
"sha2",
"signal-hook",
"smallvec",
"storage_broker",
@@ -3913,14 +3869,13 @@ dependencies = [
[[package]]
name = "parquet"
version = "51.0.0"
source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829"
version = "49.0.0"
source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9"
dependencies = [
"ahash",
"bytes",
"chrono",
"half 2.4.1",
"hashbrown 0.14.5",
"hashbrown 0.14.0",
"num",
"num-bigint",
"paste",
@@ -3928,13 +3883,12 @@ dependencies = [
"thrift",
"twox-hash",
"zstd",
"zstd-sys",
]
[[package]]
name = "parquet_derive"
version = "51.0.0"
source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829"
version = "49.0.0"
source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9"
dependencies = [
"parquet",
"proc-macro2",
@@ -4405,10 +4359,9 @@ dependencies = [
"dashmap",
"env_logger",
"fallible-iterator",
"framed-websockets",
"futures",
"git-version",
"hashbrown 0.14.5",
"hashbrown 0.13.2",
"hashlink",
"hex",
"hmac",
@@ -4418,6 +4371,7 @@ dependencies = [
"humantime",
"hyper 0.14.26",
"hyper 1.2.0",
"hyper-tungstenite",
"hyper-util",
"indexmap 2.0.1",
"ipnet",
@@ -4462,6 +4416,7 @@ dependencies = [
"smol_str",
"socket2 0.5.5",
"subtle",
"sync_wrapper",
"task-local-extensions",
"thiserror",
"tikv-jemalloc-ctl",
@@ -4470,7 +4425,6 @@ dependencies = [
"tokio-postgres",
"tokio-postgres-rustls",
"tokio-rustls 0.25.0",
"tokio-tungstenite",
"tokio-util",
"tower-service",
"tracing",
@@ -5979,7 +5933,7 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc"
[[package]]
name = "svg_fmt"
version = "0.4.2"
source = "git+https://github.com/nical/rust_debug?rev=28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4#28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4"
source = "git+https://github.com/neondatabase/fork--nical--rust_debug?branch=neon#c1820b28664b5df68de7f043fccf2ed5d67b6ae8"
[[package]]
name = "syn"
@@ -6407,7 +6361,19 @@ dependencies = [
"futures-util",
"log",
"tokio",
"tungstenite",
"tungstenite 0.20.1",
]
[[package]]
name = "tokio-tungstenite"
version = "0.21.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c83b561d025642014097b66e6c1bb422783339e0909e4429cde4749d1990bc38"
dependencies = [
"futures-util",
"log",
"tokio",
"tungstenite 0.21.0",
]
[[package]]
@@ -6421,7 +6387,7 @@ dependencies = [
"futures-io",
"futures-sink",
"futures-util",
"hashbrown 0.14.5",
"hashbrown 0.14.0",
"pin-project-lite",
"tokio",
"tracing",
@@ -6703,6 +6669,25 @@ dependencies = [
"utf-8",
]
[[package]]
name = "tungstenite"
version = "0.21.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ef1a641ea34f399a848dea702823bbecfb4c486f911735368f1f137cb8257e1"
dependencies = [
"byteorder",
"bytes",
"data-encoding",
"http 1.1.0",
"httparse",
"log",
"rand 0.8.5",
"sha1",
"thiserror",
"url",
"utf-8",
]
[[package]]
name = "twox-hash"
version = "1.6.3"
@@ -7498,7 +7483,7 @@ dependencies = [
"futures-sink",
"futures-util",
"getrandom 0.2.11",
"hashbrown 0.14.5",
"hashbrown 0.14.0",
"hex",
"hmac",
"hyper 0.14.26",

View File

@@ -52,14 +52,14 @@ azure_storage_blobs = "0.19"
flate2 = "1.0.26"
async-stream = "0.3"
async-trait = "0.1"
aws-config = { version = "1.3", default-features = false, features=["rustls"] }
aws-sdk-s3 = "1.26"
aws-config = { version = "1.1.4", default-features = false, features=["rustls"] }
aws-sdk-s3 = "1.14"
aws-sdk-iam = "1.15.0"
aws-smithy-async = { version = "1.2.1", default-features = false, features=["rt-tokio"] }
aws-smithy-types = "1.1.9"
aws-credential-types = "1.2.0"
aws-sigv4 = { version = "1.2.1", features = ["sign-http"] }
aws-types = "1.2.0"
aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] }
aws-smithy-types = "1.1.4"
aws-credential-types = "1.1.4"
aws-sigv4 = { version = "1.2.0", features = ["sign-http"] }
aws-types = "1.1.7"
axum = { version = "0.6.20", features = ["ws"] }
base64 = "0.13.0"
bincode = "1.3"
@@ -81,14 +81,13 @@ enum-map = "2.4.2"
enumset = "1.0.12"
fail = "0.5.0"
fallible-iterator = "0.2"
framed-websockets = { version = "0.1.0", git = "https://github.com/neondatabase/framed-websockets" }
fs2 = "0.4.3"
futures = "0.3"
futures-core = "0.3"
futures-util = "0.3"
git-version = "0.3"
hashbrown = "0.14"
hashlink = "0.9.1"
hashbrown = "0.13"
hashlink = "0.8.4"
hdrhistogram = "7.5.2"
hex = "0.4"
hex-literal = "0.4"
@@ -99,7 +98,7 @@ http-types = { version = "2", default-features = false }
humantime = "2.1"
humantime-serde = "1.1.1"
hyper = "0.14"
tokio-tungstenite = "0.20.0"
hyper-tungstenite = "0.13.0"
indexmap = "2"
inotify = "0.10.2"
ipnet = "2.9.0"
@@ -122,8 +121,8 @@ opentelemetry = "0.20.0"
opentelemetry-otlp = { version = "0.13.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions = "0.12.0"
parking_lot = "0.12"
parquet = { version = "51.0.0", default-features = false, features = ["zstd"] }
parquet_derive = "51.0.0"
parquet = { version = "49.0.0", default-features = false, features = ["zstd"] }
parquet_derive = "49.0.0"
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
pin-project-lite = "0.2"
procfs = "0.14"
@@ -159,8 +158,8 @@ socket2 = "0.5"
strum = "0.24"
strum_macros = "0.24"
"subtle" = "2.5.0"
# Our PR https://github.com/nical/rust_debug/pull/4 has been merged but no new version released yet
svg_fmt = { git = "https://github.com/nical/rust_debug", rev = "28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4" }
# https://github.com/nical/rust_debug/pull/4
svg_fmt = { git = "https://github.com/neondatabase/fork--nical--rust_debug", branch = "neon" }
sync_wrapper = "0.1.2"
tar = "0.4"
task-local-extensions = "0.1.4"
@@ -244,8 +243,8 @@ tonic-build = "0.9"
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
# bug fixes for UUID
parquet = { git = "https://github.com/apache/arrow-rs", branch = "master" }
parquet_derive = { git = "https://github.com/apache/arrow-rs", branch = "master" }
parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
parquet_derive = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
################# Binary contents sections

View File

@@ -87,7 +87,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws
&& rm awscliv2.zip
# Mold: A Modern Linker
ENV MOLD_VERSION v2.31.0
ENV MOLD_VERSION v2.4.0
RUN set -e \
&& git clone https://github.com/rui314/mold.git \
&& mkdir mold/build \

View File

@@ -27,12 +27,10 @@ reqwest = { workspace = true, features = ["json"] }
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
tokio-postgres.workspace = true
tokio-util.workspace = true
tokio-stream.workspace = true
tracing.workspace = true
tracing-opentelemetry.workspace = true
tracing-subscriber.workspace = true
tracing-utils.workspace = true
thiserror.workspace = true
url.workspace = true
compute_api.workspace = true

View File

@@ -51,7 +51,6 @@ use tracing::{error, info, warn};
use url::Url;
use compute_api::responses::ComputeStatus;
use compute_api::spec::ComputeSpec;
use compute_tools::compute::{
forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID,
@@ -70,34 +69,6 @@ use compute_tools::swap::resize_swap;
const BUILD_TAG_DEFAULT: &str = "latest";
fn main() -> Result<()> {
let (build_tag, clap_args) = init()?;
let (pg_handle, start_pg_result) = {
// Enter startup tracing context
let _startup_context_guard = startup_context_from_env();
let cli_args = process_cli(&clap_args)?;
let cli_spec = try_spec_from_cli(&clap_args, &cli_args)?;
let wait_spec_result = wait_spec(build_tag, cli_args, cli_spec)?;
start_postgres(&clap_args, wait_spec_result)?
// Startup is finished, exit the startup tracing span
};
// PostgreSQL is now running, if startup was successful. Wait until it exits.
let wait_pg_result = wait_postgres(pg_handle)?;
let delay_exit = cleanup_after_postgres_exit(start_pg_result)?;
maybe_delay_exit(delay_exit);
deinit_and_exit(wait_pg_result);
}
fn init() -> Result<(String, clap::ArgMatches)> {
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
@@ -112,15 +83,9 @@ fn init() -> Result<(String, clap::ArgMatches)> {
.to_string();
info!("build_tag: {build_tag}");
Ok((build_tag, cli().get_matches()))
}
fn process_cli(matches: &clap::ArgMatches) -> Result<ProcessCliResult> {
let pgbin_default = "postgres";
let pgbin = matches
.get_one::<String>("pgbin")
.map(|s| s.as_str())
.unwrap_or(pgbin_default);
let matches = cli().get_matches();
let pgbin_default = String::from("postgres");
let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);
let ext_remote_storage = matches
.get_one::<String>("remote-ext-config")
@@ -148,30 +113,6 @@ fn process_cli(matches: &clap::ArgMatches) -> Result<ProcessCliResult> {
let spec_path = matches.get_one::<String>("spec-path");
let resize_swap_on_bind = matches.get_flag("resize-swap-on-bind");
Ok(ProcessCliResult {
connstr,
pgdata,
pgbin,
ext_remote_storage,
http_port,
spec_json,
spec_path,
resize_swap_on_bind,
})
}
struct ProcessCliResult<'clap> {
connstr: &'clap str,
pgdata: &'clap str,
pgbin: &'clap str,
ext_remote_storage: Option<&'clap str>,
http_port: u16,
spec_json: Option<&'clap String>,
spec_path: Option<&'clap String>,
resize_swap_on_bind: bool,
}
fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
// Extract OpenTelemetry context for the startup actions from the
// TRACEPARENT and TRACESTATE env variables, and attach it to the current
// tracing context.
@@ -208,7 +149,7 @@ fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
if let Ok(val) = std::env::var("TRACESTATE") {
startup_tracing_carrier.insert("tracestate".to_string(), val);
}
if !startup_tracing_carrier.is_empty() {
let startup_context_guard = if !startup_tracing_carrier.is_empty() {
use opentelemetry::propagation::TextMapPropagator;
use opentelemetry::sdk::propagation::TraceContextPropagator;
let guard = TraceContextPropagator::new()
@@ -218,17 +159,8 @@ fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
Some(guard)
} else {
None
}
}
};
fn try_spec_from_cli(
matches: &clap::ArgMatches,
ProcessCliResult {
spec_json,
spec_path,
..
}: &ProcessCliResult,
) -> Result<CliSpecParams> {
let compute_id = matches.get_one::<String>("compute-id");
let control_plane_uri = matches.get_one::<String>("control-plane-uri");
@@ -269,34 +201,6 @@ fn try_spec_from_cli(
}
};
Ok(CliSpecParams {
spec,
live_config_allowed,
})
}
struct CliSpecParams {
/// If a spec was provided via CLI or file, the [`ComputeSpec`]
spec: Option<ComputeSpec>,
live_config_allowed: bool,
}
fn wait_spec(
build_tag: String,
ProcessCliResult {
connstr,
pgdata,
pgbin,
ext_remote_storage,
resize_swap_on_bind,
http_port,
..
}: ProcessCliResult,
CliSpecParams {
spec,
live_config_allowed,
}: CliSpecParams,
) -> Result<WaitSpecResult> {
let mut new_state = ComputeState::new();
let spec_set;
@@ -335,6 +239,8 @@ fn wait_spec(
let _http_handle =
launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");
let extension_server_port: u16 = http_port;
if !spec_set {
// No spec provided, hang waiting for it.
info!("no compute spec provided, waiting");
@@ -363,29 +269,6 @@ fn wait_spec(
state.start_time = now;
}
Ok(WaitSpecResult {
compute,
http_port,
resize_swap_on_bind,
})
}
struct WaitSpecResult {
compute: Arc<ComputeNode>,
// passed through from ProcessCliResult
http_port: u16,
resize_swap_on_bind: bool,
}
fn start_postgres(
// need to allow unused because `matches` is only used if target_os = "linux"
#[allow(unused_variables)] matches: &clap::ArgMatches,
WaitSpecResult {
compute,
http_port,
resize_swap_on_bind,
}: WaitSpecResult,
) -> Result<(Option<PostgresHandle>, StartPostgresResult)> {
// We got all we need, update the state.
let mut state = compute.state.lock().unwrap();
state.status = ComputeStatus::Init;
@@ -435,10 +318,10 @@ fn start_postgres(
}
}
let extension_server_port: u16 = http_port;
// Start Postgres
let mut pg = None;
let mut exit_code = None;
if !prestartup_failed {
pg = match compute.start_compute(extension_server_port) {
Ok(pg) => Some(pg),
@@ -493,7 +376,7 @@ fn start_postgres(
// This token is used internally by the monitor to clean up all threads
let token = CancellationToken::new();
let vm_monitor = rt.as_ref().map(|rt| {
let vm_monitor = &rt.as_ref().map(|rt| {
rt.spawn(vm_monitor::start(
Box::leak(Box::new(vm_monitor::Args {
cgroup: cgroup.cloned(),
@@ -506,41 +389,12 @@ fn start_postgres(
}
}
Ok((
pg,
StartPostgresResult {
delay_exit,
compute,
#[cfg(target_os = "linux")]
rt,
#[cfg(target_os = "linux")]
token,
#[cfg(target_os = "linux")]
vm_monitor,
},
))
}
type PostgresHandle = (std::process::Child, std::thread::JoinHandle<()>);
struct StartPostgresResult {
delay_exit: bool,
// passed through from WaitSpecResult
compute: Arc<ComputeNode>,
#[cfg(target_os = "linux")]
rt: Option<tokio::runtime::Runtime>,
#[cfg(target_os = "linux")]
token: tokio_util::sync::CancellationToken,
#[cfg(target_os = "linux")]
vm_monitor: Option<tokio::task::JoinHandle<Result<()>>>,
}
fn wait_postgres(pg: Option<PostgresHandle>) -> Result<WaitPostgresResult> {
// Wait for the child Postgres process forever. In this state Ctrl+C will
// propagate to Postgres and it will be shut down as well.
let mut exit_code = None;
if let Some((mut pg, logs_handle)) = pg {
// Startup is finished, exit the startup tracing span
drop(startup_context_guard);
let ecode = pg
.wait()
.expect("failed to start waiting on Postgres process");
@@ -555,25 +409,6 @@ fn wait_postgres(pg: Option<PostgresHandle>) -> Result<WaitPostgresResult> {
exit_code = ecode.code()
}
Ok(WaitPostgresResult { exit_code })
}
struct WaitPostgresResult {
exit_code: Option<i32>,
}
fn cleanup_after_postgres_exit(
StartPostgresResult {
mut delay_exit,
compute,
#[cfg(target_os = "linux")]
vm_monitor,
#[cfg(target_os = "linux")]
token,
#[cfg(target_os = "linux")]
rt,
}: StartPostgresResult,
) -> Result<bool> {
// Terminate the vm_monitor so it releases the file watcher on
// /sys/fs/cgroup/neon-postgres.
// Note: the vm-monitor only runs on linux because it requires cgroups.
@@ -615,19 +450,13 @@ fn cleanup_after_postgres_exit(
error!("error while checking for core dumps: {err:?}");
}
Ok(delay_exit)
}
fn maybe_delay_exit(delay_exit: bool) {
// If launch failed, keep serving HTTP requests for a while, so the cloud
// control plane can get the actual error.
if delay_exit {
info!("giving control plane 30s to collect the error before shutdown");
thread::sleep(Duration::from_secs(30));
}
}
fn deinit_and_exit(WaitPostgresResult { exit_code }: WaitPostgresResult) -> ! {
// Shutdown trace pipeline gracefully, so that it has a chance to send any
// pending traces before we exit. Shutting down OTEL tracing provider may
// hang for quite some time, see, for example:

View File

@@ -1,116 +0,0 @@
use compute_api::{
responses::CatalogObjects,
spec::{Database, Role},
};
use futures::Stream;
use postgres::{Client, NoTls};
use std::{path::Path, process::Stdio, result::Result, sync::Arc};
use tokio::{
io::{AsyncBufReadExt, BufReader},
process::Command,
task,
};
use tokio_stream::{self as stream, StreamExt};
use tokio_util::codec::{BytesCodec, FramedRead};
use tracing::warn;
use crate::{
compute::ComputeNode,
pg_helpers::{get_existing_dbs, get_existing_roles},
};
pub async fn get_dbs_and_roles(compute: &Arc<ComputeNode>) -> anyhow::Result<CatalogObjects> {
let connstr = compute.connstr.clone();
task::spawn_blocking(move || {
let mut client = Client::connect(connstr.as_str(), NoTls)?;
let roles: Vec<Role>;
{
let mut xact = client.transaction()?;
roles = get_existing_roles(&mut xact)?;
}
let databases: Vec<Database> = get_existing_dbs(&mut client)?.values().cloned().collect();
Ok(CatalogObjects { roles, databases })
})
.await?
}
#[derive(Debug, thiserror::Error)]
pub enum SchemaDumpError {
#[error("Database does not exist.")]
DatabaseDoesNotExist,
#[error("Failed to execute pg_dump.")]
IO(#[from] std::io::Error),
}
// It uses the pg_dump utility to dump the schema of the specified database.
// The output is streamed back to the caller and supposed to be streamed via HTTP.
//
// Before return the result with the output, it checks that pg_dump produced any output.
// If not, it tries to parse the stderr output to determine if the database does not exist
// and special error is returned.
//
// To make sure that the process is killed when the caller drops the stream, we use tokio kill_on_drop feature.
pub async fn get_database_schema(
compute: &Arc<ComputeNode>,
dbname: &str,
) -> Result<impl Stream<Item = Result<bytes::Bytes, std::io::Error>>, SchemaDumpError> {
let pgbin = &compute.pgbin;
let basepath = Path::new(pgbin).parent().unwrap();
let pgdump = basepath.join("pg_dump");
let mut connstr = compute.connstr.clone();
connstr.set_path(dbname);
let mut cmd = Command::new(pgdump)
.arg("--schema-only")
.arg(connstr.as_str())
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.kill_on_drop(true)
.spawn()?;
let stdout = cmd.stdout.take().ok_or_else(|| {
std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stdout.")
})?;
let stderr = cmd.stderr.take().ok_or_else(|| {
std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stderr.")
})?;
let mut stdout_reader = FramedRead::new(stdout, BytesCodec::new());
let stderr_reader = BufReader::new(stderr);
let first_chunk = match stdout_reader.next().await {
Some(Ok(bytes)) if !bytes.is_empty() => bytes,
Some(Err(e)) => {
return Err(SchemaDumpError::IO(e));
}
_ => {
let mut lines = stderr_reader.lines();
if let Some(line) = lines.next_line().await? {
if line.contains(&format!("FATAL: database \"{}\" does not exist", dbname)) {
return Err(SchemaDumpError::DatabaseDoesNotExist);
}
warn!("pg_dump stderr: {}", line)
}
tokio::spawn(async move {
while let Ok(Some(line)) = lines.next_line().await {
warn!("pg_dump stderr: {}", line)
}
});
return Err(SchemaDumpError::IO(std::io::Error::new(
std::io::ErrorKind::Other,
"failed to start pg_dump",
)));
}
};
let initial_stream = stream::once(Ok(first_chunk.freeze()));
// Consume stderr and log warnings
tokio::spawn(async move {
let mut lines = stderr_reader.lines();
while let Ok(Some(line)) = lines.next_line().await {
warn!("pg_dump stderr: {}", line)
}
});
Ok(initial_stream.chain(stdout_reader.map(|res| res.map(|b| b.freeze()))))
}

View File

@@ -5,21 +5,17 @@ use std::net::SocketAddr;
use std::sync::Arc;
use std::thread;
use crate::catalog::SchemaDumpError;
use crate::catalog::{get_database_schema, get_dbs_and_roles};
use crate::compute::forward_termination_signal;
use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
use compute_api::requests::ConfigurationRequest;
use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError};
use anyhow::Result;
use hyper::header::CONTENT_TYPE;
use hyper::service::{make_service_fn, service_fn};
use hyper::{Body, Method, Request, Response, Server, StatusCode};
use tokio::task;
use tracing::{error, info, warn};
use tracing_utils::http::OtelName;
use utils::http::request::must_get_query_param;
fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
ComputeStatusResponse {
@@ -137,34 +133,6 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
}
}
(&Method::GET, "/dbs_and_roles") => {
info!("serving /dbs_and_roles GET request",);
match get_dbs_and_roles(compute).await {
Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())),
Err(_) => {
render_json_error("can't get dbs and roles", StatusCode::INTERNAL_SERVER_ERROR)
}
}
}
(&Method::GET, "/database_schema") => {
let database = match must_get_query_param(&req, "database") {
Err(e) => return e.into_response(),
Ok(database) => database,
};
info!("serving /database_schema GET request with database: {database}",);
match get_database_schema(compute, &database).await {
Ok(res) => render_plain(Body::wrap_stream(res)),
Err(SchemaDumpError::DatabaseDoesNotExist) => {
render_json_error("database does not exist", StatusCode::NOT_FOUND)
}
Err(e) => {
error!("can't get schema dump: {}", e);
render_json_error("can't get schema dump", StatusCode::INTERNAL_SERVER_ERROR)
}
}
}
// download extension files from remote extension storage on demand
(&Method::POST, route) if route.starts_with("/extension_server/") => {
info!("serving {:?} POST request", route);
@@ -335,25 +303,10 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
};
Response::builder()
.status(status)
.header(CONTENT_TYPE, "application/json")
.body(Body::from(serde_json::to_string(&error).unwrap()))
.unwrap()
}
fn render_json(body: Body) -> Response<Body> {
Response::builder()
.header(CONTENT_TYPE, "application/json")
.body(body)
.unwrap()
}
fn render_plain(body: Body) -> Response<Body> {
Response::builder()
.header(CONTENT_TYPE, "text/plain")
.body(body)
.unwrap()
}
async fn handle_terminate_request(compute: &Arc<ComputeNode>) -> Result<(), (String, StatusCode)> {
{
let mut state = compute.state.lock().unwrap();

View File

@@ -68,51 +68,6 @@ paths:
schema:
$ref: "#/components/schemas/Info"
/dbs_and_roles:
get:
tags:
- Info
summary: Get databases and roles in the catalog.
description: ""
operationId: getDbsAndRoles
responses:
200:
description: Compute schema objects
content:
application/json:
schema:
$ref: "#/components/schemas/DbsAndRoles"
/database_schema:
get:
tags:
- Info
summary: Get schema dump
parameters:
- name: database
in: query
description: Database name to dump.
required: true
schema:
type: string
example: "postgres"
description: Get schema dump in SQL format.
operationId: getDatabaseSchema
responses:
200:
description: Schema dump
content:
text/plain:
schema:
type: string
description: Schema dump in SQL format.
404:
description: Non existing database.
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
/check_writability:
post:
tags:
@@ -274,73 +229,6 @@ components:
num_cpus:
type: integer
DbsAndRoles:
type: object
description: Databases and Roles
required:
- roles
- databases
properties:
roles:
type: array
items:
$ref: "#/components/schemas/Role"
databases:
type: array
items:
$ref: "#/components/schemas/Database"
Database:
type: object
description: Database
required:
- name
- owner
- restrict_conn
- invalid
properties:
name:
type: string
owner:
type: string
options:
type: array
items:
$ref: "#/components/schemas/GenericOption"
restrict_conn:
type: boolean
invalid:
type: boolean
Role:
type: object
description: Role
required:
- name
properties:
name:
type: string
encrypted_password:
type: string
options:
type: array
items:
$ref: "#/components/schemas/GenericOption"
GenericOption:
type: object
description: Schema Generic option
required:
- name
- vartype
properties:
name:
type: string
value:
type: string
vartype:
type: string
ComputeState:
type: object
required:

View File

@@ -8,7 +8,6 @@ pub mod configurator;
pub mod http;
#[macro_use]
pub mod logger;
pub mod catalog;
pub mod compute;
pub mod extension_server;
pub mod monitor;

View File

@@ -9,11 +9,8 @@ use anyhow::{anyhow, bail, Context, Result};
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum};
use compute_api::spec::ComputeMode;
use control_plane::endpoint::ComputeControlPlane;
use control_plane::local_env::{
InitForceMode, LocalEnv, NeonBroker, NeonLocalInitConf, NeonLocalInitPageserverConf,
SafekeeperConf,
};
use control_plane::pageserver::PageServerNode;
use control_plane::local_env::{InitForceMode, LocalEnv};
use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
use control_plane::safekeeper::SafekeeperNode;
use control_plane::storage_controller::StorageController;
use control_plane::{broker, local_env};
@@ -55,6 +52,44 @@ const DEFAULT_PG_VERSION: &str = "15";
const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";
fn default_conf(num_pageservers: u16) -> String {
let mut template = format!(
r#"
# Default built-in configuration, defined in main.rs
control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}'
[broker]
listen_addr = '{DEFAULT_BROKER_ADDR}'
[[safekeepers]]
id = {DEFAULT_SAFEKEEPER_ID}
pg_port = {DEFAULT_SAFEKEEPER_PG_PORT}
http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT}
"#,
);
for i in 0..num_pageservers {
let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64);
let pg_port = DEFAULT_PAGESERVER_PG_PORT + i;
let http_port = DEFAULT_PAGESERVER_HTTP_PORT + i;
template += &format!(
r#"
[[pageservers]]
id = {pageserver_id}
listen_pg_addr = '127.0.0.1:{pg_port}'
listen_http_addr = '127.0.0.1:{http_port}'
pg_auth_type = '{trust_auth}'
http_auth_type = '{trust_auth}'
"#,
trust_auth = AuthType::Trust,
)
}
template
}
///
/// Timelines tree element used as a value in the HashMap.
///
@@ -117,7 +152,7 @@ fn main() -> Result<()> {
};
match subcommand_result {
Ok(Some(updated_env)) => updated_env.persist_config()?,
Ok(Some(updated_env)) => updated_env.persist_config(&updated_env.base_data_dir)?,
Ok(None) => (),
Err(e) => {
eprintln!("command failed: {e:?}");
@@ -306,65 +341,55 @@ fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TimelineId
}
fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
let num_pageservers = init_match.get_one::<u16>("num-pageservers");
let force = init_match.get_one("force").expect("we set a default value");
// Create the in-memory `LocalEnv` that we'd normally load from disk in `load_config`.
let init_conf: NeonLocalInitConf = if let Some(config_path) =
init_match.get_one::<PathBuf>("config")
{
// User (likely the Python test suite) provided a description of the environment.
if num_pageservers.is_some() {
bail!("Cannot specify both --num-pageservers and --config, use key `pageservers` in the --config file instead");
}
let num_pageservers = init_match
.get_one::<u16>("num-pageservers")
.expect("num-pageservers arg has a default");
// Create config file
let toml_file: String = if let Some(config_path) = init_match.get_one::<PathBuf>("config") {
// load and parse the file
let contents = std::fs::read_to_string(config_path).with_context(|| {
std::fs::read_to_string(config_path).with_context(|| {
format!(
"Could not read configuration file '{}'",
config_path.display()
)
})?;
toml_edit::de::from_str(&contents)?
})?
} else {
// User (likely interactive) did not provide a description of the environment, give them the default
NeonLocalInitConf {
control_plane_api: Some(Some(DEFAULT_PAGESERVER_CONTROL_PLANE_API.parse().unwrap())),
broker: NeonBroker {
listen_addr: DEFAULT_BROKER_ADDR.parse().unwrap(),
},
safekeepers: vec![SafekeeperConf {
id: DEFAULT_SAFEKEEPER_ID,
pg_port: DEFAULT_SAFEKEEPER_PG_PORT,
http_port: DEFAULT_SAFEKEEPER_HTTP_PORT,
..Default::default()
}],
pageservers: (0..num_pageservers.copied().unwrap_or(1))
.map(|i| {
let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64);
let pg_port = DEFAULT_PAGESERVER_PG_PORT + i;
let http_port = DEFAULT_PAGESERVER_HTTP_PORT + i;
NeonLocalInitPageserverConf {
id: pageserver_id,
listen_pg_addr: format!("127.0.0.1:{pg_port}"),
listen_http_addr: format!("127.0.0.1:{http_port}"),
pg_auth_type: AuthType::Trust,
http_auth_type: AuthType::Trust,
other: Default::default(),
}
})
.collect(),
pg_distrib_dir: None,
neon_distrib_dir: None,
default_tenant_id: TenantId::from_array(std::array::from_fn(|_| 0)),
storage_controller: None,
control_plane_compute_hook_api: None,
}
// Built-in default config
default_conf(*num_pageservers)
};
LocalEnv::init(init_conf, force)
.context("materialize initial neon_local environment on disk")?;
Ok(LocalEnv::load_config().expect("freshly written config should be loadable"))
let pageserver_config: toml_edit::Document =
if let Some(path) = init_match.get_one::<PathBuf>("pageserver-config") {
std::fs::read_to_string(path)?.parse()?
} else {
toml_edit::Document::new()
};
let pg_version = init_match
.get_one::<u32>("pg-version")
.copied()
.context("Failed to parse postgres version from the argument string")?;
let mut env =
LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?;
let force = init_match.get_one("force").expect("we set a default value");
env.init(pg_version, force)
.context("Failed to initialize neon repository")?;
// Create remote storage location for default LocalFs remote storage
std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?;
// Initialize pageserver, create initial tenant and timeline.
for ps_conf in &env.pageservers {
PageServerNode::from_env(&env, ps_conf)
.initialize(&pageserver_config)
.unwrap_or_else(|e| {
eprintln!("pageserver init failed: {e:?}");
exit(1);
});
}
Ok(env)
}
/// The default pageserver is the one where CLI tenant/timeline operations are sent by default.
@@ -1393,7 +1418,9 @@ fn cli() -> Command {
let num_pageservers_arg = Arg::new("num-pageservers")
.value_parser(value_parser!(u16))
.long("num-pageservers")
.help("How many pageservers to create (default 1)");
.help("How many pageservers to create (default 1)")
.required(false)
.default_value("1");
let update_catalog = Arg::new("update-catalog")
.value_parser(value_parser!(bool))
@@ -1427,6 +1454,14 @@ fn cli() -> Command {
.value_parser(value_parser!(PathBuf))
.value_name("config")
)
.arg(
Arg::new("pageserver-config")
.long("pageserver-config")
.required(false)
.value_parser(value_parser!(PathBuf))
.value_name("pageserver-config")
.help("Merge the provided pageserver config into the one generated by neon_local."),
)
.arg(pg_version_arg.clone())
.arg(force_arg)
)

View File

@@ -3,7 +3,7 @@
//! Now it also provides init method which acts like a stub for proper installation
//! script which will use local paths.
use anyhow::{bail, Context};
use anyhow::{bail, ensure, Context};
use clap::ValueEnum;
use postgres_backend::AuthType;
@@ -23,8 +23,6 @@ use utils::{
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
};
use crate::pageserver::PageServerNode;
use crate::pageserver::PAGESERVER_REMOTE_STORAGE_DIR;
use crate::safekeeper::SafekeeperNode;
pub const DEFAULT_PG_VERSION: u32 = 15;
@@ -36,7 +34,7 @@ pub const DEFAULT_PG_VERSION: u32 = 15;
// to 'neon_local init --config=<path>' option. See control_plane/simple.conf for
// an example.
//
#[derive(PartialEq, Eq, Clone, Debug)]
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
pub struct LocalEnv {
// Base directory for all the nodes (the pageserver, safekeepers and
// compute endpoints).
@@ -44,99 +42,59 @@ pub struct LocalEnv {
// This is not stored in the config file. Rather, this is the path where the
// config file itself is. It is read from the NEON_REPO_DIR env variable or
// '.neon' if not given.
#[serde(skip)]
pub base_data_dir: PathBuf,
// Path to postgres distribution. It's expected that "bin", "include",
// "lib", "share" from postgres distribution are there. If at some point
// in time we will be able to run against vanilla postgres we may split that
// to four separate paths and match OS-specific installation layout.
#[serde(default)]
pub pg_distrib_dir: PathBuf,
// Path to pageserver binary.
#[serde(default)]
pub neon_distrib_dir: PathBuf,
// Default tenant ID to use with the 'neon_local' command line utility, when
// --tenant_id is not explicitly specified.
#[serde(default)]
pub default_tenant_id: Option<TenantId>,
// used to issue tokens during e.g pg start
#[serde(default)]
pub private_key_path: PathBuf,
pub broker: NeonBroker,
// Configuration for the storage controller (1 per neon_local environment)
#[serde(default)]
pub storage_controller: NeonStorageControllerConf,
/// This Vec must always contain at least one pageserver
/// Populdated by [`Self::load_config`] from the individual `pageserver.toml`s.
/// NB: not used anymore except for informing users that they need to change their `.neon/config`.
pub pageservers: Vec<PageServerConf>,
#[serde(default)]
pub safekeepers: Vec<SafekeeperConf>,
// Control plane upcall API for pageserver: if None, we will not run storage_controller If set, this will
// be propagated into each pageserver's configuration.
#[serde(default)]
pub control_plane_api: Option<Url>,
// Control plane upcall API for storage controller. If set, this will be propagated into the
// storage controller's configuration.
#[serde(default)]
pub control_plane_compute_hook_api: Option<Url>,
/// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user.
#[serde(default)]
// A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
// but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
// https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
pub branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
}
/// On-disk state stored in `.neon/config`.
#[derive(PartialEq, Eq, Clone, Debug, Default, Serialize, Deserialize)]
#[serde(default, deny_unknown_fields)]
pub struct OnDiskConfig {
pub pg_distrib_dir: PathBuf,
pub neon_distrib_dir: PathBuf,
pub default_tenant_id: Option<TenantId>,
pub private_key_path: PathBuf,
pub broker: NeonBroker,
pub storage_controller: NeonStorageControllerConf,
#[serde(
skip_serializing,
deserialize_with = "fail_if_pageservers_field_specified"
)]
pub pageservers: Vec<PageServerConf>,
pub safekeepers: Vec<SafekeeperConf>,
pub control_plane_api: Option<Url>,
pub control_plane_compute_hook_api: Option<Url>,
branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
}
fn fail_if_pageservers_field_specified<'de, D>(_: D) -> Result<Vec<PageServerConf>, D::Error>
where
D: serde::Deserializer<'de>,
{
Err(serde::de::Error::custom(
"The 'pageservers' field is no longer used; pageserver.toml is now authoritative; \
Please remove the `pageservers` from your .neon/config.",
))
}
/// The description of the neon_local env to be initialized by `neon_local init --config`.
#[derive(Clone, Debug, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct NeonLocalInitConf {
// TODO: do we need this? Seems unused
pub pg_distrib_dir: Option<PathBuf>,
// TODO: do we need this? Seems unused
pub neon_distrib_dir: Option<PathBuf>,
pub default_tenant_id: TenantId,
pub broker: NeonBroker,
pub storage_controller: Option<NeonStorageControllerConf>,
pub pageservers: Vec<NeonLocalInitPageserverConf>,
pub safekeepers: Vec<SafekeeperConf>,
pub control_plane_api: Option<Option<Url>>,
pub control_plane_compute_hook_api: Option<Option<Url>>,
}
/// Broker config for cluster internal communication.
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
#[serde(default)]
@@ -152,9 +110,6 @@ pub struct NeonStorageControllerConf {
/// Heartbeat timeout before marking a node offline
#[serde(with = "humantime_serde")]
pub max_unavailable: Duration,
/// Threshold for auto-splitting a tenant into shards
pub split_threshold: Option<u64>,
}
impl NeonStorageControllerConf {
@@ -167,7 +122,6 @@ impl Default for NeonStorageControllerConf {
fn default() -> Self {
Self {
max_unavailable: Self::DEFAULT_MAX_UNAVAILABLE_INTERVAL,
split_threshold: None,
}
}
}
@@ -187,18 +141,24 @@ impl NeonBroker {
}
}
// neon_local needs to know this subset of pageserver configuration.
// For legacy reasons, this information is duplicated from `pageserver.toml` into `.neon/config`.
// It can get stale if `pageserver.toml` is changed.
// TODO(christian): don't store this at all in `.neon/config`, always load it from `pageserver.toml`
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
#[serde(default, deny_unknown_fields)]
pub struct PageServerConf {
// node id
pub id: NodeId,
// Pageserver connection settings
pub listen_pg_addr: String,
pub listen_http_addr: String,
// auth type used for the PG and HTTP ports
pub pg_auth_type: AuthType,
pub http_auth_type: AuthType,
pub(crate) virtual_file_io_engine: Option<String>,
pub(crate) get_vectored_impl: Option<String>,
pub(crate) get_impl: Option<String>,
pub(crate) validate_vectored_get: Option<bool>,
}
impl Default for PageServerConf {
@@ -209,40 +169,10 @@ impl Default for PageServerConf {
listen_http_addr: String::new(),
pg_auth_type: AuthType::Trust,
http_auth_type: AuthType::Trust,
}
}
}
/// The toml that can be passed to `neon_local init --config`.
/// This is a subset of the `pageserver.toml` configuration.
// TODO(christian): use pageserver_api::config::ConfigToml (PR #7656)
#[derive(Clone, Debug, serde::Deserialize, serde::Serialize)]
pub struct NeonLocalInitPageserverConf {
pub id: NodeId,
pub listen_pg_addr: String,
pub listen_http_addr: String,
pub pg_auth_type: AuthType,
pub http_auth_type: AuthType,
#[serde(flatten)]
pub other: HashMap<String, toml::Value>,
}
impl From<&NeonLocalInitPageserverConf> for PageServerConf {
fn from(conf: &NeonLocalInitPageserverConf) -> Self {
let NeonLocalInitPageserverConf {
id,
listen_pg_addr,
listen_http_addr,
pg_auth_type,
http_auth_type,
other: _,
} = conf;
Self {
id: *id,
listen_pg_addr: listen_pg_addr.clone(),
listen_http_addr: listen_http_addr.clone(),
pg_auth_type: *pg_auth_type,
http_auth_type: *http_auth_type,
virtual_file_io_engine: None,
get_vectored_impl: None,
get_impl: None,
validate_vectored_get: None,
}
}
}
@@ -430,7 +360,44 @@ impl LocalEnv {
.collect()
}
/// Construct `Self` from on-disk state.
/// Create a LocalEnv from a config file.
///
/// Unlike 'load_config', this function fills in any defaults that are missing
/// from the config file.
pub fn parse_config(toml: &str) -> anyhow::Result<Self> {
let mut env: LocalEnv = toml::from_str(toml)?;
// Find postgres binaries.
// Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install".
// Note that later in the code we assume, that distrib dirs follow the same pattern
// for all postgres versions.
if env.pg_distrib_dir == Path::new("") {
if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
env.pg_distrib_dir = postgres_bin.into();
} else {
let cwd = env::current_dir()?;
env.pg_distrib_dir = cwd.join("pg_install")
}
}
// Find neon binaries.
if env.neon_distrib_dir == Path::new("") {
env::current_exe()?
.parent()
.unwrap()
.clone_into(&mut env.neon_distrib_dir);
}
if env.pageservers.is_empty() {
anyhow::bail!("Configuration must contain at least one pageserver");
}
env.base_data_dir = base_path();
Ok(env)
}
/// Locate and load config
pub fn load_config() -> anyhow::Result<Self> {
let repopath = base_path();
@@ -444,129 +411,38 @@ impl LocalEnv {
// TODO: check that it looks like a neon repository
// load and parse file
let config_file_contents = fs::read_to_string(repopath.join("config"))?;
let on_disk_config: OnDiskConfig = toml::from_str(config_file_contents.as_str())?;
let mut env = {
let OnDiskConfig {
pg_distrib_dir,
neon_distrib_dir,
default_tenant_id,
private_key_path,
broker,
storage_controller,
pageservers,
safekeepers,
control_plane_api,
control_plane_compute_hook_api,
branch_name_mappings,
} = on_disk_config;
LocalEnv {
base_data_dir: repopath.clone(),
pg_distrib_dir,
neon_distrib_dir,
default_tenant_id,
private_key_path,
broker,
storage_controller,
pageservers,
safekeepers,
control_plane_api,
control_plane_compute_hook_api,
branch_name_mappings,
}
};
let config = fs::read_to_string(repopath.join("config"))?;
let mut env: LocalEnv = toml::from_str(config.as_str())?;
// The source of truth for pageserver configuration is the pageserver.toml.
assert!(
env.pageservers.is_empty(),
"we ensure this during deserialization"
);
env.pageservers = {
let iter = std::fs::read_dir(&repopath).context("open dir")?;
let mut pageservers = Vec::new();
for res in iter {
let dentry = res?;
const PREFIX: &str = "pageserver_";
let dentry_name = dentry
.file_name()
.into_string()
.ok()
.with_context(|| format!("non-utf8 dentry: {:?}", dentry.path()))
.unwrap();
if !dentry_name.starts_with(PREFIX) {
continue;
}
if !dentry.file_type().context("determine file type")?.is_dir() {
anyhow::bail!("expected a directory, got {:?}", dentry.path());
}
let id = dentry_name[PREFIX.len()..]
.parse::<NodeId>()
.with_context(|| format!("parse id from {:?}", dentry.path()))?;
// TODO(christian): use pageserver_api::config::ConfigToml (PR #7656)
#[derive(serde::Serialize, serde::Deserialize)]
// (allow unknown fields, unlike PageServerConf)
struct PageserverConfigTomlSubset {
id: NodeId,
listen_pg_addr: String,
listen_http_addr: String,
pg_auth_type: AuthType,
http_auth_type: AuthType,
}
let config_toml_path = dentry.path().join("pageserver.toml");
let config_toml: PageserverConfigTomlSubset = toml_edit::de::from_str(
&std::fs::read_to_string(&config_toml_path)
.with_context(|| format!("read {:?}", config_toml_path))?,
)
.context("parse pageserver.toml")?;
let PageserverConfigTomlSubset {
id: config_toml_id,
listen_pg_addr,
listen_http_addr,
pg_auth_type,
http_auth_type,
} = config_toml;
let conf = PageServerConf {
id: {
anyhow::ensure!(
config_toml_id == id,
"id mismatch: config_toml.id={config_toml_id} id={id}",
);
id
},
listen_pg_addr,
listen_http_addr,
pg_auth_type,
http_auth_type,
};
pageservers.push(conf);
}
pageservers
};
env.base_data_dir = repopath;
Ok(env)
}
pub fn persist_config(&self) -> anyhow::Result<()> {
Self::persist_config_impl(
&self.base_data_dir,
&OnDiskConfig {
pg_distrib_dir: self.pg_distrib_dir.clone(),
neon_distrib_dir: self.neon_distrib_dir.clone(),
default_tenant_id: self.default_tenant_id,
private_key_path: self.private_key_path.clone(),
broker: self.broker.clone(),
storage_controller: self.storage_controller.clone(),
pageservers: vec![], // it's skip_serializing anyway
safekeepers: self.safekeepers.clone(),
control_plane_api: self.control_plane_api.clone(),
control_plane_compute_hook_api: self.control_plane_compute_hook_api.clone(),
branch_name_mappings: self.branch_name_mappings.clone(),
},
)
}
pub fn persist_config(&self, base_path: &Path) -> anyhow::Result<()> {
// Currently, the user first passes a config file with 'neon_local init --config=<path>'
// We read that in, in `create_config`, and fill any missing defaults. Then it's saved
// to .neon/config. TODO: We lose any formatting and comments along the way, which is
// a bit sad.
let mut conf_content = r#"# This file describes a local deployment of the page server
# and safekeeeper node. It is read by the 'neon_local' command-line
# utility.
"#
.to_string();
// Convert the LocalEnv to a toml file.
//
// This could be as simple as this:
//
// conf_content += &toml::to_string_pretty(env)?;
//
// But it results in a "values must be emitted before tables". I'm not sure
// why, AFAICS the table, i.e. 'safekeepers: Vec<SafekeeperConf>' is last.
// Maybe rust reorders the fields to squeeze avoid padding or something?
// In any case, converting to toml::Value first, and serializing that, works.
// See https://github.com/alexcrichton/toml-rs/issues/142
conf_content += &toml::to_string_pretty(&toml::Value::try_from(self)?)?;
pub fn persist_config_impl(base_path: &Path, config: &OnDiskConfig) -> anyhow::Result<()> {
let conf_content = &toml::to_string_pretty(config)?;
let target_config_path = base_path.join("config");
fs::write(&target_config_path, conf_content).with_context(|| {
format!(
@@ -591,13 +467,17 @@ impl LocalEnv {
}
}
/// Materialize the [`NeonLocalInitConf`] to disk. Called during [`neon_local init`].
pub fn init(conf: NeonLocalInitConf, force: &InitForceMode) -> anyhow::Result<()> {
let base_path = base_path();
assert_ne!(base_path, Path::new(""));
let base_path = &base_path;
//
// Initialize a new Neon repository
//
pub fn init(&mut self, pg_version: u32, force: &InitForceMode) -> anyhow::Result<()> {
// check if config already exists
let base_path = &self.base_data_dir;
ensure!(
base_path != Path::new(""),
"repository base path is missing"
);
// create base_path dir
if base_path.exists() {
match force {
InitForceMode::MustNotExist => {
@@ -629,96 +509,70 @@ impl LocalEnv {
}
}
}
if !self.pg_bin_dir(pg_version)?.join("postgres").exists() {
bail!(
"Can't find postgres binary at {}",
self.pg_bin_dir(pg_version)?.display()
);
}
for binary in ["pageserver", "safekeeper"] {
if !self.neon_distrib_dir.join(binary).exists() {
bail!(
"Can't find binary '{binary}' in neon distrib dir '{}'",
self.neon_distrib_dir.display()
);
}
}
if !base_path.exists() {
fs::create_dir(base_path)?;
}
let NeonLocalInitConf {
pg_distrib_dir,
neon_distrib_dir,
default_tenant_id,
broker,
storage_controller,
pageservers,
safekeepers,
control_plane_api,
control_plane_compute_hook_api,
} = conf;
// Find postgres binaries.
// Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install".
// Note that later in the code we assume, that distrib dirs follow the same pattern
// for all postgres versions.
let pg_distrib_dir = pg_distrib_dir.unwrap_or_else(|| {
if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
postgres_bin.into()
} else {
let cwd = env::current_dir().unwrap();
cwd.join("pg_install")
}
});
// Find neon binaries.
let neon_distrib_dir = neon_distrib_dir
.unwrap_or_else(|| env::current_exe().unwrap().parent().unwrap().to_owned());
// Generate keypair for JWT.
//
// The keypair is only needed if authentication is enabled in any of the
// components. For convenience, we generate the keypair even if authentication
// is not enabled, so that you can easily enable it after the initialization
// step.
generate_auth_keys(
base_path.join("auth_private_key.pem").as_path(),
base_path.join("auth_public_key.pem").as_path(),
)
.context("generate auth keys")?;
let private_key_path = PathBuf::from("auth_private_key.pem");
// create the runtime type because the remaining initialization code below needs
// a LocalEnv instance op operation
// TODO: refactor to avoid this, LocalEnv should only be constructed from on-disk state
let env = LocalEnv {
base_data_dir: base_path.clone(),
pg_distrib_dir,
neon_distrib_dir,
default_tenant_id: Some(default_tenant_id),
private_key_path,
broker,
storage_controller: storage_controller.unwrap_or_default(),
pageservers: pageservers.iter().map(Into::into).collect(),
safekeepers,
control_plane_api: control_plane_api.unwrap_or_default(),
control_plane_compute_hook_api: control_plane_compute_hook_api.unwrap_or_default(),
branch_name_mappings: Default::default(),
};
// create endpoints dir
fs::create_dir_all(env.endpoints_path())?;
// create safekeeper dirs
for safekeeper in &env.safekeepers {
fs::create_dir_all(SafekeeperNode::datadir_path_by_id(&env, safekeeper.id))?;
// step. However, if the key generation fails, we treat it as non-fatal if
// authentication was not enabled.
if self.private_key_path == PathBuf::new() {
match generate_auth_keys(
base_path.join("auth_private_key.pem").as_path(),
base_path.join("auth_public_key.pem").as_path(),
) {
Ok(()) => {
self.private_key_path = PathBuf::from("auth_private_key.pem");
}
Err(e) => {
if !self.auth_keys_needed() {
eprintln!("Could not generate keypair for JWT authentication: {e}");
eprintln!("Continuing anyway because authentication was not enabled");
self.private_key_path = PathBuf::from("auth_private_key.pem");
} else {
return Err(e);
}
}
}
}
// initialize pageserver state
for (i, ps) in pageservers.into_iter().enumerate() {
let runtime_ps = &env.pageservers[i];
assert_eq!(&PageServerConf::from(&ps), runtime_ps);
fs::create_dir(env.pageserver_data_dir(ps.id))?;
PageServerNode::from_env(&env, runtime_ps)
.initialize(ps)
.context("pageserver init failed")?;
fs::create_dir_all(self.endpoints_path())?;
for safekeeper in &self.safekeepers {
fs::create_dir_all(SafekeeperNode::datadir_path_by_id(self, safekeeper.id))?;
}
// setup remote remote location for default LocalFs remote storage
std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?;
self.persist_config(base_path)
}
env.persist_config()
fn auth_keys_needed(&self) -> bool {
self.pageservers.iter().any(|ps| {
ps.pg_auth_type == AuthType::NeonJWT || ps.http_auth_type == AuthType::NeonJWT
}) || self.safekeepers.iter().any(|sk| sk.auth_enabled)
}
}
pub fn base_path() -> PathBuf {
fn base_path() -> PathBuf {
match std::env::var_os("NEON_REPO_DIR") {
Some(val) => PathBuf::from(val),
None => PathBuf::from(".neon"),
@@ -761,3 +615,31 @@ fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow
}
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn simple_conf_parsing() {
let simple_conf_toml = include_str!("../simple.conf");
let simple_conf_parse_result = LocalEnv::parse_config(simple_conf_toml);
assert!(
simple_conf_parse_result.is_ok(),
"failed to parse simple config {simple_conf_toml}, reason: {simple_conf_parse_result:?}"
);
let string_to_replace = "listen_addr = '127.0.0.1:50051'";
let spoiled_url_str = "listen_addr = '!@$XOXO%^&'";
let spoiled_url_toml = simple_conf_toml.replace(string_to_replace, spoiled_url_str);
assert!(
spoiled_url_toml.contains(spoiled_url_str),
"Failed to replace string {string_to_replace} in the toml file {simple_conf_toml}"
);
let spoiled_url_parse_result = LocalEnv::parse_config(&spoiled_url_toml);
assert!(
spoiled_url_parse_result.is_err(),
"expected toml with invalid Url {spoiled_url_toml} to fail the parsing, but got {spoiled_url_parse_result:?}"
);
}
}

View File

@@ -10,7 +10,7 @@ use std::io;
use std::io::Write;
use std::num::NonZeroU64;
use std::path::PathBuf;
use std::str::FromStr;
use std::process::Command;
use std::time::Duration;
use anyhow::{bail, Context};
@@ -30,7 +30,7 @@ use utils::{
lsn::Lsn,
};
use crate::local_env::{NeonLocalInitPageserverConf, PageServerConf};
use crate::local_env::PageServerConf;
use crate::{background_process, local_env::LocalEnv};
/// Directory within .neon which will be used by default for LocalFs remote storage.
@@ -74,23 +74,71 @@ impl PageServerNode {
}
}
fn pageserver_init_make_toml(
&self,
conf: NeonLocalInitPageserverConf,
) -> anyhow::Result<toml_edit::Document> {
assert_eq!(&PageServerConf::from(&conf), &self.conf, "during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully");
// TODO(christian): instead of what we do here, create a pageserver_api::config::ConfigToml (PR #7656)
/// Merge overrides provided by the user on the command line with our default overides derived from neon_local configuration.
///
/// These all end up on the command line of the `pageserver` binary.
fn neon_local_overrides(&self, cli_overrides: &toml_edit::Document) -> Vec<String> {
// FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc.
let pg_distrib_dir_param = format!(
"pg_distrib_dir='{}'",
self.env.pg_distrib_dir_raw().display()
);
let PageServerConf {
id,
listen_pg_addr,
listen_http_addr,
pg_auth_type,
http_auth_type,
virtual_file_io_engine,
get_vectored_impl,
get_impl,
validate_vectored_get,
} = &self.conf;
let id = format!("id={}", id);
let http_auth_type_param = format!("http_auth_type='{}'", http_auth_type);
let listen_http_addr_param = format!("listen_http_addr='{}'", listen_http_addr);
let pg_auth_type_param = format!("pg_auth_type='{}'", pg_auth_type);
let listen_pg_addr_param = format!("listen_pg_addr='{}'", listen_pg_addr);
let virtual_file_io_engine = if let Some(virtual_file_io_engine) = virtual_file_io_engine {
format!("virtual_file_io_engine='{virtual_file_io_engine}'")
} else {
String::new()
};
let get_vectored_impl = if let Some(get_vectored_impl) = get_vectored_impl {
format!("get_vectored_impl='{get_vectored_impl}'")
} else {
String::new()
};
let get_impl = if let Some(get_impl) = get_impl {
format!("get_impl='{get_impl}'")
} else {
String::new()
};
let validate_vectored_get = if let Some(validate_vectored_get) = validate_vectored_get {
format!("validate_vectored_get={validate_vectored_get}")
} else {
String::new()
};
let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());
let mut overrides = vec![pg_distrib_dir_param, broker_endpoint_param];
let mut overrides = vec![
id,
pg_distrib_dir_param,
http_auth_type_param,
pg_auth_type_param,
listen_http_addr_param,
listen_pg_addr_param,
broker_endpoint_param,
virtual_file_io_engine,
get_vectored_impl,
get_impl,
validate_vectored_get,
];
if let Some(control_plane_api) = &self.env.control_plane_api {
overrides.push(format!(
@@ -100,7 +148,7 @@ impl PageServerNode {
// Storage controller uses the same auth as pageserver: if JWT is enabled
// for us, we will also need it to talk to them.
if matches!(conf.http_auth_type, AuthType::NeonJWT) {
if matches!(http_auth_type, AuthType::NeonJWT) {
let jwt_token = self
.env
.generate_auth_token(&Claims::new(None, Scope::GenerationsApi))
@@ -109,40 +157,28 @@ impl PageServerNode {
}
}
if !conf.other.contains_key("remote_storage") {
if !cli_overrides.contains_key("remote_storage") {
overrides.push(format!(
"remote_storage={{local_path='../{PAGESERVER_REMOTE_STORAGE_DIR}'}}"
));
}
if conf.http_auth_type != AuthType::Trust || conf.pg_auth_type != AuthType::Trust {
if *http_auth_type != AuthType::Trust || *pg_auth_type != AuthType::Trust {
// Keys are generated in the toplevel repo dir, pageservers' workdirs
// are one level below that, so refer to keys with ../
overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
}
// Apply the user-provided overrides
overrides.push(
toml_edit::ser::to_string_pretty(&conf)
.expect("we deserialized this from toml earlier"),
);
overrides.push(cli_overrides.to_string());
// Turn `overrides` into a toml document.
// TODO: above code is legacy code, it should be refactored to use toml_edit directly.
let mut config_toml = toml_edit::Document::new();
for fragment_str in overrides {
let fragment = toml_edit::Document::from_str(&fragment_str)
.expect("all fragments in `overrides` are valid toml documents, this function controls that");
for (key, item) in fragment.iter() {
config_toml.insert(key, item.clone());
}
}
Ok(config_toml)
overrides
}
/// Initializes a pageserver node by creating its config with the overrides provided.
pub fn initialize(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> {
self.pageserver_init(conf)
pub fn initialize(&self, config_overrides: &toml_edit::Document) -> anyhow::Result<()> {
// First, run `pageserver --init` and wait for it to write a config into FS and exit.
self.pageserver_init(config_overrides)
.with_context(|| format!("Failed to run init for pageserver node {}", self.conf.id))
}
@@ -162,7 +198,7 @@ impl PageServerNode {
self.start_node().await
}
fn pageserver_init(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> {
fn pageserver_init(&self, config_overrides: &toml_edit::Document) -> anyhow::Result<()> {
let datadir = self.repo_path();
let node_id = self.conf.id;
println!(
@@ -173,20 +209,36 @@ impl PageServerNode {
);
io::stdout().flush()?;
let config = self
.pageserver_init_make_toml(conf)
.context("make pageserver toml")?;
let config_file_path = datadir.join("pageserver.toml");
let mut config_file = std::fs::OpenOptions::new()
.create_new(true)
.write(true)
.open(&config_file_path)
.with_context(|| format!("open pageserver toml for write: {config_file_path:?}"))?;
config_file
.write_all(config.to_string().as_bytes())
.context("write pageserver toml")?;
drop(config_file);
// TODO: invoke a TBD config-check command to validate that pageserver will start with the written config
if !datadir.exists() {
std::fs::create_dir(&datadir)?;
}
let datadir_path_str = datadir.to_str().with_context(|| {
format!("Cannot start pageserver node {node_id} in path that has no string representation: {datadir:?}")
})?;
// `pageserver --init` merges the `--config-override`s into a built-in default config,
// then writes out the merged product to `pageserver.toml`.
// TODO: just write the full `pageserver.toml` and get rid of `--config-override`.
let mut args = vec!["--init", "--workdir", datadir_path_str];
let overrides = self.neon_local_overrides(config_overrides);
for piece in &overrides {
args.push("--config-override");
args.push(piece);
}
let init_output = Command::new(self.env.pageserver_bin())
.args(args)
.envs(self.pageserver_env_variables()?)
.output()
.with_context(|| format!("Failed to run pageserver init for node {node_id}"))?;
anyhow::ensure!(
init_output.status.success(),
"Pageserver init for node {} did not finish successfully, stdout: {}, stderr: {}",
node_id,
String::from_utf8_lossy(&init_output.stdout),
String::from_utf8_lossy(&init_output.stderr),
);
// Write metadata file, used by pageserver on startup to register itself with
// the storage controller

View File

@@ -305,10 +305,6 @@ impl StorageController {
));
}
if let Some(split_threshold) = self.config.split_threshold.as_ref() {
args.push(format!("--split-threshold={split_threshold}"))
}
background_process::start_process(
COMMAND,
&self.env.base_data_dir,

View File

@@ -3,7 +3,7 @@
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize, Serializer};
use crate::spec::{ComputeSpec, Database, Role};
use crate::spec::ComputeSpec;
#[derive(Serialize, Debug, Deserialize)]
pub struct GenericAPIError {
@@ -113,12 +113,6 @@ pub struct ComputeMetrics {
pub total_ext_download_size: u64,
}
#[derive(Clone, Debug, Default, Serialize)]
pub struct CatalogObjects {
pub roles: Vec<Role>,
pub databases: Vec<Database>,
}
/// Response of the `/computes/{compute_id}/spec` control-plane API.
/// This is not actually a compute API response, so consider moving
/// to a different place.

View File

@@ -10,7 +10,6 @@ use std::{
io::{BufRead, Read},
num::{NonZeroU64, NonZeroUsize},
str::FromStr,
sync::atomic::AtomicUsize,
time::{Duration, SystemTime},
};
@@ -309,88 +308,13 @@ pub struct TenantConfig {
pub switch_aux_file_policy: Option<AuxFilePolicy>,
}
/// The policy for the aux file storage. It can be switched through `switch_aux_file_policy`
/// tenant config. When the first aux file written, the policy will be persisted in the
/// `index_part.json` file and has a limited migration path.
///
/// Currently, we only allow the following migration path:
///
/// Unset -> V1
/// -> V2
/// -> CrossValidation -> V2
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum AuxFilePolicy {
/// V1 aux file policy: store everything in AUX_FILE_KEY
V1,
/// V2 aux file policy: store in the AUX_FILE keyspace
V2,
/// Cross validation runs both formats on the write path and does validation
/// on the read path.
CrossValidation,
}
impl AuxFilePolicy {
pub fn is_valid_migration_path(from: Option<Self>, to: Self) -> bool {
matches!(
(from, to),
(None, _) | (Some(AuxFilePolicy::CrossValidation), AuxFilePolicy::V2)
)
}
/// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used.
pub fn default_tenant_config() -> Self {
Self::V1
}
}
/// The aux file policy memory flag. Users can store `Option<AuxFilePolicy>` into this atomic flag. 0 == unspecified.
pub struct AtomicAuxFilePolicy(AtomicUsize);
impl AtomicAuxFilePolicy {
pub fn new(policy: Option<AuxFilePolicy>) -> Self {
Self(AtomicUsize::new(
policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
))
}
pub fn load(&self) -> Option<AuxFilePolicy> {
match self.0.load(std::sync::atomic::Ordering::Acquire) {
0 => None,
other => Some(AuxFilePolicy::from_usize(other)),
}
}
pub fn store(&self, policy: Option<AuxFilePolicy>) {
self.0.store(
policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
std::sync::atomic::Ordering::Release,
);
}
}
impl AuxFilePolicy {
pub fn to_usize(self) -> usize {
match self {
Self::V1 => 1,
Self::CrossValidation => 2,
Self::V2 => 3,
}
}
pub fn try_from_usize(this: usize) -> Option<Self> {
match this {
1 => Some(Self::V1),
2 => Some(Self::CrossValidation),
3 => Some(Self::V2),
_ => None,
}
}
pub fn from_usize(this: usize) -> Self {
Self::try_from_usize(this).unwrap()
}
}
impl FromStr for AuxFilePolicy {
type Err = anyhow::Error;
@@ -680,9 +604,6 @@ pub struct TimelineInfo {
pub state: TimelineState,
pub walreceiver_status: String,
/// The last aux file policy being used on this timeline
pub last_aux_file_policy: Option<AuxFilePolicy>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
@@ -824,16 +745,6 @@ impl HistoricLayerInfo {
};
*field = value;
}
pub fn layer_file_size(&self) -> u64 {
match self {
HistoricLayerInfo::Delta {
layer_file_size, ..
} => *layer_file_size,
HistoricLayerInfo::Image {
layer_file_size, ..
} => *layer_file_size,
}
}
}
#[derive(Debug, Serialize, Deserialize)]
@@ -841,16 +752,6 @@ pub struct DownloadRemoteLayersTaskSpawnRequest {
pub max_concurrent_downloads: NonZeroUsize,
}
#[derive(Debug, Serialize, Deserialize)]
pub struct IngestAuxFilesRequest {
pub aux_files: HashMap<String, String>,
}
#[derive(Debug, Serialize, Deserialize)]
pub struct ListAuxFilesRequest {
pub lsn: Lsn,
}
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct DownloadRemoteLayersTaskInfo {
pub task_id: String,
@@ -875,6 +776,9 @@ pub struct TimelineGcRequest {
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WalRedoManagerProcessStatus {
pub pid: u32,
/// The strum-generated `into::<&'static str>()` for `pageserver::walredo::ProcessKind`.
/// `ProcessKind` are a transitory thing, so, they have no enum representation in `pageserver_api`.
pub kind: Cow<'static, str>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
@@ -913,55 +817,6 @@ pub struct TenantScanRemoteStorageResponse {
pub shards: Vec<TenantScanRemoteStorageShard>,
}
#[derive(Serialize, Deserialize, Debug, Clone)]
#[serde(rename_all = "snake_case")]
pub enum TenantSorting {
ResidentSize,
MaxLogicalSize,
}
impl Default for TenantSorting {
fn default() -> Self {
Self::ResidentSize
}
}
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct TopTenantShardsRequest {
// How would you like to sort the tenants?
pub order_by: TenantSorting,
// How many results?
pub limit: usize,
// Omit tenants with more than this many shards (e.g. if this is the max number of shards
// that the caller would ever split to)
pub where_shards_lt: Option<ShardCount>,
// Omit tenants where the ordering metric is less than this (this is an optimization to
// let us quickly exclude numerous tiny shards)
pub where_gt: Option<u64>,
}
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)]
pub struct TopTenantShardItem {
pub id: TenantShardId,
/// Total size of layers on local disk for all timelines in this tenant
pub resident_size: u64,
/// Total size of layers in remote storage for all timelines in this tenant
pub physical_size: u64,
/// The largest logical size of a timeline within this tenant
pub max_logical_size: u64,
}
#[derive(Serialize, Deserialize, Debug, Default)]
pub struct TopTenantShardsResponse {
pub shards: Vec<TopTenantShardItem>,
}
pub mod virtual_file {
#[derive(
Copy,
@@ -1594,59 +1449,4 @@ mod tests {
assert_eq!(actual, expected, "example on {line}");
}
}
#[test]
fn test_aux_file_migration_path() {
assert!(AuxFilePolicy::is_valid_migration_path(
None,
AuxFilePolicy::V1
));
assert!(AuxFilePolicy::is_valid_migration_path(
None,
AuxFilePolicy::V2
));
assert!(AuxFilePolicy::is_valid_migration_path(
None,
AuxFilePolicy::CrossValidation
));
// Self-migration is not a valid migration path, and the caller should handle it by itself.
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::V1),
AuxFilePolicy::V1
));
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::V2),
AuxFilePolicy::V2
));
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::CrossValidation),
AuxFilePolicy::CrossValidation
));
// Migrations not allowed
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::CrossValidation),
AuxFilePolicy::V1
));
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::V1),
AuxFilePolicy::V2
));
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::V2),
AuxFilePolicy::V1
));
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::V2),
AuxFilePolicy::CrossValidation
));
assert!(!AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::V1),
AuxFilePolicy::CrossValidation
));
// Migrations allowed
assert!(AuxFilePolicy::is_valid_migration_path(
Some(AuxFilePolicy::CrossValidation),
AuxFilePolicy::V2
));
}
}

View File

@@ -125,7 +125,7 @@ impl ShardCount {
/// `v` may be zero, or the number of shards in the tenant. `v` is what
/// [`Self::literal`] would return.
pub const fn new(val: u8) -> Self {
pub fn new(val: u8) -> Self {
Self(val)
}
}

View File

@@ -359,7 +359,7 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
// Is there enough space on the page for another logical message and an
// XLOG_SWITCH? If not, start over.
let page_remain = XLOG_BLCKSZ as u64 - u64::from(after_lsn) % XLOG_BLCKSZ as u64;
if page_remain < base_size + XLOG_SIZE_OF_XLOG_RECORD as u64 {
if page_remain < base_size - XLOG_SIZE_OF_XLOG_RECORD as u64 {
continue;
}

View File

@@ -29,7 +29,6 @@ use http_types::{StatusCode, Url};
use tokio_util::sync::CancellationToken;
use tracing::debug;
use crate::RemoteStorageActivity;
use crate::{
error::Cancelled, s3_bucket::RequestKind, AzureConfig, ConcurrencyLimiter, Download,
DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
@@ -526,10 +525,6 @@ impl RemoteStorage for AzureBlobStorage {
// https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview
Err(TimeTravelError::Unimplemented)
}
fn activity(&self) -> RemoteStorageActivity {
self.concurrency_limiter.activity()
}
}
pin_project_lite::pin_project! {

View File

@@ -263,17 +263,6 @@ pub trait RemoteStorage: Send + Sync + 'static {
done_if_after: SystemTime,
cancel: &CancellationToken,
) -> Result<(), TimeTravelError>;
/// Query how busy we currently are: may be used by callers which wish to politely
/// back off if there are already a lot of operations underway.
fn activity(&self) -> RemoteStorageActivity;
}
pub struct RemoteStorageActivity {
pub read_available: usize,
pub read_total: usize,
pub write_available: usize,
pub write_total: usize,
}
/// DownloadStream is sensitive to the timeout and cancellation used with the original
@@ -455,15 +444,6 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
}
}
}
pub fn activity(&self) -> RemoteStorageActivity {
match self {
Self::LocalFs(s) => s.activity(),
Self::AwsS3(s) => s.activity(),
Self::AzureBlob(s) => s.activity(),
Self::Unreliable(s) => s.activity(),
}
}
}
impl GenericRemoteStorage {
@@ -794,9 +774,6 @@ struct ConcurrencyLimiter {
// The helps to ensure we don't exceed the thresholds.
write: Arc<Semaphore>,
read: Arc<Semaphore>,
write_total: usize,
read_total: usize,
}
impl ConcurrencyLimiter {
@@ -825,21 +802,10 @@ impl ConcurrencyLimiter {
Arc::clone(self.for_kind(kind)).acquire_owned().await
}
fn activity(&self) -> RemoteStorageActivity {
RemoteStorageActivity {
read_available: self.read.available_permits(),
read_total: self.read_total,
write_available: self.write.available_permits(),
write_total: self.write_total,
}
}
fn new(limit: usize) -> ConcurrencyLimiter {
Self {
read: Arc::new(Semaphore::new(limit)),
write: Arc::new(Semaphore::new(limit)),
read_total: limit,
write_total: limit,
}
}
}

View File

@@ -23,8 +23,8 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken};
use utils::crashsafe::path_with_suffix_extension;
use crate::{
Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorageActivity,
TimeTravelError, TimeoutOrCancel, REMOTE_STORAGE_PREFIX_SEPARATOR,
Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel,
REMOTE_STORAGE_PREFIX_SEPARATOR,
};
use super::{RemoteStorage, StorageMetadata};
@@ -605,16 +605,6 @@ impl RemoteStorage for LocalFs {
) -> Result<(), TimeTravelError> {
Err(TimeTravelError::Unimplemented)
}
fn activity(&self) -> RemoteStorageActivity {
// LocalFS has no concurrency limiting: give callers the impression that plenty of units are available
RemoteStorageActivity {
read_available: 16,
read_total: 16,
write_available: 16,
write_total: 16,
}
}
}
fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {

View File

@@ -27,7 +27,7 @@ use aws_config::{
};
use aws_credential_types::provider::SharedCredentialsProvider;
use aws_sdk_s3::{
config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep},
config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep},
error::SdkError,
operation::get_object::GetObjectError,
types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass},
@@ -47,8 +47,8 @@ use utils::backoff;
use super::StorageMetadata;
use crate::{
error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError,
Listing, ListingMode, RemotePath, RemoteStorage, RemoteStorageActivity, S3Config,
TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
Listing, ListingMode, RemotePath, RemoteStorage, S3Config, TimeTravelError, TimeoutOrCancel,
MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
};
pub(super) mod metrics;
@@ -75,13 +75,13 @@ struct GetObjectRequest {
}
impl S3Bucket {
/// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
pub fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
pub fn new(aws_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
tracing::debug!(
"Creating s3 remote storage for S3 bucket {}",
remote_storage_config.bucket_name
aws_config.bucket_name
);
let region = Some(Region::new(remote_storage_config.bucket_region.clone()));
let region = Some(Region::new(aws_config.bucket_region.clone()));
let provider_conf = ProviderConfig::without_region().with_region(region.clone());
@@ -113,38 +113,6 @@ impl S3Bucket {
// AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off
let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
let sdk_config_loader: aws_config::ConfigLoader = aws_config::defaults(
#[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */
BehaviorVersion::v2023_11_09(),
)
.region(region)
.identity_cache(IdentityCache::lazy().build())
.credentials_provider(SharedCredentialsProvider::new(credentials_provider))
.sleep_impl(SharedAsyncSleep::from(sleep_impl));
let sdk_config: aws_config::SdkConfig = std::thread::scope(|s| {
s.spawn(|| {
// TODO: make this function async.
tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.unwrap()
.block_on(sdk_config_loader.load())
})
.join()
.unwrap()
});
let mut s3_config_builder = aws_sdk_s3::config::Builder::from(&sdk_config);
// Technically, the `remote_storage_config.endpoint` field only applies to S3 interactions.
// (In case we ever re-use the `sdk_config` for more than just the S3 client in the future)
if let Some(custom_endpoint) = remote_storage_config.endpoint.clone() {
s3_config_builder = s3_config_builder
.endpoint_url(custom_endpoint)
.force_path_style(true);
}
// We do our own retries (see [`backoff::retry`]). However, for the AWS SDK to enable rate limiting in response to throttling
// responses (e.g. 429 on too many ListObjectsv2 requests), we must provide a retry config. We set it to use at most one
// attempt, and enable 'Adaptive' mode, which causes rate limiting to be enabled.
@@ -152,36 +120,42 @@ impl S3Bucket {
retry_config
.set_max_attempts(Some(1))
.set_mode(Some(RetryMode::Adaptive));
s3_config_builder = s3_config_builder.retry_config(retry_config.build());
let s3_config = s3_config_builder.build();
let client = aws_sdk_s3::Client::from_conf(s3_config);
let mut config_builder = Builder::default()
.behavior_version(BehaviorVersion::v2023_11_09())
.region(region)
.identity_cache(IdentityCache::lazy().build())
.credentials_provider(SharedCredentialsProvider::new(credentials_provider))
.retry_config(retry_config.build())
.sleep_impl(SharedAsyncSleep::from(sleep_impl));
let prefix_in_bucket = remote_storage_config
.prefix_in_bucket
.as_deref()
.map(|prefix| {
let mut prefix = prefix;
while prefix.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
prefix = &prefix[1..]
}
if let Some(custom_endpoint) = aws_config.endpoint.clone() {
config_builder = config_builder
.endpoint_url(custom_endpoint)
.force_path_style(true);
}
let mut prefix = prefix.to_string();
while prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
prefix.pop();
}
prefix
});
let client = Client::from_conf(config_builder.build());
let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| {
let mut prefix = prefix;
while prefix.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
prefix = &prefix[1..]
}
let mut prefix = prefix.to_string();
while prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
prefix.pop();
}
prefix
});
Ok(Self {
client,
bucket_name: remote_storage_config.bucket_name.clone(),
max_keys_per_list_response: remote_storage_config.max_keys_per_list_response,
bucket_name: aws_config.bucket_name.clone(),
max_keys_per_list_response: aws_config.max_keys_per_list_response,
prefix_in_bucket,
concurrency_limiter: ConcurrencyLimiter::new(
remote_storage_config.concurrency_limit.get(),
),
upload_storage_class: remote_storage_config.upload_storage_class.clone(),
concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()),
upload_storage_class: aws_config.upload_storage_class.clone(),
timeout,
})
}
@@ -975,10 +949,6 @@ impl RemoteStorage for S3Bucket {
}
Ok(())
}
fn activity(&self) -> RemoteStorageActivity {
self.concurrency_limiter.activity()
}
}
/// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].

View File

@@ -12,7 +12,7 @@ use tokio_util::sync::CancellationToken;
use crate::{
Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage,
RemoteStorageActivity, StorageMetadata, TimeTravelError,
StorageMetadata, TimeTravelError,
};
pub struct UnreliableWrapper {
@@ -213,8 +213,4 @@ impl RemoteStorage for UnreliableWrapper {
.time_travel_recover(prefix, timestamp, done_if_after, cancel)
.await
}
fn activity(&self) -> RemoteStorageActivity {
self.inner.activity()
}
}

View File

@@ -3,7 +3,7 @@
//! # Example
//!
//! ```
//! # tokio::runtime::Builder::new_current_thread().enable_all().build().unwrap().block_on(async {
//! # tokio_test::block_on(async {
//! use utils::poison::Poison;
//! use std::time::Duration;
//!

View File

@@ -88,6 +88,7 @@ reqwest.workspace = true
rpds.workspace = true
enum-map.workspace = true
enumset = { workspace = true, features = ["serde"]}
sha2.workspace = true
strum.workspace = true
strum_macros.workspace = true

View File

@@ -1,7 +1,7 @@
use pageserver::keyspace::{KeyPartitioning, KeySpace};
use pageserver::repository::Key;
use pageserver::tenant::layer_map::LayerMap;
use pageserver::tenant::storage_layer::LayerName;
use pageserver::tenant::storage_layer::LayerFileName;
use pageserver::tenant::storage_layer::PersistentLayerDesc;
use pageserver_api::shard::TenantShardId;
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
@@ -28,7 +28,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
let mut updates = layer_map.batch_update();
for fname in filenames {
let fname = fname.unwrap();
let fname = LayerName::from_str(&fname).unwrap();
let fname = LayerFileName::from_str(&fname).unwrap();
let layer = PersistentLayerDesc::from(fname);
let lsn_range = layer.get_lsn_range();

View File

@@ -30,27 +30,47 @@
//! 2024-04-15 on i3en.3xlarge
//!
//! ```text
//! short/1 time: [24.584 µs 24.737 µs 24.922 µs]
//! short/2 time: [33.479 µs 33.660 µs 33.888 µs]
//! short/4 time: [42.713 µs 43.046 µs 43.440 µs]
//! short/8 time: [71.814 µs 72.478 µs 73.240 µs]
//! short/16 time: [132.73 µs 134.45 µs 136.22 µs]
//! short/32 time: [258.31 µs 260.73 µs 263.27 µs]
//! short/64 time: [511.61 µs 514.44 µs 517.51 µs]
//! short/128 time: [992.64 µs 998.23 µs 1.0042 ms]
//! medium/1 time: [110.11 µs 110.50 µs 110.96 µs]
//! medium/2 time: [153.06 µs 153.85 µs 154.99 µs]
//! medium/4 time: [317.51 µs 319.92 µs 322.85 µs]
//! medium/8 time: [638.30 µs 644.68 µs 652.12 µs]
//! medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms]
//! medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms]
//! medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms]
//! medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms]
//! async-short/1 time: [24.584 µs 24.737 µs 24.922 µs]
//! async-short/2 time: [33.479 µs 33.660 µs 33.888 µs]
//! async-short/4 time: [42.713 µs 43.046 µs 43.440 µs]
//! async-short/8 time: [71.814 µs 72.478 µs 73.240 µs]
//! async-short/16 time: [132.73 µs 134.45 µs 136.22 µs]
//! async-short/32 time: [258.31 µs 260.73 µs 263.27 µs]
//! async-short/64 time: [511.61 µs 514.44 µs 517.51 µs]
//! async-short/128 time: [992.64 µs 998.23 µs 1.0042 ms]
//! async-medium/1 time: [110.11 µs 110.50 µs 110.96 µs]
//! async-medium/2 time: [153.06 µs 153.85 µs 154.99 µs]
//! async-medium/4 time: [317.51 µs 319.92 µs 322.85 µs]
//! async-medium/8 time: [638.30 µs 644.68 µs 652.12 µs]
//! async-medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms]
//! async-medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms]
//! async-medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms]
//! async-medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms]
//! sync-short/1 time: [25.503 µs 25.626 µs 25.771 µs]
//! sync-short/2 time: [30.850 µs 31.013 µs 31.208 µs]
//! sync-short/4 time: [45.543 µs 45.856 µs 46.193 µs]
//! sync-short/8 time: [84.114 µs 84.639 µs 85.220 µs]
//! sync-short/16 time: [185.22 µs 186.15 µs 187.13 µs]
//! sync-short/32 time: [377.43 µs 378.87 µs 380.46 µs]
//! sync-short/64 time: [756.49 µs 759.04 µs 761.70 µs]
//! sync-short/128 time: [1.4825 ms 1.4874 ms 1.4923 ms]
//! sync-medium/1 time: [105.66 µs 106.01 µs 106.43 µs]
//! sync-medium/2 time: [153.10 µs 153.84 µs 154.72 µs]
//! sync-medium/4 time: [327.13 µs 329.44 µs 332.27 µs]
//! sync-medium/8 time: [654.26 µs 658.73 µs 663.63 µs]
//! sync-medium/16 time: [1.2682 ms 1.2748 ms 1.2816 ms]
//! sync-medium/32 time: [2.4456 ms 2.4595 ms 2.4731 ms]
//! sync-medium/64 time: [4.6523 ms 4.6890 ms 4.7256 ms]
//! sync-medium/128 time: [8.7215 ms 8.8323 ms 8.9344 ms]
//! ```
use bytes::{Buf, Bytes};
use criterion::{BenchmarkId, Criterion};
use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
use pageserver::{
config::PageServerConf,
walrecord::NeonWalRecord,
walredo::{PostgresRedoManager, ProcessKind},
};
use pageserver_api::{key::Key, shard::TenantShardId};
use std::{
sync::Arc,
@@ -60,32 +80,39 @@ use tokio::{sync::Barrier, task::JoinSet};
use utils::{id::TenantId, lsn::Lsn};
fn bench(c: &mut Criterion) {
{
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
for nclients in nclients {
let mut group = c.benchmark_group("short");
group.bench_with_input(
BenchmarkId::from_parameter(nclients),
&nclients,
|b, nclients| {
let redo_work = Arc::new(Request::short_input());
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
},
);
for process_kind in &[ProcessKind::Async, ProcessKind::Sync] {
{
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
for nclients in nclients {
let mut group = c.benchmark_group(format!("{process_kind}-short"));
group.bench_with_input(
BenchmarkId::from_parameter(nclients),
&nclients,
|b, nclients| {
let redo_work = Arc::new(Request::short_input());
b.iter_custom(|iters| {
bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients)
});
},
);
}
}
}
{
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
for nclients in nclients {
let mut group = c.benchmark_group("medium");
group.bench_with_input(
BenchmarkId::from_parameter(nclients),
&nclients,
|b, nclients| {
let redo_work = Arc::new(Request::medium_input());
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
},
);
{
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
for nclients in nclients {
let mut group = c.benchmark_group(format!("{process_kind}-medium"));
group.bench_with_input(
BenchmarkId::from_parameter(nclients),
&nclients,
|b, nclients| {
let redo_work = Arc::new(Request::medium_input());
b.iter_custom(|iters| {
bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients)
});
},
);
}
}
}
}
@@ -93,10 +120,16 @@ criterion::criterion_group!(benches, bench);
criterion::criterion_main!(benches);
// Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration {
fn bench_impl(
process_kind: ProcessKind,
redo_work: Arc<Request>,
n_redos: u64,
nclients: u64,
) -> Duration {
let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
let mut conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
conf.walredo_process_kind = process_kind;
let conf = Box::leak(Box::new(conf));
let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
@@ -125,13 +158,27 @@ fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration
});
}
rt.block_on(async move {
let elapsed = rt.block_on(async move {
let mut total_wallclock_time = Duration::ZERO;
while let Some(res) = tasks.join_next().await {
total_wallclock_time += res.unwrap();
}
total_wallclock_time
})
});
// consistency check to ensure process kind setting worked
if nredos_per_client > 0 {
assert_eq!(
manager
.status()
.process
.map(|p| p.kind)
.expect("the benchmark work causes a walredo process to be spawned"),
std::borrow::Cow::Borrowed(process_kind.into())
);
}
elapsed
}
async fn client(

View File

@@ -1,12 +1,8 @@
use std::collections::HashMap;
use bytes::Bytes;
use pageserver_api::{models::*, shard::TenantShardId};
use reqwest::{IntoUrl, Method, StatusCode};
use utils::{
http::error::HttpErrorBody,
id::{TenantId, TimelineId},
lsn::Lsn,
};
pub mod util;
@@ -490,18 +486,6 @@ impl Client {
.map_err(Error::ReceiveBody)
}
pub async fn top_tenant_shards(
&self,
request: TopTenantShardsRequest,
) -> Result<TopTenantShardsResponse> {
let uri = format!("{}/v1/top_tenants", self.mgmt_api_endpoint);
self.request(Method::POST, uri, request)
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
pub async fn layer_map_info(
&self,
tenant_shard_id: TenantShardId,
@@ -565,57 +549,4 @@ impl Client {
}),
}
}
pub async fn ingest_aux_files(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
aux_files: HashMap<String, String>,
) -> Result<bool> {
let uri = format!(
"{}/v1/tenant/{}/timeline/{}/ingest_aux_files",
self.mgmt_api_endpoint, tenant_shard_id, timeline_id
);
let resp = self
.request_noerror(Method::POST, &uri, IngestAuxFilesRequest { aux_files })
.await?;
match resp.status() {
StatusCode::OK => Ok(true),
status => Err(match resp.json::<HttpErrorBody>().await {
Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
Err(_) => {
Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri))
}
}),
}
}
pub async fn list_aux_files(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
lsn: Lsn,
) -> Result<HashMap<String, Bytes>> {
let uri = format!(
"{}/v1/tenant/{}/timeline/{}/list_aux_files",
self.mgmt_api_endpoint, tenant_shard_id, timeline_id
);
let resp = self
.request_noerror(Method::POST, &uri, ListAuxFilesRequest { lsn })
.await?;
match resp.status() {
StatusCode::OK => {
let resp: HashMap<String, Bytes> = resp.json().await.map_err(|e| {
Error::ApiError(StatusCode::INTERNAL_SERVER_ERROR, format!("{e}"))
})?;
Ok(resp)
}
status => Err(match resp.json::<HttpErrorBody>().await {
Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
Err(_) => {
Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri))
}
}),
}
}
}

View File

@@ -1,5 +1,4 @@
use clap::{Parser, Subcommand};
use pageserver_compaction::helpers::PAGE_SZ;
use pageserver_compaction::simulator::MockTimeline;
use rand::Rng;
use std::io::Write;
@@ -52,7 +51,7 @@ async fn simulate(cmd: &SimulateCmd, results_path: &Path) -> anyhow::Result<()>
let mut executor = MockTimeline::new();
// Convert the logical size in MB into a key range.
let key_range = 0..((cmd.logical_size * 1024 * 1024) / PAGE_SZ);
let key_range = 0..((cmd.logical_size * 1024 * 1024) / 8192);
//let key_range = u64::MIN..u64::MAX;
println!(
"starting simulation with key range {:016X}-{:016X}",

View File

@@ -24,9 +24,7 @@ use tracing::{debug, info};
use std::collections::{HashSet, VecDeque};
use std::ops::Range;
use crate::helpers::{
accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with, PAGE_SZ,
};
use crate::helpers::{accum_key_values, keyspace_total_size, merge_delta_keys, overlaps_with};
use crate::interface::*;
use utils::lsn::Lsn;
@@ -106,13 +104,7 @@ pub async fn compact_tiered<E: CompactionJobExecutor>(
ctx,
)
.await?;
if current_level_target_height == u64::MAX {
// our target height includes all possible lsns
info!(
level = current_level_no,
depth = depth,
"compaction loop reached max current_level_target_height"
);
if target_file_size == u64::MAX {
break;
}
current_level_no += 1;
@@ -379,7 +371,7 @@ where
.get_keyspace(&job.key_range, job.lsn_range.end, ctx)
.await?,
&self.shard_identity,
) * PAGE_SZ;
) * 8192;
let wal_size = job
.input_layers
@@ -441,7 +433,7 @@ where
let mut window = KeyspaceWindow::new(
E::Key::MIN..E::Key::MAX,
keyspace,
self.target_file_size / PAGE_SZ,
self.target_file_size / 8192,
);
while let Some(key_range) = window.choose_next_image(&self.shard_identity) {
new_jobs.push(CompactionJob::<E> {
@@ -530,6 +522,8 @@ where
// If we have accumulated only a narrow band of keyspace, create an
// image layer. Otherwise write a delta layer.
// FIXME: deal with the case of lots of values for same key
// FIXME: we are ignoring images here. Did we already divide the work
// so that we won't encounter them here?
@@ -541,100 +535,42 @@ where
}
}
// Open stream
let key_value_stream =
std::pin::pin!(merge_delta_keys_buffered::<E>(deltas.as_slice(), ctx)
.await?
.map(Result::<_, anyhow::Error>::Ok));
let key_value_stream = std::pin::pin!(merge_delta_keys::<E>(deltas.as_slice(), ctx));
let mut new_jobs = Vec::new();
// Slide a window through the keyspace
let mut key_accum =
std::pin::pin!(accum_key_values(key_value_stream, self.target_file_size));
let mut key_accum = std::pin::pin!(accum_key_values(key_value_stream));
let mut all_in_window: bool = false;
let mut window = Window::new();
// Helper function to create a job for a new delta layer with given key-lsn
// rectangle.
let create_delta_job = |key_range, lsn_range: &Range<Lsn>, new_jobs: &mut Vec<_>| {
// The inputs for the job are all the input layers of the original job that
// overlap with the rectangle.
let batch_layers: Vec<LayerId> = job
.input_layers
.iter()
.filter(|layer_id| {
overlaps_with(self.layers[layer_id.0].layer.key_range(), &key_range)
})
.cloned()
.collect();
assert!(!batch_layers.is_empty());
new_jobs.push(CompactionJob {
key_range,
lsn_range: lsn_range.clone(),
strategy: CompactionStrategy::CreateDelta,
input_layers: batch_layers,
completed: false,
});
};
loop {
if all_in_window && window.is_empty() {
if all_in_window && window.elems.is_empty() {
// All done!
break;
}
// If we now have enough keyspace for next delta layer in the window, create a
// new delta layer
if let Some(key_range) = window.choose_next_delta(self.target_file_size, !all_in_window)
{
create_delta_job(key_range, &job.lsn_range, &mut new_jobs);
continue;
}
assert!(!all_in_window);
// Process next key in the key space
match key_accum.next().await.transpose()? {
None => {
all_in_window = true;
}
Some(next_key) if next_key.partition_lsns.is_empty() => {
// Normal case: extend the window by the key
let batch_layers: Vec<LayerId> = job
.input_layers
.iter()
.filter(|layer_id| {
overlaps_with(self.layers[layer_id.0].layer.key_range(), &key_range)
})
.cloned()
.collect();
assert!(!batch_layers.is_empty());
new_jobs.push(CompactionJob {
key_range,
lsn_range: job.lsn_range.clone(),
strategy: CompactionStrategy::CreateDelta,
input_layers: batch_layers,
completed: false,
});
} else {
assert!(!all_in_window);
if let Some(next_key) = key_accum.next().await.transpose()? {
window.feed(next_key.key, next_key.size);
}
Some(next_key) => {
// A key with too large size impact for a single delta layer. This
// case occurs if you make a huge number of updates for a single key.
//
// Drain the window with has_more = false to make a clean cut before
// the key, and then make dedicated delta layers for the single key.
//
// We cannot cluster the key with the others, because we don't want
// layer files to overlap with each other in the lsn,key space (no
// overlaps for the rectangles).
let key = next_key.key;
debug!("key {key} with size impact larger than the layer size");
while !window.is_empty() {
let has_more = false;
let key_range = window.choose_next_delta(self.target_file_size, has_more)
.expect("with has_more==false, choose_next_delta always returns something for a non-empty Window");
create_delta_job(key_range, &job.lsn_range, &mut new_jobs);
}
// Not really required: but here for future resilience:
// We make a "gap" here, so any structure the window holds should
// probably be reset.
window = Window::new();
let mut prior_lsn = job.lsn_range.start;
let mut lsn_ranges = Vec::new();
for (lsn, _size) in next_key.partition_lsns.iter() {
lsn_ranges.push(prior_lsn..*lsn);
prior_lsn = *lsn;
}
lsn_ranges.push(prior_lsn..job.lsn_range.end);
for lsn_range in lsn_ranges {
let key_range = key..key.next();
create_delta_job(key_range, &lsn_range, &mut new_jobs);
}
} else {
all_in_window = true;
}
}
}
@@ -663,8 +599,8 @@ where
}
}
/// Sliding window through keyspace and values for image layer
/// This is used by [`LevelCompactionState::cover_with_images`] to decide on good split points
// Sliding window through keyspace and values
// This is used by over_with_images to decide on good split points
struct KeyspaceWindow<K> {
head: KeyspaceWindowHead<K>,
@@ -804,9 +740,9 @@ struct WindowElement<K> {
accum_size: u64,
}
/// Sliding window through keyspace and values for delta layer tiling
///
/// This is used to decide which delta layer to write next.
// Sliding window through keyspace and values
//
// This is used to decide what layer to write next, from the beginning of the window.
struct Window<K> {
elems: VecDeque<WindowElement<K>>,
@@ -830,13 +766,11 @@ where
fn feed(&mut self, key: K, size: u64) {
let last_size;
if let Some(last) = self.elems.back_mut() {
// We require the keys to be strictly increasing for the window.
// Keys should already have been deduplicated by `accum_key_values`
assert!(
last.last_key < key,
"last_key(={}) >= key(={key})",
last.last_key
);
assert!(last.last_key <= key);
if key == last.last_key {
last.accum_size += size;
return;
}
last_size = last.accum_size;
} else {
last_size = 0;
@@ -858,10 +792,6 @@ where
self.elems.front().unwrap().accum_size - self.splitoff_size
}
fn is_empty(&self) -> bool {
self.elems.is_empty()
}
fn commit_upto(&mut self, mut upto: usize) {
while upto > 1 {
let popped = self.elems.pop_front().unwrap();
@@ -924,7 +854,7 @@ where
// If we're willing to stretch it up to 1.25 target size, could we
// gobble up the rest of the work? This avoids creating very small
// "tail" layers at the end of the keyspace
if !has_more && self.remain_size() < target_size * 5 / 4 {
if !has_more && self.remain_size() < target_size * 5 / 3 {
self.commit_upto(self.elems.len());
} else {
let delta_split_at = self.find_size_split(target_size);

View File

@@ -9,14 +9,10 @@ use pageserver_api::shard::ShardIdentity;
use pin_project_lite::pin_project;
use std::collections::BinaryHeap;
use std::collections::VecDeque;
use std::fmt::Display;
use std::future::Future;
use std::ops::{DerefMut, Range};
use std::pin::Pin;
use std::task::{ready, Poll};
use utils::lsn::Lsn;
pub const PAGE_SZ: u64 = 8192;
pub fn keyspace_total_size<K>(
keyspace: &CompactionKeySpace<K>,
@@ -112,40 +108,17 @@ pub fn merge_delta_keys<'a, E: CompactionJobExecutor>(
}
}
pub async fn merge_delta_keys_buffered<'a, E: CompactionJobExecutor + 'a>(
layers: &'a [E::DeltaLayer],
ctx: &'a E::RequestContext,
) -> anyhow::Result<impl Stream<Item = <E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>>
{
let mut keys = Vec::new();
for l in layers {
// Boxing and casting to LoadFuture is required to obtain the right Sync bound.
// If we do l.load_keys(ctx).await? directly, there is a compilation error.
let load_future: LoadFuture<'a, _> = Box::pin(l.load_keys(ctx));
keys.extend(load_future.await?.into_iter());
}
keys.sort_by_key(|k| (k.key(), k.lsn()));
let stream = futures::stream::iter(keys.into_iter());
Ok(stream)
}
enum LazyLoadLayer<'a, E: CompactionJobExecutor> {
Loaded(VecDeque<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>),
Unloaded(&'a E::DeltaLayer),
}
impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> {
fn min_key(&self) -> E::Key {
fn key(&self) -> E::Key {
match self {
Self::Loaded(entries) => entries.front().unwrap().key(),
Self::Unloaded(dl) => dl.key_range().start,
}
}
fn min_lsn(&self) -> Lsn {
match self {
Self::Loaded(entries) => entries.front().unwrap().lsn(),
Self::Unloaded(dl) => dl.lsn_range().start,
}
}
}
impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
@@ -155,12 +128,12 @@ impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> {
impl<'a, E: CompactionJobExecutor> Ord for LazyLoadLayer<'a, E> {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
// reverse order so that we get a min-heap
(other.min_key(), other.min_lsn()).cmp(&(self.min_key(), self.min_lsn()))
other.key().cmp(&self.key())
}
}
impl<'a, E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'a, E> {
fn eq(&self, other: &Self) -> bool {
self.cmp(other) == std::cmp::Ordering::Equal
self.key().eq(&other.key())
}
}
impl<'a, E: CompactionJobExecutor> Eq for LazyLoadLayer<'a, E> {}
@@ -237,16 +210,11 @@ pub struct KeySize<K> {
pub key: K,
pub num_values: u64,
pub size: u64,
/// The lsns to partition at (if empty then no per-lsn partitioning)
pub partition_lsns: Vec<(Lsn, u64)>,
}
pub fn accum_key_values<'a, I, K, D, E>(
input: I,
target_size: u64,
) -> impl Stream<Item = Result<KeySize<K>, E>>
pub fn accum_key_values<'a, I, K, D, E>(input: I) -> impl Stream<Item = Result<KeySize<K>, E>>
where
K: Eq + PartialOrd + Display + Copy,
K: Eq,
I: Stream<Item = Result<D, E>>,
D: CompactionDeltaEntry<'a, K>,
{
@@ -256,39 +224,25 @@ where
if let Some(first) = input.next().await {
let first = first?;
let mut part_size = first.size();
let mut accum: KeySize<K> = KeySize {
key: first.key(),
num_values: 1,
size: part_size,
partition_lsns: Vec::new(),
size: first.size(),
};
let mut last_key = accum.key;
while let Some(this) = input.next().await {
let this = this?;
if this.key() == accum.key {
let add_size = this.size();
if part_size + add_size > target_size {
accum.partition_lsns.push((this.lsn(), part_size));
part_size = 0;
}
part_size += add_size;
accum.size += add_size;
accum.size += this.size();
accum.num_values += 1;
} else {
assert!(last_key <= accum.key, "last_key={last_key} <= accum.key={}", accum.key);
last_key = accum.key;
yield accum;
part_size = this.size();
accum = KeySize {
key: this.key(),
num_values: 1,
size: part_size,
partition_lsns: Vec::new(),
size: this.size(),
};
}
}
assert!(last_key <= accum.key, "last_key={last_key} <= accum.key={}", accum.key);
yield accum;
}
}

View File

@@ -184,12 +184,6 @@ impl<L> Level<L> {
}
let mut events: Vec<Event<K>> = Vec::new();
for (idx, l) in self.layers.iter().enumerate() {
let key_range = l.key_range();
if key_range.end == key_range.start.next() && l.is_delta() {
// Ignore single-key delta layers as they can be stacked on top of each other
// as that is the only way to cut further.
continue;
}
events.push(Event {
key: l.key_range().start,
layer_idx: idx,

View File

@@ -14,7 +14,6 @@ use std::ops::Range;
use std::sync::Arc;
use std::sync::Mutex;
use crate::helpers::PAGE_SZ;
use crate::helpers::{merge_delta_keys, overlaps_with};
use crate::interface;
@@ -510,7 +509,7 @@ impl interface::CompactionJobExecutor for MockTimeline {
let new_layer = Arc::new(MockImageLayer {
key_range: key_range.clone(),
lsn_range: lsn..lsn,
file_size: accum_size * PAGE_SZ,
file_size: accum_size * 8192,
deleted: Mutex::new(false),
});
info!(

View File

@@ -1,35 +1,23 @@
use once_cell::sync::OnceCell;
use pageserver_compaction::interface::CompactionLayer;
use pageserver_compaction::simulator::MockTimeline;
use utils::logging;
static LOG_HANDLE: OnceCell<()> = OnceCell::new();
pub(crate) fn setup_logging() {
LOG_HANDLE.get_or_init(|| {
logging::init(
logging::LogFormat::Test,
logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
logging::Output::Stdout,
)
.expect("Failed to init test logging")
});
}
/// Test the extreme case that there are so many updates for a single key that
/// even if we produce an extremely narrow delta layer, spanning just that one
/// key, we still too many records to fit in the target file size. We need to
/// split in the LSN dimension too in that case.
///
/// TODO: The code to avoid this problem has not been implemented yet! So the
/// assertion currently fails, but we need to make it not fail.
#[ignore]
#[tokio::test]
async fn test_many_updates_for_single_key() {
setup_logging();
let mut executor = MockTimeline::new();
executor.target_file_size = 1_000_000; // 1 MB
executor.target_file_size = 10_000_000; // 10 MB
// Ingest 10 MB of updates to a single key.
// Ingest 100 MB of updates to a single key.
for _ in 1..1000 {
executor.ingest_uniform(100, 10, &(0..100_000)).unwrap();
executor.ingest_uniform(1000, 10, &(0..1)).unwrap();
executor.ingest_uniform(10_000, 10, &(0..1)).unwrap();
executor.compact().await.unwrap();
}
@@ -39,32 +27,9 @@ async fn test_many_updates_for_single_key() {
}
for l in executor.live_layers.iter() {
assert!(l.file_size() < executor.target_file_size * 2);
// Sanity check that none of the delta layers are empty either.
// sanity check that none of the delta layers are stupidly small either
if l.is_delta() {
assert!(l.file_size() > 0);
assert!(l.file_size() > executor.target_file_size / 2);
}
}
}
#[tokio::test]
async fn test_simple_updates() {
setup_logging();
let mut executor = MockTimeline::new();
executor.target_file_size = 500_000; // 500 KB
// Ingest some traffic.
for _ in 1..400 {
executor.ingest_uniform(100, 500, &(0..100_000)).unwrap();
}
for l in executor.live_layers.iter() {
println!("layer {}: {}", l.short_id(), l.file_size());
}
println!("Running compaction...");
executor.compact().await.unwrap();
for l in executor.live_layers.iter() {
println!("layer {}: {}", l.short_id(), l.file_size());
}
}

View File

@@ -28,8 +28,6 @@
//! # From an `index_part.json` in S3
//! (jq -r '.layer_metadata | keys[]' | cargo run -p pagectl draw-timeline ) < index_part.json-00000016 > out.svg
//!
//! # enrich with lines for gc_cutoff and a child branch point
//! cat <(jq -r '.historic_layers[] | .layer_file_name' < layers.json) <(echo -e 'gc_cutoff:0000001CE3FE32C9\nbranch:0000001DE3FE32C9') | cargo run --bin pagectl draw-timeline >| out.svg
//! ```
//!
//! ## Viewing
@@ -50,8 +48,9 @@
//! ```
//!
use anyhow::{Context, Result};
use anyhow::Result;
use pageserver::repository::Key;
use pageserver::METADATA_FILE_NAME;
use std::cmp::Ordering;
use std::io::{self, BufRead};
use std::path::PathBuf;
@@ -82,11 +81,6 @@ fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
let split: Vec<&str> = name.split("__").collect();
let keys: Vec<&str> = split[0].split('-').collect();
let mut lsns: Vec<&str> = split[1].split('-').collect();
if lsns.last().expect("should").len() == 8 {
lsns.pop();
}
if lsns.len() == 1 {
lsns.push(lsns[0]);
}
@@ -96,33 +90,6 @@ fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
(keys, lsns)
}
#[derive(Clone, Copy)]
enum LineKind {
GcCutoff,
Branch,
}
impl From<LineKind> for Fill {
fn from(value: LineKind) -> Self {
match value {
LineKind::GcCutoff => Fill::Color(rgb(255, 0, 0)),
LineKind::Branch => Fill::Color(rgb(0, 255, 0)),
}
}
}
impl FromStr for LineKind {
type Err = anyhow::Error;
fn from_str(s: &str) -> std::prelude::v1::Result<Self, Self::Err> {
Ok(match s {
"gc_cutoff" => LineKind::GcCutoff,
"branch" => LineKind::Branch,
_ => anyhow::bail!("unsupported linekind: {s}"),
})
}
}
pub fn main() -> Result<()> {
// Parse layer filenames from stdin
struct Layer {
@@ -132,32 +99,15 @@ pub fn main() -> Result<()> {
}
let mut files: Vec<Layer> = vec![];
let stdin = io::stdin();
let mut lines: Vec<(Lsn, LineKind)> = vec![];
for (lineno, line) in stdin.lock().lines().enumerate() {
let lineno = lineno + 1;
for line in stdin.lock().lines() {
let line = line.unwrap();
if let Some((kind, lsn)) = line.split_once(':') {
let (kind, lsn) = LineKind::from_str(kind)
.context("parse kind")
.and_then(|kind| {
if lsn.contains('/') {
Lsn::from_str(lsn)
} else {
Lsn::from_hex(lsn)
}
.map(|lsn| (kind, lsn))
.context("parse lsn")
})
.with_context(|| format!("parse {line:?} on {lineno}"))?;
lines.push((lsn, kind));
continue;
}
let line = PathBuf::from_str(&line).unwrap();
let filename = line.file_name().unwrap();
let filename = filename.to_str().unwrap();
if filename == METADATA_FILE_NAME {
// Don't try and parse "metadata" like a key-lsn range
continue;
}
let (key_range, lsn_range) = parse_filename(filename);
files.push(Layer {
filename: filename.to_owned(),
@@ -167,9 +117,8 @@ pub fn main() -> Result<()> {
}
// Collect all coordinates
let mut keys: Vec<Key> = Vec::with_capacity(files.len());
let mut lsns: Vec<Lsn> = Vec::with_capacity(files.len() + lines.len());
let mut keys: Vec<Key> = vec![];
let mut lsns: Vec<Lsn> = vec![];
for Layer {
key_range: keyr,
lsn_range: lsnr,
@@ -182,8 +131,6 @@ pub fn main() -> Result<()> {
lsns.push(lsnr.end);
}
lsns.extend(lines.iter().map(|(lsn, _)| *lsn));
// Analyze
let key_map = build_coordinate_compression_map(keys);
let lsn_map = build_coordinate_compression_map(lsns);
@@ -197,13 +144,10 @@ pub fn main() -> Result<()> {
println!(
"{}",
BeginSvg {
w: (key_map.len() + 10) as f32,
w: key_map.len() as f32,
h: stretch * lsn_map.len() as f32
}
);
let xmargin = 0.05; // Height-dependent margin to disambiguate overlapping deltas
for Layer {
filename,
key_range: keyr,
@@ -225,6 +169,7 @@ pub fn main() -> Result<()> {
let mut lsn_diff = (lsn_end - lsn_start) as f32;
let mut fill = Fill::None;
let mut ymargin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas
let xmargin = 0.05; // Height-dependent margin to disambiguate overlapping deltas
let mut lsn_offset = 0.0;
// Fill in and thicken rectangle if it's an
@@ -244,7 +189,7 @@ pub fn main() -> Result<()> {
println!(
" {}",
rectangle(
5.0 + key_start as f32 + stretch * xmargin,
key_start as f32 + stretch * xmargin,
stretch * (lsn_max as f32 - (lsn_end as f32 - ymargin - lsn_offset)),
key_diff as f32 - stretch * 2.0 * xmargin,
stretch * (lsn_diff - 2.0 * ymargin)
@@ -255,26 +200,6 @@ pub fn main() -> Result<()> {
.comment(filename)
);
}
for (lsn, kind) in lines {
let lsn_start = *lsn_map.get(&lsn).unwrap();
let lsn_end = lsn_start;
let stretch = 2.0;
let lsn_diff = 0.3;
let lsn_offset = -lsn_diff / 2.0;
let ymargin = 0.05;
println!(
"{}",
rectangle(
0.0f32 + stretch * xmargin,
stretch * (lsn_map.len() as f32 - (lsn_end as f32 - ymargin - lsn_offset)),
(key_map.len() + 10) as f32,
stretch * (lsn_diff - 2.0 * ymargin)
)
.fill(kind)
);
}
println!("{}", EndSvg);
eprintln!("num_images: {}", num_images);

View File

@@ -3,7 +3,7 @@ use std::collections::HashMap;
use anyhow::Context;
use camino::Utf8PathBuf;
use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
use pageserver::tenant::storage_layer::LayerName;
use pageserver::tenant::storage_layer::LayerFileName;
use pageserver::tenant::{metadata::TimelineMetadata, IndexPart};
use utils::lsn::Lsn;
@@ -19,7 +19,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?;
#[derive(serde::Serialize)]
struct Output<'a> {
layer_metadata: &'a HashMap<LayerName, IndexLayerMetadata>,
layer_metadata: &'a HashMap<LayerFileName, IndexLayerMetadata>,
disk_consistent_lsn: Lsn,
timeline_metadata: &'a TimelineMetadata,
}

View File

@@ -100,7 +100,7 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
// Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH"
async fn get_holes(path: &Utf8Path, max_holes: usize, ctx: &RequestContext) -> Result<Vec<Hole>> {
let file = VirtualFile::open(path, ctx).await?;
let file = VirtualFile::open(path).await?;
let file_id = page_cache::next_file_id();
let block_reader = FileBlockReader::new(&file, file_id);
let summary_blk = block_reader.read_blk(0, ctx).await?;

View File

@@ -61,7 +61,7 @@ async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result
let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
page_cache::init(100);
let file = VirtualFile::open(path, ctx).await?;
let file = VirtualFile::open(path).await?;
let file_id = page_cache::next_file_id();
let block_reader = FileBlockReader::new(&file, file_id);
let summary_blk = block_reader.read_blk(0, ctx).await?;

View File

@@ -219,7 +219,6 @@ fn handle_metadata(
let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
println!("Current metadata:\n{meta:?}");
let mut update_meta = false;
// TODO: simplify this part
if let Some(disk_consistent_lsn) = disk_consistent_lsn {
meta = TimelineMetadata::new(
*disk_consistent_lsn,

View File

@@ -1,98 +0,0 @@
use pageserver_api::models::{AuxFilePolicy, TenantConfig, TenantConfigRequest};
use pageserver_api::shard::TenantShardId;
use utils::id::TenantTimelineId;
use utils::lsn::Lsn;
use std::collections::HashMap;
use std::sync::Arc;
/// Ingest aux files into the pageserver.
#[derive(clap::Parser)]
pub(crate) struct Args {
#[clap(long, default_value = "http://localhost:9898")]
mgmt_api_endpoint: String,
#[clap(long, default_value = "postgres://postgres@localhost:64000")]
page_service_connstring: String,
#[clap(long)]
pageserver_jwt: Option<String>,
targets: Option<Vec<TenantTimelineId>>,
}
pub(crate) fn main(args: Args) -> anyhow::Result<()> {
let rt = tokio::runtime::Builder::new_multi_thread()
.enable_all()
.build()
.unwrap();
let main_task = rt.spawn(main_impl(args));
rt.block_on(main_task).unwrap()
}
async fn main_impl(args: Args) -> anyhow::Result<()> {
let args: &'static Args = Box::leak(Box::new(args));
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
args.mgmt_api_endpoint.clone(),
args.pageserver_jwt.as_deref(),
));
// discover targets
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
&mgmt_api_client,
crate::util::cli::targets::Spec {
limit_to_first_n_targets: None,
targets: {
if let Some(targets) = &args.targets {
if targets.len() != 1 {
anyhow::bail!("must specify exactly one target");
}
Some(targets.clone())
} else {
None
}
},
},
)
.await?;
let timeline = timelines[0];
let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id);
let timeline_id = timeline.timeline_id;
println!("operating on timeline {}", timeline);
mgmt_api_client
.tenant_config(&TenantConfigRequest {
tenant_id: timeline.tenant_id,
config: TenantConfig {
switch_aux_file_policy: Some(AuxFilePolicy::V2),
..Default::default()
},
})
.await?;
for batch in 0..100 {
let items = (0..100)
.map(|id| {
(
format!("pg_logical/mappings/{:03}.{:03}", batch, id),
format!("{:08}", id),
)
})
.collect::<HashMap<_, _>>();
let file_cnt = items.len();
mgmt_api_client
.ingest_aux_files(tenant_shard_id, timeline_id, items)
.await?;
println!("ingested {file_cnt} files");
}
let files = mgmt_api_client
.list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1))
.await?;
println!("{} files found", files.len());
anyhow::Ok(())
}

View File

@@ -2,11 +2,9 @@ use pageserver_api::{models::HistoricLayerInfo, shard::TenantShardId};
use pageserver_client::mgmt_api;
use rand::seq::SliceRandom;
use tokio_util::sync::CancellationToken;
use tracing::{debug, info};
use utils::id::{TenantTimelineId, TimelineId};
use std::{f64, sync::Arc};
use tokio::{
sync::{mpsc, OwnedSemaphorePermit},
task::JoinSet,
@@ -14,7 +12,10 @@ use tokio::{
use std::{
num::NonZeroUsize,
sync::atomic::{AtomicU64, Ordering},
sync::{
atomic::{AtomicU64, Ordering},
Arc,
},
time::{Duration, Instant},
};
@@ -50,31 +51,19 @@ pub(crate) fn main(args: Args) -> anyhow::Result<()> {
Ok(())
}
#[derive(serde::Serialize)]
struct Output {
downloads_count: u64,
downloads_bytes: u64,
evictions_count: u64,
timeline_restarts: u64,
#[serde(with = "humantime_serde")]
runtime: Duration,
}
#[derive(Debug, Default)]
struct LiveStats {
evictions_count: AtomicU64,
downloads_count: AtomicU64,
downloads_bytes: AtomicU64,
evictions: AtomicU64,
downloads: AtomicU64,
timeline_restarts: AtomicU64,
}
impl LiveStats {
fn eviction_done(&self) {
self.evictions_count.fetch_add(1, Ordering::Relaxed);
self.evictions.fetch_add(1, Ordering::Relaxed);
}
fn download_done(&self, size: u64) {
self.downloads_count.fetch_add(1, Ordering::Relaxed);
self.downloads_bytes.fetch_add(size, Ordering::Relaxed);
fn download_done(&self) {
self.downloads.fetch_add(1, Ordering::Relaxed);
}
fn timeline_restart_done(&self) {
self.timeline_restarts.fetch_add(1, Ordering::Relaxed);
@@ -103,49 +92,28 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
)
.await?;
let token = CancellationToken::new();
let mut tasks = JoinSet::new();
let periodic_stats = Arc::new(LiveStats::default());
let total_stats = Arc::new(LiveStats::default());
let start = Instant::now();
let live_stats = Arc::new(LiveStats::default());
tasks.spawn({
let periodic_stats = Arc::clone(&periodic_stats);
let total_stats = Arc::clone(&total_stats);
let cloned_token = token.clone();
let live_stats = Arc::clone(&live_stats);
async move {
let mut last_at = Instant::now();
loop {
if cloned_token.is_cancelled() {
return;
}
tokio::time::sleep_until((last_at + Duration::from_secs(1)).into()).await;
let now = Instant::now();
let delta: Duration = now - last_at;
last_at = now;
let LiveStats {
evictions_count,
downloads_count,
downloads_bytes,
evictions,
downloads,
timeline_restarts,
} = &*periodic_stats;
let evictions_count = evictions_count.swap(0, Ordering::Relaxed);
let downloads_count = downloads_count.swap(0, Ordering::Relaxed);
let downloads_bytes = downloads_bytes.swap(0, Ordering::Relaxed);
} = &*live_stats;
let evictions = evictions.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64();
let downloads = downloads.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64();
let timeline_restarts = timeline_restarts.swap(0, Ordering::Relaxed);
total_stats.evictions_count.fetch_add(evictions_count, Ordering::Relaxed);
total_stats.downloads_count.fetch_add(downloads_count, Ordering::Relaxed);
total_stats.downloads_bytes.fetch_add(downloads_bytes, Ordering::Relaxed);
total_stats.timeline_restarts.fetch_add(timeline_restarts, Ordering::Relaxed);
let evictions_per_s = evictions_count as f64 / delta.as_secs_f64();
let downloads_per_s = downloads_count as f64 / delta.as_secs_f64();
let downloads_mibs_per_s = downloads_bytes as f64 / delta.as_secs_f64() / ((1 << 20) as f64);
info!("evictions={evictions_per_s:.2}/s downloads={downloads_per_s:.2}/s download_bytes={downloads_mibs_per_s:.2}MiB/s timeline_restarts={timeline_restarts}");
info!("evictions={evictions:.2}/s downloads={downloads:.2}/s timeline_restarts={timeline_restarts}");
}
}
});
@@ -156,42 +124,14 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
args,
Arc::clone(&mgmt_api_client),
tl,
Arc::clone(&periodic_stats),
token.clone(),
Arc::clone(&live_stats),
));
}
}
if let Some(runtime) = args.runtime {
tokio::spawn(async move {
tokio::time::sleep(runtime.into()).await;
token.cancel();
});
}
while let Some(res) = tasks.join_next().await {
res.unwrap();
}
let end = Instant::now();
let duration: Duration = end - start;
let output = {
let LiveStats {
evictions_count,
downloads_count,
downloads_bytes,
timeline_restarts,
} = &*total_stats;
Output {
downloads_count: downloads_count.load(Ordering::Relaxed),
downloads_bytes: downloads_bytes.load(Ordering::Relaxed),
evictions_count: evictions_count.load(Ordering::Relaxed),
timeline_restarts: timeline_restarts.load(Ordering::Relaxed),
runtime: duration,
}
};
let output = serde_json::to_string_pretty(&output).unwrap();
println!("{output}");
Ok(())
}
@@ -200,7 +140,6 @@ async fn timeline_actor(
mgmt_api_client: Arc<pageserver_client::mgmt_api::Client>,
timeline: TenantTimelineId,
live_stats: Arc<LiveStats>,
token: CancellationToken,
) {
// TODO: support sharding
let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id);
@@ -210,7 +149,7 @@ async fn timeline_actor(
layers: Vec<mpsc::Sender<OwnedSemaphorePermit>>,
concurrency: Arc<tokio::sync::Semaphore>,
}
while !token.is_cancelled() {
loop {
debug!("restarting timeline");
let layer_map_info = mgmt_api_client
.layer_map_info(tenant_shard_id, timeline.timeline_id)
@@ -246,7 +185,7 @@ async fn timeline_actor(
live_stats.timeline_restart_done();
while !token.is_cancelled() {
loop {
assert!(!timeline.joinset.is_empty());
if let Some(res) = timeline.joinset.try_join_next() {
debug!(?res, "a layer actor exited, should not happen");
@@ -316,7 +255,7 @@ async fn layer_actor(
.layer_ondemand_download(tenant_shard_id, timeline_id, layer.layer_file_name())
.await
.unwrap();
live_stats.download_done(layer.layer_file_size());
live_stats.download_done();
did_it
}
};

View File

@@ -14,7 +14,6 @@ mod util {
/// The pagebench CLI sub-commands, dispatched in [`main`] below.
mod cmd {
pub(super) mod aux_files;
pub(super) mod basebackup;
pub(super) mod getpage_latest_lsn;
pub(super) mod ondemand_download_churn;
@@ -28,7 +27,6 @@ enum Args {
GetPageLatestLsn(cmd::getpage_latest_lsn::Args),
TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
OndemandDownloadChurn(cmd::ondemand_download_churn::Args),
AuxFiles(cmd::aux_files::Args),
}
fn main() {
@@ -48,7 +46,6 @@ fn main() {
cmd::trigger_initial_size_calculation::main(args)
}
Args::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args),
Args::AuxFiles(args) => cmd::aux_files::main(args),
}
.unwrap()
}

View File

@@ -1,39 +1,20 @@
use std::sync::Arc;
use ::metrics::IntGauge;
use bytes::{Buf, BufMut, Bytes};
use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE};
use sha2::Digest;
use tracing::warn;
// BEGIN Copyright (c) 2017 Servo Contributors
/// Const version of FNV hash.
#[inline]
#[must_use]
pub const fn fnv_hash(bytes: &[u8]) -> u128 {
const INITIAL_STATE: u128 = 0x6c62272e07bb014262b821756295c58d;
const PRIME: u128 = 0x0000000001000000000000000000013B;
let mut hash = INITIAL_STATE;
let mut i = 0;
while i < bytes.len() {
hash ^= bytes[i] as u128;
hash = hash.wrapping_mul(PRIME);
i += 1;
}
hash
fn hash256(data: &[u8]) -> [u8; 32] {
sha2::Sha256::digest(data).into()
}
// END Copyright (c) 2017 Servo Contributors
/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, least significant 13B of FNV hash].
/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, first 13B of 128b sha256].
fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key {
let mut key: [u8; 16] = [0; METADATA_KEY_SIZE];
let hash = fnv_hash(data).to_be_bytes();
let mut key = [0; METADATA_KEY_SIZE];
let hash = hash256(data);
key[0] = AUX_KEY_PREFIX;
key[1] = dir_level1;
key[2] = dir_level2;
key[3..16].copy_from_slice(&hash[3..16]);
key[3..16].copy_from_slice(&hash[0..13]);
Key::from_metadata_key_fixed_size(&key)
}
@@ -164,55 +145,6 @@ pub fn encode_file_value(files: &[(&str, &[u8])]) -> anyhow::Result<Vec<u8>> {
Ok(encoded)
}
/// An estimation of the size of aux files.
pub struct AuxFileSizeEstimator {
aux_file_size_gauge: IntGauge,
size: Arc<std::sync::Mutex<Option<isize>>>,
}
impl AuxFileSizeEstimator {
pub fn new(aux_file_size_gauge: IntGauge) -> Self {
Self {
aux_file_size_gauge,
size: Arc::new(std::sync::Mutex::new(None)),
}
}
pub fn on_base_backup(&self, new_size: usize) {
let mut guard = self.size.lock().unwrap();
*guard = Some(new_size as isize);
self.report(new_size as isize);
}
pub fn on_add(&self, file_size: usize) {
let mut guard = self.size.lock().unwrap();
if let Some(size) = &mut *guard {
*size += file_size as isize;
self.report(*size);
}
}
pub fn on_remove(&self, file_size: usize) {
let mut guard = self.size.lock().unwrap();
if let Some(size) = &mut *guard {
*size -= file_size as isize;
self.report(*size);
}
}
pub fn on_update(&self, old_size: usize, new_size: usize) {
let mut guard = self.size.lock().unwrap();
if let Some(size) = &mut *guard {
*size += new_size as isize - old_size as isize;
self.report(*size);
}
}
pub fn report(&self, size: isize) {
self.aux_file_size_gauge.set(size as i64);
}
}
#[cfg(test)]
mod tests {
use super::*;
@@ -221,18 +153,17 @@ mod tests {
fn test_hash_portable() {
// AUX file encoding requires the hash to be portable across all platforms. This test case checks
// if the algorithm produces the same hash across different environments.
assert_eq!(
265160408618497461376862998434862070044,
super::fnv_hash("test1".as_bytes())
"1b4f0e9851971998e732078544c96b36c3d01cedf7caa332359d6f1d83567014",
hex::encode(hash256("test1".as_bytes()))
);
assert_eq!(
295486155126299629456360817749600553988,
super::fnv_hash("test/test2".as_bytes())
"3dfc364fd121aaa081cec54ef9ef6f69a4756df50cf3c52f1abd2b451829c4c0",
hex::encode(hash256("test/test2".as_bytes()))
);
assert_eq!(
144066263297769815596495629667062367629,
super::fnv_hash("".as_bytes())
"e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
hex::encode(hash256("".as_bytes()))
);
}
@@ -241,28 +172,28 @@ mod tests {
// To correct retrieve AUX files, the generated keys for the same file must be the same for all versions
// of the page server.
assert_eq!(
"62000001017F8B83D94F7081693471ABF91C",
encode_aux_file_key("pg_logical/mappings/test1").to_string(),
"62000001011B4F0E9851971998E732078544",
encode_aux_file_key("pg_logical/mappings/test1").to_string()
);
assert_eq!(
"62000001027F8E83D94F7081693471ABFCCD",
encode_aux_file_key("pg_logical/snapshots/test2").to_string(),
"620000010260303AE22B998861BCE3B28F33",
encode_aux_file_key("pg_logical/snapshots/test2").to_string()
);
assert_eq!(
"62000001032E07BB014262B821756295C58D",
encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string(),
"6200000103E3B0C44298FC1C149AFBF4C899",
encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string()
);
assert_eq!(
"62000001FF4F38E1C74754E7D03C1A660178",
encode_aux_file_key("pg_logical/unsupported").to_string(),
"62000001FF60945D49535300BE8E42108658",
encode_aux_file_key("pg_logical/unsupported").to_string()
);
assert_eq!(
"62000002017F8D83D94F7081693471ABFB92",
"6200000201FD61A03AF4F77D870FC21E05E7",
encode_aux_file_key("pg_replslot/test3").to_string()
);
assert_eq!(
"620000FFFF2B6ECC8AEF93F643DC44F15E03",
encode_aux_file_key("other_file_not_supported").to_string(),
"620000FFFF7F75AD39B73CD2A1FE41680550",
encode_aux_file_key("other_file_not_supported").to_string()
);
}

View File

@@ -601,7 +601,7 @@ where
// add zenith.signal file
let mut zenith_signal = String::new();
if self.prev_record_lsn == Lsn(0) {
if self.timeline.is_ancestor_lsn(self.lsn) {
if self.lsn == self.timeline.get_ancestor_lsn() {
write!(zenith_signal, "PREV LSN: none")
.map_err(|e| BasebackupError::Server(e.into()))?;
} else {

View File

@@ -284,6 +284,7 @@ fn start_pageserver(
))
.unwrap();
pageserver::preinitialize_metrics();
pageserver::metrics::wal_redo::set_process_kind_metric(conf.walredo_process_kind);
// If any failpoints were set from FAILPOINTS environment variable,
// print them to the log for debugging purposes
@@ -515,12 +516,16 @@ fn start_pageserver(
}
});
let secondary_controller = secondary::spawn_tasks(
tenant_manager.clone(),
remote_storage.clone(),
background_jobs_barrier.clone(),
shutdown_pageserver.clone(),
);
let secondary_controller = if let Some(remote_storage) = &remote_storage {
secondary::spawn_tasks(
tenant_manager.clone(),
remote_storage.clone(),
background_jobs_barrier.clone(),
shutdown_pageserver.clone(),
)
} else {
secondary::null_controller()
};
// shared state between the disk-usage backed eviction background task and the http endpoint
// that allows triggering disk-usage based eviction manually. note that the http endpoint
@@ -528,13 +533,15 @@ fn start_pageserver(
// been configured.
let disk_usage_eviction_state: Arc<disk_usage_eviction_task::State> = Arc::default();
launch_disk_usage_global_eviction_task(
conf,
remote_storage.clone(),
disk_usage_eviction_state.clone(),
tenant_manager.clone(),
background_jobs_barrier.clone(),
)?;
if let Some(remote_storage) = &remote_storage {
launch_disk_usage_global_eviction_task(
conf,
remote_storage.clone(),
disk_usage_eviction_state.clone(),
tenant_manager.clone(),
background_jobs_barrier.clone(),
)?;
}
// Start up the service to handle HTTP mgmt API request. We created the
// listener earlier already.
@@ -647,20 +654,17 @@ fn start_pageserver(
None,
"libpq endpoint listener",
true,
{
let tenant_manager = tenant_manager.clone();
async move {
page_service::libpq_listener_main(
tenant_manager,
broker_client,
pg_auth,
pageserver_listener,
conf.pg_auth_type,
libpq_ctx,
task_mgr::shutdown_token(),
)
.await
}
async move {
page_service::libpq_listener_main(
conf,
broker_client,
pg_auth,
pageserver_listener,
conf.pg_auth_type,
libpq_ctx,
task_mgr::shutdown_token(),
)
.await
},
);
}
@@ -689,7 +693,14 @@ fn start_pageserver(
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
// The plan is to change that over time.
shutdown_pageserver.take();
pageserver::shutdown_pageserver(&tenant_manager, deletion_queue.clone(), 0).await;
let bg_remote_storage = remote_storage.clone();
let bg_deletion_queue = deletion_queue.clone();
pageserver::shutdown_pageserver(
&tenant_manager,
bg_remote_storage.map(|_| bg_deletion_queue),
0,
)
.await;
unreachable!()
})
}
@@ -697,11 +708,12 @@ fn start_pageserver(
fn create_remote_storage_client(
conf: &'static PageServerConf,
) -> anyhow::Result<GenericRemoteStorage> {
) -> anyhow::Result<Option<GenericRemoteStorage>> {
let config = if let Some(config) = &conf.remote_storage_config {
config
} else {
anyhow::bail!("no remote storage configured, this is a deprecated configuration");
tracing::warn!("no remote storage configured, this is a deprecated configuration");
return Ok(None);
};
// Create the client
@@ -721,7 +733,7 @@ fn create_remote_storage_client(
GenericRemoteStorage::unreliable_wrapper(remote_storage, conf.test_remote_failures);
}
Ok(remote_storage)
Ok(Some(remote_storage))
}
fn cli() -> Command {

View File

@@ -99,7 +99,7 @@ pub mod defaults {
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "async";
pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "sync";
///
/// Default built-in configuration file.

View File

@@ -38,7 +38,7 @@ use deleter::DeleterMessage;
use list_writer::ListWriterQueueMessage;
use validator::ValidatorQueueMessage;
use crate::{config::PageServerConf, tenant::storage_layer::LayerName};
use crate::{config::PageServerConf, tenant::storage_layer::LayerFileName};
// TODO: configurable for how long to wait before executing deletions
@@ -479,7 +479,7 @@ impl DeletionQueueClient {
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
current_generation: Generation,
layers: Vec<(LayerName, LayerFileMetadata)>,
layers: Vec<(LayerFileName, LayerFileMetadata)>,
) -> Result<(), DeletionQueueError> {
if current_generation.is_none() {
debug!("Enqueuing deletions in legacy mode, skipping queue");
@@ -511,7 +511,7 @@ impl DeletionQueueClient {
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
current_generation: Generation,
layers: Vec<(LayerName, LayerFileMetadata)>,
layers: Vec<(LayerFileName, LayerFileMetadata)>,
) -> Result<(), DeletionQueueError> {
metrics::DELETION_QUEUE
.keys_submitted
@@ -632,7 +632,7 @@ impl DeletionQueue {
///
/// If remote_storage is None, then the returned workers will also be None.
pub fn new<C>(
remote_storage: GenericRemoteStorage,
remote_storage: Option<GenericRemoteStorage>,
control_plane_client: Option<C>,
conf: &'static PageServerConf,
) -> (Self, Option<DeletionQueueWorkers<C>>)
@@ -658,6 +658,23 @@ impl DeletionQueue {
// longer to flush after Tenants have all been torn down.
let cancel = CancellationToken::new();
let remote_storage = match remote_storage {
None => {
return (
Self {
client: DeletionQueueClient {
tx,
executor_tx,
lsn_table: lsn_table.clone(),
},
cancel,
},
None,
)
}
Some(r) => r,
};
(
Self {
client: DeletionQueueClient {
@@ -717,20 +734,20 @@ mod test {
use crate::{
control_plane_client::RetryForeverError,
repository::Key,
tenant::{harness::TenantHarness, storage_layer::DeltaLayerName},
tenant::{harness::TenantHarness, storage_layer::DeltaFileName},
};
use super::*;
pub const TIMELINE_ID: TimelineId =
TimelineId::from_array(hex!("11223344556677881122334455667788"));
pub const EXAMPLE_LAYER_NAME: LayerName = LayerName::Delta(DeltaLayerName {
pub const EXAMPLE_LAYER_NAME: LayerFileName = LayerFileName::Delta(DeltaFileName {
key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF),
lsn_range: Lsn(0x00000000016B59D8)..Lsn(0x00000000016B5A51),
});
// When you need a second layer in a test.
pub const EXAMPLE_LAYER_NAME_ALT: LayerName = LayerName::Delta(DeltaLayerName {
pub const EXAMPLE_LAYER_NAME_ALT: LayerFileName = LayerFileName::Delta(DeltaFileName {
key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF),
lsn_range: Lsn(0x00000000016B5A51)..Lsn(0x00000000016B5A61),
});
@@ -748,7 +765,7 @@ mod test {
/// Simulate a pageserver restart by destroying and recreating the deletion queue
async fn restart(&mut self) {
let (deletion_queue, workers) = DeletionQueue::new(
self.storage.clone(),
Some(self.storage.clone()),
Some(self.mock_control_plane.clone()),
self.harness.conf,
);
@@ -780,7 +797,7 @@ mod test {
/// Returns remote layer file name, suitable for use in assert_remote_files
fn write_remote_layer(
&self,
file_name: LayerName,
file_name: LayerFileName,
gen: Generation,
) -> anyhow::Result<String> {
let tenant_shard_id = self.harness.tenant_shard_id;
@@ -858,7 +875,7 @@ mod test {
let mock_control_plane = MockControlPlane::new();
let (deletion_queue, worker) = DeletionQueue::new(
storage.clone(),
Some(storage.clone()),
Some(mock_control_plane.clone()),
harness.conf,
);
@@ -935,7 +952,7 @@ mod test {
let client = ctx.deletion_queue.new_client();
client.recover(HashMap::new())?;
let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
let tenant_shard_id = ctx.harness.tenant_shard_id;
let content: Vec<u8> = "victim1 contents".into();

View File

@@ -34,7 +34,7 @@ use crate::deletion_queue::TEMP_SUFFIX;
use crate::metrics;
use crate::tenant::remote_timeline_client::remote_layer_path;
use crate::tenant::remote_timeline_client::LayerFileMetadata;
use crate::tenant::storage_layer::LayerName;
use crate::tenant::storage_layer::LayerFileName;
use crate::virtual_file::on_fatal_io_error;
use crate::virtual_file::MaybeFatalIo;
@@ -59,7 +59,7 @@ pub(super) struct DeletionOp {
// `layers` and `objects` are both just lists of objects. `layers` is used if you do not
// have a config object handy to project it to a remote key, and need the consuming worker
// to do it for you.
pub(super) layers: Vec<(LayerName, LayerFileMetadata)>,
pub(super) layers: Vec<(LayerFileName, LayerFileMetadata)>,
pub(super) objects: Vec<RemotePath>,
/// The _current_ generation of the Tenant shard attachment in which we are enqueuing

View File

@@ -64,7 +64,7 @@ use crate::{
mgr::TenantManager,
remote_timeline_client::LayerFileMetadata,
secondary::SecondaryTenant,
storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName},
storage_layer::{AsLayerDesc, EvictionError, Layer, LayerFileName},
},
};
@@ -535,11 +535,17 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
}
EvictionLayer::Secondary(layer) => {
let file_size = layer.metadata.file_size();
let tenant_manager = tenant_manager.clone();
js.spawn(async move {
layer
.secondary_tenant
.evict_layer(layer.timeline_id, layer.name)
.evict_layer(
tenant_manager.get_conf(),
layer.timeline_id,
layer.name,
layer.metadata,
)
.await;
Ok(file_size)
});
@@ -598,7 +604,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
pub(crate) struct EvictionSecondaryLayer {
pub(crate) secondary_tenant: Arc<SecondaryTenant>,
pub(crate) timeline_id: TimelineId,
pub(crate) name: LayerName,
pub(crate) name: LayerFileName,
pub(crate) metadata: LayerFileMetadata,
}
@@ -631,9 +637,9 @@ impl EvictionLayer {
}
}
pub(crate) fn get_name(&self) -> LayerName {
pub(crate) fn get_name(&self) -> LayerFileName {
match self {
Self::Attached(l) => l.layer_desc().layer_name(),
Self::Attached(l) => l.layer_desc().filename(),
Self::Secondary(sl) => sl.name.clone(),
}
}

View File

@@ -420,6 +420,25 @@ paths:
description: Tenant scheduled to load successfully
/v1/tenant/{tenant_id}/synthetic_size:
parameters:
- name: tenant_id
in: path
required: true
schema:
type: string
get:
description: |
Calculate tenant's synthetic size
responses:
"200":
description: Tenant's synthetic size
content:
application/json:
schema:
$ref: "#/components/schemas/SyntheticSizeResponse"
# This route has no handler. TODO: remove?
/v1/tenant/{tenant_id}/size:
parameters:
- name: tenant_id
in: path
@@ -449,9 +468,19 @@ paths:
content:
application/json:
schema:
$ref: "#/components/schemas/SyntheticSizeResponse"
text/html:
description: SVG representation of the tenant and it's timelines.
type: object
required:
- id
- size
properties:
id:
type: string
format: hex
size:
type: integer
nullable: true
description: |
Size metric in bytes or null if inputs_only=true was given.
"401":
description: Unauthorized Error
content:
@@ -900,9 +929,6 @@ components:
format: hex
size:
type: integer
nullable: true
description: |
Size metric in bytes or null if inputs_only=true was given.
segment_sizes:
type: array
items:

View File

@@ -1,8 +1,6 @@
//!
//! Management HTTP API
//!
use std::cmp::Reverse;
use std::collections::BinaryHeap;
use std::collections::HashMap;
use std::str::FromStr;
use std::sync::Arc;
@@ -16,8 +14,6 @@ use hyper::header;
use hyper::StatusCode;
use hyper::{Body, Request, Response, Uri};
use metrics::launch_timestamp::LaunchTimestamp;
use pageserver_api::models::IngestAuxFilesRequest;
use pageserver_api::models::ListAuxFilesRequest;
use pageserver_api::models::LocationConfig;
use pageserver_api::models::LocationConfigListResponse;
use pageserver_api::models::ShardParameters;
@@ -28,11 +24,7 @@ use pageserver_api::models::TenantScanRemoteStorageShard;
use pageserver_api::models::TenantShardLocation;
use pageserver_api::models::TenantShardSplitRequest;
use pageserver_api::models::TenantShardSplitResponse;
use pageserver_api::models::TenantSorting;
use pageserver_api::models::TenantState;
use pageserver_api::models::TopTenantShardItem;
use pageserver_api::models::TopTenantShardsRequest;
use pageserver_api::models::TopTenantShardsResponse;
use pageserver_api::models::{
DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
TenantLoadRequest, TenantLocationConfigRequest,
@@ -71,7 +63,7 @@ use crate::tenant::remote_timeline_client::list_remote_timelines;
use crate::tenant::secondary::SecondaryController;
use crate::tenant::size::ModelInputs;
use crate::tenant::storage_layer::LayerAccessStatsReset;
use crate::tenant::storage_layer::LayerName;
use crate::tenant::storage_layer::LayerFileName;
use crate::tenant::timeline::CompactFlags;
use crate::tenant::timeline::Timeline;
use crate::tenant::SpawnMode;
@@ -112,7 +104,7 @@ pub struct State {
tenant_manager: Arc<TenantManager>,
auth: Option<Arc<SwappableJwtAuth>>,
allowlist_routes: Vec<Uri>,
remote_storage: GenericRemoteStorage,
remote_storage: Option<GenericRemoteStorage>,
broker_client: storage_broker::BrokerClientChannel,
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
deletion_queue_client: DeletionQueueClient,
@@ -126,7 +118,7 @@ impl State {
conf: &'static PageServerConf,
tenant_manager: Arc<TenantManager>,
auth: Option<Arc<SwappableJwtAuth>>,
remote_storage: GenericRemoteStorage,
remote_storage: Option<GenericRemoteStorage>,
broker_client: storage_broker::BrokerClientChannel,
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
deletion_queue_client: DeletionQueueClient,
@@ -441,8 +433,6 @@ async fn build_timeline_info_common(
state,
walreceiver_status,
last_aux_file_policy: timeline.last_aux_file_policy.load(),
};
Ok(info)
}
@@ -823,6 +813,12 @@ async fn tenant_attach_handler(
let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?;
if state.remote_storage.is_none() {
return Err(ApiError::BadRequest(anyhow!(
"attach_tenant is not possible because pageserver was configured without remote storage"
)));
}
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
let shard_params = ShardParameters::default();
let location_conf = LocationConf::attached_single(tenant_conf, generation, &shard_params);
@@ -1233,7 +1229,7 @@ async fn layer_download_handler(
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let layer_file_name = get_request_param(&request, "layer_file_name")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let layer_name = LayerName::from_str(layer_file_name)
let layer_name = LayerFileName::from_str(layer_file_name)
.map_err(|s| ApiError::BadRequest(anyhow::anyhow!(s)))?;
let state = get_state(&request);
@@ -1265,7 +1261,7 @@ async fn evict_timeline_layer_handler(
let layer_file_name = get_request_param(&request, "layer_file_name")?;
let state = get_state(&request);
let layer_name = LayerName::from_str(layer_file_name)
let layer_name = LayerFileName::from_str(layer_file_name)
.map_err(|s| ApiError::BadRequest(anyhow::anyhow!(s)))?;
let timeline =
@@ -1647,6 +1643,12 @@ async fn tenant_time_travel_remote_storage_handler(
)));
}
let Some(storage) = state.remote_storage.as_ref() else {
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"remote storage not configured, cannot run time travel"
)));
};
if timestamp > done_if_after {
return Err(ApiError::BadRequest(anyhow!(
"The done_if_after timestamp comes before the timestamp to recover to"
@@ -1656,7 +1658,7 @@ async fn tenant_time_travel_remote_storage_handler(
tracing::info!("Issuing time travel request internally. timestamp={timestamp_raw}, done_if_after={done_if_after_raw}");
remote_timeline_client::upload::time_travel_recover_tenant(
&state.remote_storage,
storage,
&tenant_shard_id,
timestamp,
done_if_after,
@@ -1713,7 +1715,12 @@ async fn timeline_gc_handler(
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let gc_result = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?;
let wait_task_done = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx)?;
let gc_result = wait_task_done
.await
.context("wait for gc task")
.map_err(ApiError::InternalServerError)?
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, gc_result)
}
@@ -1901,6 +1908,11 @@ async fn deletion_queue_flush(
) -> Result<Response<Body>, ApiError> {
let state = get_state(&r);
if state.remote_storage.is_none() {
// Nothing to do if remote storage is disabled.
return json_response(StatusCode::OK, ());
}
let execute = parse_query_param(&r, "execute")?.unwrap_or(false);
let flush = async {
@@ -2065,11 +2077,18 @@ async fn disk_usage_eviction_run(
};
let state = get_state(&r);
let Some(storage) = state.remote_storage.as_ref() else {
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"remote storage not configured, cannot run eviction iteration"
)));
};
let eviction_state = state.disk_usage_eviction_state.clone();
let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
&eviction_state,
&state.remote_storage,
storage,
usage,
&state.tenant_manager,
config.eviction_order,
@@ -2106,23 +2125,29 @@ async fn tenant_scan_remote_handler(
let state = get_state(&request);
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let Some(remote_storage) = state.remote_storage.as_ref() else {
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Remote storage not configured"
)));
};
let mut response = TenantScanRemoteStorageResponse::default();
let (shards, _other_keys) =
list_remote_tenant_shards(&state.remote_storage, tenant_id, cancel.clone())
list_remote_tenant_shards(remote_storage, tenant_id, cancel.clone())
.await
.map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
for tenant_shard_id in shards {
let (timeline_ids, _other_keys) =
list_remote_timelines(&state.remote_storage, tenant_shard_id, cancel.clone())
list_remote_timelines(remote_storage, tenant_shard_id, cancel.clone())
.await
.map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
let mut generation = Generation::none();
for timeline_id in timeline_ids {
match download_index_part(
&state.remote_storage,
remote_storage,
&tenant_shard_id,
&timeline_id,
Generation::MAX,
@@ -2333,162 +2358,6 @@ async fn get_utilization(
.map_err(ApiError::InternalServerError)
}
async fn list_aux_files(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let body: ListAuxFilesRequest = json_request(&mut request).await?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let process = || async move {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let files = timeline.list_aux_files(body.lsn, &ctx).await?;
Ok::<_, anyhow::Error>(files)
};
match process().await {
Ok(st) => json_response(StatusCode::OK, st),
Err(err) => json_response(
StatusCode::INTERNAL_SERVER_ERROR,
ApiError::InternalServerError(err).to_string(),
),
}
}
async fn ingest_aux_files(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let body: IngestAuxFilesRequest = json_request(&mut request).await?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let process = || async move {
let mut modification = timeline.begin_modification(Lsn(
timeline.get_last_record_lsn().0 + 8
) /* advance LSN by 8 */);
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
for (fname, content) in body.aux_files {
modification
.put_file(&fname, content.as_bytes(), &ctx)
.await?;
}
modification.commit(&ctx).await?;
Ok::<_, anyhow::Error>(())
};
match process().await {
Ok(st) => json_response(StatusCode::OK, st),
Err(err) => Err(ApiError::InternalServerError(err)),
}
}
/// Report on the largest tenants on this pageserver, for the storage controller to identify
/// candidates for splitting
async fn post_top_tenants(
mut r: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
check_permission(&r, None)?;
let request: TopTenantShardsRequest = json_request(&mut r).await?;
let state = get_state(&r);
fn get_size_metric(sizes: &TopTenantShardItem, order_by: &TenantSorting) -> u64 {
match order_by {
TenantSorting::ResidentSize => sizes.resident_size,
TenantSorting::MaxLogicalSize => sizes.max_logical_size,
}
}
#[derive(Eq, PartialEq)]
struct HeapItem {
metric: u64,
sizes: TopTenantShardItem,
}
impl PartialOrd for HeapItem {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
/// Heap items have reverse ordering on their metric: this enables using BinaryHeap, which
/// supports popping the greatest item but not the smallest.
impl Ord for HeapItem {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
Reverse(self.metric).cmp(&Reverse(other.metric))
}
}
let mut top_n: BinaryHeap<HeapItem> = BinaryHeap::with_capacity(request.limit);
// FIXME: this is a lot of clones to take this tenant list
for (tenant_shard_id, tenant_slot) in state.tenant_manager.list() {
if let Some(shards_lt) = request.where_shards_lt {
// Ignore tenants which already have >= this many shards
if tenant_shard_id.shard_count >= shards_lt {
continue;
}
}
let sizes = match tenant_slot {
TenantSlot::Attached(tenant) => tenant.get_sizes(),
TenantSlot::Secondary(_) | TenantSlot::InProgress(_) => {
continue;
}
};
let metric = get_size_metric(&sizes, &request.order_by);
if let Some(gt) = request.where_gt {
// Ignore tenants whose metric is <= the lower size threshold, to do less sorting work
if metric <= gt {
continue;
}
};
match top_n.peek() {
None => {
// Top N list is empty: candidate becomes first member
top_n.push(HeapItem { metric, sizes });
}
Some(i) if i.metric > metric && top_n.len() < request.limit => {
// Lowest item in list is greater than our candidate, but we aren't at limit yet: push to end
top_n.push(HeapItem { metric, sizes });
}
Some(i) if i.metric > metric => {
// List is at limit and lowest value is greater than our candidate, drop it.
}
Some(_) => top_n.push(HeapItem { metric, sizes }),
}
while top_n.len() > request.limit {
top_n.pop();
}
}
json_response(
StatusCode::OK,
TopTenantShardsResponse {
shards: top_n.into_iter().map(|i| i.sizes).collect(),
},
)
}
/// Common functionality of all the HTTP API handlers.
///
/// - Adds a tracing span to each request (by `request_span`)
@@ -2775,14 +2644,5 @@ pub fn make_router(
)
.put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
.get("/v1/utilization", |r| api_handler(r, get_utilization))
.post(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files",
|r| testing_api_handler("ingest_aux_files", r, ingest_aux_files),
)
.post(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/list_aux_files",
|r| testing_api_handler("list_aux_files", r, list_aux_files),
)
.post("/v1/top_tenants", |r| api_handler(r, post_top_tenants))
.any(handler_404))
}

View File

@@ -57,7 +57,7 @@ pub use crate::metrics::preinitialize_metrics;
#[tracing::instrument(skip_all, fields(%exit_code))]
pub async fn shutdown_pageserver(
tenant_manager: &TenantManager,
mut deletion_queue: DeletionQueue,
deletion_queue: Option<DeletionQueue>,
exit_code: i32,
) {
use std::time::Duration;
@@ -89,7 +89,9 @@ pub async fn shutdown_pageserver(
.await;
// Best effort to persist any outstanding deletions, to avoid leaking objects
deletion_queue.shutdown(Duration::from_secs(5)).await;
if let Some(mut deletion_queue) = deletion_queue {
deletion_queue.shutdown(Duration::from_secs(5)).await;
}
// Shut down the HTTP endpoint last, so that you can still check the server's
// status while it's shutting down.
@@ -112,6 +114,10 @@ pub async fn shutdown_pageserver(
std::process::exit(exit_code);
}
/// The name of the metadata file pageserver creates per timeline.
/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>/metadata`.
pub const METADATA_FILE_NAME: &str = "metadata";
/// Per-tenant configuration file.
/// Full path: `tenants/<tenant_id>/config`.
pub(crate) const TENANT_CONFIG_NAME: &str = "config";

View File

@@ -585,15 +585,6 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
.expect("failed to define current logical size metric")
});
static AUX_FILE_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_aux_file_estimated_size",
"The size of all aux files for a timeline in aux file v2 store.",
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});
pub(crate) mod initial_logical_size {
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
use once_cell::sync::Lazy;
@@ -1999,6 +1990,29 @@ impl Default for WalRedoProcessCounters {
pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
Lazy::new(WalRedoProcessCounters::default);
#[cfg(not(test))]
pub mod wal_redo {
use super::*;
static PROCESS_KIND: Lazy<std::sync::Mutex<UIntGaugeVec>> = Lazy::new(|| {
std::sync::Mutex::new(
register_uint_gauge_vec!(
"pageserver_wal_redo_process_kind",
"The configured process kind for walredo",
&["kind"],
)
.unwrap(),
)
});
pub fn set_process_kind_metric(kind: crate::walredo::ProcessKind) {
// use guard to avoid races around the next two steps
let guard = PROCESS_KIND.lock().unwrap();
guard.reset();
guard.with_label_values(&[&format!("{kind}")]).set(1);
}
}
/// Similar to `prometheus::HistogramTimer` but does not record on drop.
pub(crate) struct StorageTimeMetricsTimer {
metrics: StorageTimeMetrics,
@@ -2098,10 +2112,9 @@ pub(crate) struct TimelineMetrics {
pub garbage_collect_histo: StorageTimeMetrics,
pub find_gc_cutoffs_histo: StorageTimeMetrics,
pub last_record_gauge: IntGauge,
pub resident_physical_size_gauge: UIntGauge,
resident_physical_size_gauge: UIntGauge,
/// copy of LayeredTimeline.current_logical_size
pub current_logical_size_gauge: UIntGauge,
pub aux_file_size_gauge: IntGauge,
pub directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>>,
pub evictions: IntCounter,
pub evictions_with_low_residence_duration: std::sync::RwLock<EvictionsWithLowResidenceDuration>,
@@ -2174,9 +2187,6 @@ impl TimelineMetrics {
let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let aux_file_size_gauge = AUX_FILE_SIZE
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
// TODO use impl Trait syntax here once we have ability to use it: https://github.com/rust-lang/rust/issues/63065
let directory_entries_count_gauge_closure = {
let tenant_shard_id = *tenant_shard_id;
@@ -2214,7 +2224,6 @@ impl TimelineMetrics {
last_record_gauge,
resident_physical_size_gauge,
current_logical_size_gauge,
aux_file_size_gauge,
directory_entries_count_gauge,
evictions,
evictions_with_low_residence_duration: std::sync::RwLock::new(
@@ -2255,7 +2264,6 @@ impl TimelineMetrics {
let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);
}
let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
self.evictions_with_low_residence_duration
.write()
@@ -2312,7 +2320,6 @@ use pin_project_lite::pin_project;
use std::collections::HashMap;
use std::num::NonZeroUsize;
use std::pin::Pin;
use std::sync::atomic::AtomicU64;
use std::sync::{Arc, Mutex};
use std::task::{Context, Poll};
use std::time::{Duration, Instant};
@@ -2322,35 +2329,35 @@ use crate::task_mgr::TaskKind;
use crate::tenant::mgr::TenantSlot;
/// Maintain a per timeline gauge in addition to the global gauge.
pub(crate) struct PerTimelineRemotePhysicalSizeGauge {
last_set: AtomicU64,
struct PerTimelineRemotePhysicalSizeGauge {
last_set: u64,
gauge: UIntGauge,
}
impl PerTimelineRemotePhysicalSizeGauge {
fn new(per_timeline_gauge: UIntGauge) -> Self {
Self {
last_set: AtomicU64::new(0),
last_set: per_timeline_gauge.get(),
gauge: per_timeline_gauge,
}
}
pub(crate) fn set(&self, sz: u64) {
fn set(&mut self, sz: u64) {
self.gauge.set(sz);
let prev = self.last_set.swap(sz, std::sync::atomic::Ordering::Relaxed);
if sz < prev {
REMOTE_PHYSICAL_SIZE_GLOBAL.sub(prev - sz);
if sz < self.last_set {
REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set - sz);
} else {
REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - prev);
REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - self.last_set);
};
self.last_set = sz;
}
pub(crate) fn get(&self) -> u64 {
fn get(&self) -> u64 {
self.gauge.get()
}
}
impl Drop for PerTimelineRemotePhysicalSizeGauge {
fn drop(&mut self) {
REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set.load(std::sync::atomic::Ordering::Relaxed));
REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set);
}
}
@@ -2358,7 +2365,7 @@ pub(crate) struct RemoteTimelineClientMetrics {
tenant_id: String,
shard_id: String,
timeline_id: String,
pub(crate) remote_physical_size_gauge: PerTimelineRemotePhysicalSizeGauge,
remote_physical_size_gauge: Mutex<Option<PerTimelineRemotePhysicalSizeGauge>>,
calls: Mutex<HashMap<(&'static str, &'static str), IntCounterPair>>,
bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
@@ -2366,27 +2373,38 @@ pub(crate) struct RemoteTimelineClientMetrics {
impl RemoteTimelineClientMetrics {
pub fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
let tenant_id_str = tenant_shard_id.tenant_id.to_string();
let shard_id_str = format!("{}", tenant_shard_id.shard_slug());
let timeline_id_str = timeline_id.to_string();
let remote_physical_size_gauge = PerTimelineRemotePhysicalSizeGauge::new(
REMOTE_PHYSICAL_SIZE
.get_metric_with_label_values(&[&tenant_id_str, &shard_id_str, &timeline_id_str])
.unwrap(),
);
RemoteTimelineClientMetrics {
tenant_id: tenant_id_str,
shard_id: shard_id_str,
timeline_id: timeline_id_str,
tenant_id: tenant_shard_id.tenant_id.to_string(),
shard_id: format!("{}", tenant_shard_id.shard_slug()),
timeline_id: timeline_id.to_string(),
calls: Mutex::new(HashMap::default()),
bytes_started_counter: Mutex::new(HashMap::default()),
bytes_finished_counter: Mutex::new(HashMap::default()),
remote_physical_size_gauge,
remote_physical_size_gauge: Mutex::new(None),
}
}
pub(crate) fn remote_physical_size_set(&self, sz: u64) {
let mut guard = self.remote_physical_size_gauge.lock().unwrap();
let gauge = guard.get_or_insert_with(|| {
PerTimelineRemotePhysicalSizeGauge::new(
REMOTE_PHYSICAL_SIZE
.get_metric_with_label_values(&[
&self.tenant_id,
&self.shard_id,
&self.timeline_id,
])
.unwrap(),
)
});
gauge.set(sz);
}
pub(crate) fn remote_physical_size_get(&self) -> u64 {
let guard = self.remote_physical_size_gauge.lock().unwrap();
guard.as_ref().map(|gauge| gauge.get()).unwrap_or(0)
}
pub fn remote_operation_time(
&self,
file_kind: &RemoteOpFileKind,

View File

@@ -32,7 +32,6 @@ use std::str;
use std::str::FromStr;
use std::sync::Arc;
use std::time::Duration;
use std::time::Instant;
use tokio::io::AsyncWriteExt;
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_util::io::StreamReader;
@@ -50,6 +49,7 @@ use utils::{
use crate::auth::check_permission;
use crate::basebackup;
use crate::basebackup::BasebackupError;
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
use crate::import_datadir::import_wal_from_tar;
use crate::metrics;
@@ -59,15 +59,13 @@ use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
use crate::task_mgr;
use crate::task_mgr::TaskKind;
use crate::tenant::mgr;
use crate::tenant::mgr::get_active_tenant_with_timeout;
use crate::tenant::mgr::GetActiveTenantError;
use crate::tenant::mgr::GetTenantError;
use crate::tenant::mgr::ShardResolveResult;
use crate::tenant::mgr::ShardSelector;
use crate::tenant::mgr::TenantManager;
use crate::tenant::timeline::WaitLsnError;
use crate::tenant::GetTimelineError;
use crate::tenant::PageReconstructError;
use crate::tenant::Tenant;
use crate::tenant::Timeline;
use crate::trace::Tracer;
use pageserver_api::key::rel_block_to_key;
@@ -137,7 +135,7 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()
/// Listens for connections, and launches a new handler task for each.
///
pub async fn libpq_listener_main(
tenant_manager: Arc<TenantManager>,
conf: &'static PageServerConf,
broker_client: storage_broker::BrokerClientChannel,
auth: Option<Arc<SwappableJwtAuth>>,
listener: TcpListener,
@@ -182,7 +180,7 @@ pub async fn libpq_listener_main(
"serving compute connection task",
false,
page_service_conn_main(
tenant_manager.clone(),
conf,
broker_client.clone(),
local_auth,
socket,
@@ -205,7 +203,7 @@ pub async fn libpq_listener_main(
#[instrument(skip_all, fields(peer_addr))]
async fn page_service_conn_main(
tenant_manager: Arc<TenantManager>,
conf: &'static PageServerConf,
broker_client: storage_broker::BrokerClientChannel,
auth: Option<Arc<SwappableJwtAuth>>,
socket: tokio::net::TcpStream,
@@ -262,8 +260,7 @@ async fn page_service_conn_main(
// and create a child per-query context when it invokes process_query.
// But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
// and create the per-query context in process_query ourselves.
let mut conn_handler =
PageServerHandler::new(tenant_manager, broker_client, auth, connection_ctx);
let mut conn_handler = PageServerHandler::new(conf, broker_client, auth, connection_ctx);
let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
match pgbackend
@@ -294,12 +291,11 @@ struct HandlerTimeline {
}
struct PageServerHandler {
_conf: &'static PageServerConf,
broker_client: storage_broker::BrokerClientChannel,
auth: Option<Arc<SwappableJwtAuth>>,
claims: Option<Claims>,
tenant_manager: Arc<TenantManager>,
/// The context created for the lifetime of the connection
/// services by this PageServerHandler.
/// For each query received over the connection,
@@ -385,13 +381,13 @@ impl From<WaitLsnError> for QueryError {
impl PageServerHandler {
pub fn new(
tenant_manager: Arc<TenantManager>,
conf: &'static PageServerConf,
broker_client: storage_broker::BrokerClientChannel,
auth: Option<Arc<SwappableJwtAuth>>,
connection_ctx: RequestContext,
) -> Self {
PageServerHandler {
tenant_manager,
_conf: conf,
broker_client,
auth,
claims: None,
@@ -556,9 +552,13 @@ impl PageServerHandler {
{
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
let tenant = self
.get_active_tenant_with_timeout(tenant_id, ShardSelector::First, ACTIVE_TENANT_TIMEOUT)
.await?;
let tenant = mgr::get_active_tenant_with_timeout(
tenant_id,
ShardSelector::First,
ACTIVE_TENANT_TIMEOUT,
&task_mgr::shutdown_token(),
)
.await?;
// Make request tracer if needed
let mut tracer = if tenant.get_trace_read_requests() {
@@ -726,9 +726,13 @@ impl PageServerHandler {
// Create empty timeline
info!("creating new timeline");
let tenant = self
.get_active_tenant_with_timeout(tenant_id, ShardSelector::Zero, ACTIVE_TENANT_TIMEOUT)
.await?;
let tenant = get_active_tenant_with_timeout(
tenant_id,
ShardSelector::Zero,
ACTIVE_TENANT_TIMEOUT,
&task_mgr::shutdown_token(),
)
.await?;
let timeline = tenant
.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
.await?;
@@ -1366,69 +1370,18 @@ impl PageServerHandler {
timeline_id: TimelineId,
selector: ShardSelector,
) -> Result<Arc<Timeline>, GetActiveTimelineError> {
let tenant = self
.get_active_tenant_with_timeout(tenant_id, selector, ACTIVE_TENANT_TIMEOUT)
.await
.map_err(GetActiveTimelineError::Tenant)?;
let tenant = get_active_tenant_with_timeout(
tenant_id,
selector,
ACTIVE_TENANT_TIMEOUT,
&task_mgr::shutdown_token(),
)
.await
.map_err(GetActiveTimelineError::Tenant)?;
let timeline = tenant.get_timeline(timeline_id, true)?;
set_tracing_field_shard_id(&timeline);
Ok(timeline)
}
/// Get a shard's [`Tenant`] in its active state, if present. If we don't find the shard and some
/// slots for this tenant are `InProgress` then we will wait.
/// If we find the [`Tenant`] and it's not yet in state [`TenantState::Active`], we will wait.
///
/// `timeout` is used as a total timeout for the whole wait operation.
async fn get_active_tenant_with_timeout(
&self,
tenant_id: TenantId,
shard_selector: ShardSelector,
timeout: Duration,
) -> Result<Arc<Tenant>, GetActiveTenantError> {
let wait_start = Instant::now();
let deadline = wait_start + timeout;
// Resolve TenantId to TenantShardId. This is usually a quick one-shot thing, the loop is
// for handling the rare case that the slot we're accessing is InProgress.
let tenant_shard = loop {
let resolved = self
.tenant_manager
.resolve_attached_shard(&tenant_id, shard_selector);
match resolved {
ShardResolveResult::Found(tenant_shard) => break tenant_shard,
ShardResolveResult::NotFound => {
return Err(GetActiveTenantError::NotFound(GetTenantError::NotFound(
tenant_id,
)));
}
ShardResolveResult::InProgress(barrier) => {
// We can't authoritatively answer right now: wait for InProgress state
// to end, then try again
tokio::select! {
_ = self.await_connection_cancelled() => {
return Err(GetActiveTenantError::Cancelled)
},
_ = barrier.wait() => {
// The barrier completed: proceed around the loop to try looking up again
},
_ = tokio::time::sleep(deadline.duration_since(Instant::now())) => {
return Err(GetActiveTenantError::WaitForActiveTimeout {
latest_state: None,
wait_time: timeout,
});
}
}
}
};
};
tracing::debug!("Waiting for tenant to enter active state...");
tenant_shard
.wait_to_become_active(deadline.duration_since(Instant::now()))
.await?;
Ok(tenant_shard)
}
}
#[async_trait::async_trait]
@@ -1818,13 +1771,13 @@ where
self.check_permission(Some(tenant_id))?;
let tenant = self
.get_active_tenant_with_timeout(
tenant_id,
ShardSelector::Zero,
ACTIVE_TENANT_TIMEOUT,
)
.await?;
let tenant = get_active_tenant_with_timeout(
tenant_id,
ShardSelector::Zero,
ACTIVE_TENANT_TIMEOUT,
&task_mgr::shutdown_token(),
)
.await?;
pgb.write_message_noflush(&BeMessage::RowDescription(&[
RowDescriptor::int8_col(b"checkpoint_distance"),
RowDescriptor::int8_col(b"checkpoint_timeout"),

View File

@@ -35,7 +35,7 @@ use std::ops::ControlFlow;
use std::ops::Range;
use strum::IntoEnumIterator;
use tokio_util::sync::CancellationToken;
use tracing::{debug, info, trace, warn};
use tracing::{debug, trace, warn};
use utils::bin_ser::DeserializeError;
use utils::vec_map::{VecMap, VecMapOrdering};
use utils::{bin_ser::BeSer, lsn::Lsn};
@@ -699,17 +699,13 @@ impl Timeline {
.await
.context("scan")?;
let mut result = HashMap::new();
let mut sz = 0;
for (_, v) in kv {
let v = v.context("get value")?;
let v = aux_file::decode_file_value_bytes(&v).context("value decode")?;
for (fname, content) in v {
sz += fname.len();
sz += content.len();
result.insert(fname, content);
}
}
self.aux_file_size_estimator.on_base_backup(sz);
Ok(result)
}
@@ -718,11 +714,10 @@ impl Timeline {
lsn: Lsn,
ctx: &RequestContext,
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
let current_policy = self.last_aux_file_policy.load();
match current_policy {
Some(AuxFilePolicy::V1) | None => self.list_aux_files_v1(lsn, ctx).await,
Some(AuxFilePolicy::V2) => self.list_aux_files_v2(lsn, ctx).await,
Some(AuxFilePolicy::CrossValidation) => {
match self.get_switch_aux_file_policy() {
AuxFilePolicy::V1 => self.list_aux_files_v1(lsn, ctx).await,
AuxFilePolicy::V2 => self.list_aux_files_v2(lsn, ctx).await,
AuxFilePolicy::CrossValidation => {
let v1_result = self.list_aux_files_v1(lsn, ctx).await;
let v2_result = self.list_aux_files_v2(lsn, ctx).await;
match (v1_result, v2_result) {
@@ -1470,27 +1465,7 @@ impl<'a> DatadirModification<'a> {
content: &[u8],
ctx: &RequestContext,
) -> anyhow::Result<()> {
let switch_policy = self.tline.get_switch_aux_file_policy();
let policy = {
let current_policy = self.tline.last_aux_file_policy.load();
// Allowed switch path:
// * no aux files -> v1/v2/cross-validation
// * cross-validation->v2
if AuxFilePolicy::is_valid_migration_path(current_policy, switch_policy) {
self.tline.last_aux_file_policy.store(Some(switch_policy));
self.tline
.remote_client
.schedule_index_upload_for_aux_file_policy_update(Some(switch_policy))?;
info!(current=?current_policy, next=?switch_policy, "switching aux file policy");
switch_policy
} else {
// This branch handles non-valid migration path, and the case that switch_policy == current_policy.
// And actually, because the migration path always allow unspecified -> *, this unwrap_or will never be hit.
current_policy.unwrap_or(AuxFilePolicy::default_tenant_config())
}
};
let policy = self.tline.get_switch_aux_file_policy();
if let AuxFilePolicy::V2 | AuxFilePolicy::CrossValidation = policy {
let key = aux_file::encode_aux_file_key(path);
// retrieve the key from the engine
@@ -1499,45 +1474,23 @@ impl<'a> DatadirModification<'a> {
Err(PageReconstructError::MissingKey(_)) => None,
Err(e) => return Err(e.into()),
};
let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val {
let files = if let Some(ref old_val) = old_val {
aux_file::decode_file_value(old_val)?
} else {
Vec::new()
};
let mut other_files = Vec::with_capacity(files.len());
let mut modifying_file = None;
for file @ (p, content) in files {
if path == p {
assert!(
modifying_file.is_none(),
"duplicated entries found for {}",
path
);
modifying_file = Some(content);
} else {
other_files.push(file);
}
}
let mut new_files = other_files;
match (modifying_file, content.is_empty()) {
(Some(old_content), false) => {
self.tline
.aux_file_size_estimator
.on_update(old_content.len(), content.len());
new_files.push((path, content));
}
(Some(old_content), true) => {
self.tline
.aux_file_size_estimator
.on_remove(old_content.len());
// not adding the file key to the final `new_files` vec.
}
(None, false) => {
self.tline.aux_file_size_estimator.on_add(content.len());
new_files.push((path, content));
}
(None, true) => anyhow::bail!("removing non-existing aux file: {}", path),
}
let new_files = if content.is_empty() {
files
.into_iter()
.filter(|(p, _)| &path != p)
.collect::<Vec<_>>()
} else {
files
.into_iter()
.filter(|(p, _)| &path != p)
.chain(std::iter::once((path, content)))
.collect::<Vec<_>>()
};
let new_val = aux_file::encode_file_value(&new_files)?;
self.put(key, Value::Image(new_val.into()));
}
@@ -1718,7 +1671,7 @@ impl<'a> DatadirModification<'a> {
}
if !self.pending_deletions.is_empty() {
writer.delete_batch(&self.pending_deletions, ctx).await?;
writer.delete_batch(&self.pending_deletions).await?;
self.pending_deletions.clear();
}

View File

@@ -20,9 +20,7 @@ use futures::stream::FuturesUnordered;
use futures::FutureExt;
use futures::StreamExt;
use pageserver_api::models;
use pageserver_api::models::AuxFilePolicy;
use pageserver_api::models::TimelineState;
use pageserver_api::models::TopTenantShardItem;
use pageserver_api::models::WalRedoManagerStatus;
use pageserver_api::shard::ShardIdentity;
use pageserver_api::shard::ShardStripeSize;
@@ -192,7 +190,7 @@ pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted";
#[derive(Clone)]
pub struct TenantSharedResources {
pub broker_client: storage_broker::BrokerClientChannel,
pub remote_storage: GenericRemoteStorage,
pub remote_storage: Option<GenericRemoteStorage>,
pub deletion_queue_client: DeletionQueueClient,
}
@@ -294,7 +292,7 @@ pub struct Tenant {
walredo_mgr: Option<Arc<WalRedoManager>>,
// provides access to timeline data sitting in the remote storage
pub(crate) remote_storage: GenericRemoteStorage,
pub(crate) remote_storage: Option<GenericRemoteStorage>,
// Access to global deletion queue for when this tenant wants to schedule a deletion
deletion_queue_client: DeletionQueueClient,
@@ -530,7 +528,6 @@ impl Tenant {
index_part: Option<IndexPart>,
metadata: TimelineMetadata,
ancestor: Option<Arc<Timeline>>,
last_aux_file_policy: Option<AuxFilePolicy>,
_ctx: &RequestContext,
) -> anyhow::Result<()> {
let tenant_id = self.tenant_shard_id;
@@ -541,10 +538,6 @@ impl Tenant {
ancestor.clone(),
resources,
CreateTimelineCause::Load,
// This could be derived from ancestor branch + index part. Though the only caller of `timeline_init_and_sync` is `load_remote_timeline`,
// there will potentially be other caller of this function in the future, and we don't know whether `index_part` or `ancestor` takes precedence.
// Therefore, we pass this field explicitly for now, and remove it once we fully migrate to aux file v2.
last_aux_file_policy,
)?;
let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
anyhow::ensure!(
@@ -558,26 +551,21 @@ impl Tenant {
);
if let Some(index_part) = index_part.as_ref() {
timeline.remote_client.init_upload_queue(index_part)?;
timeline
.last_aux_file_policy
.store(index_part.last_aux_file_policy());
} else {
.remote_client
.as_ref()
.unwrap()
.init_upload_queue(index_part)?;
} else if self.remote_storage.is_some() {
// No data on the remote storage, but we have local metadata file. We can end up
// here with timeline_create being interrupted before finishing index part upload.
// By doing what we do here, the index part upload is retried.
// If control plane retries timeline creation in the meantime, the mgmt API handler
// for timeline creation will coalesce on the upload we queue here.
// FIXME: this branch should be dead code as we no longer write local metadata.
timeline
.remote_client
.init_upload_queue_for_empty_remote(&metadata)?;
timeline
.remote_client
.schedule_index_upload_for_full_metadata_update(&metadata)?;
let rtc = timeline.remote_client.as_ref().unwrap();
rtc.init_upload_queue_for_empty_remote(&metadata)?;
rtc.schedule_index_upload_for_full_metadata_update(&metadata)?;
}
timeline
@@ -789,14 +777,14 @@ impl Tenant {
AttachType::Normal
};
let preload = match &mode {
SpawnMode::Create => {
let preload = match (&mode, &remote_storage) {
(SpawnMode::Create, _) => {
None
},
SpawnMode::Eager | SpawnMode::Lazy => {
(SpawnMode::Eager | SpawnMode::Lazy, Some(remote_storage)) => {
let _preload_timer = TENANT.preload.start_timer();
let res = tenant_clone
.preload(&remote_storage, task_mgr::shutdown_token())
.preload(remote_storage, task_mgr::shutdown_token())
.await;
match res {
Ok(p) => Some(p),
@@ -806,7 +794,10 @@ impl Tenant {
}
}
}
(_, None) => {
let _preload_timer = TENANT.preload.start_timer();
None
}
};
// Remote preload is complete.
@@ -1030,7 +1021,7 @@ impl Tenant {
index_part,
remote_metadata,
TimelineResources {
remote_client,
remote_client: Some(remote_client),
deletion_queue_client: self.deletion_queue_client.clone(),
timeline_get_throttle: self.timeline_get_throttle.clone(),
},
@@ -1056,7 +1047,7 @@ impl Tenant {
Arc::clone(self),
timeline_id,
&index_part.metadata,
remote_timeline_client,
Some(remote_timeline_client),
self.deletion_queue_client.clone(),
)
.instrument(tracing::info_span!("timeline_delete", %timeline_id))
@@ -1148,7 +1139,9 @@ impl Tenant {
let mut size = 0;
for timeline in self.list_timelines() {
size += timeline.remote_client.get_remote_physical_size();
if let Some(remote_client) = &timeline.remote_client {
size += remote_client.get_remote_physical_size();
}
}
size
@@ -1183,15 +1176,12 @@ impl Tenant {
None
};
let last_aux_file_policy = index_part.last_aux_file_policy();
self.timeline_init_and_sync(
timeline_id,
resources,
Some(index_part),
remote_metadata,
ancestor,
last_aux_file_policy,
ctx,
)
.await
@@ -1201,7 +1191,6 @@ impl Tenant {
pub fn create_broken_tenant(
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
remote_storage: GenericRemoteStorage,
reason: String,
) -> Arc<Tenant> {
Arc::new(Tenant::new(
@@ -1216,7 +1205,7 @@ impl Tenant {
ShardIdentity::broken(tenant_shard_id.shard_number, tenant_shard_id.shard_count),
None,
tenant_shard_id,
remote_storage,
None,
DeletionQueueClient::broken(),
))
}
@@ -1371,7 +1360,6 @@ impl Tenant {
create_guard,
initdb_lsn,
None,
None,
)
.await
}
@@ -1410,7 +1398,13 @@ impl Tenant {
tline.freeze_and_flush().await.context("freeze_and_flush")?;
// Make sure the freeze_and_flush reaches remote storage.
tline.remote_client.wait_completion().await.unwrap();
tline
.remote_client
.as_ref()
.unwrap()
.wait_completion()
.await
.unwrap();
let tl = uninit_tl.finish_creation()?;
// The non-test code would call tl.activate() here.
@@ -1476,19 +1470,20 @@ impl Tenant {
return Err(CreateTimelineError::Conflict);
}
// Wait for uploads to complete, so that when we return Ok, the timeline
// is known to be durable on remote storage. Just like we do at the end of
// this function, after we have created the timeline ourselves.
//
// We only really care that the initial version of `index_part.json` has
// been uploaded. That's enough to remember that the timeline
// exists. However, there is no function to wait specifically for that so
// we just wait for all in-progress uploads to finish.
existing
.remote_client
.wait_completion()
.await
.context("wait for timeline uploads to complete")?;
if let Some(remote_client) = existing.remote_client.as_ref() {
// Wait for uploads to complete, so that when we return Ok, the timeline
// is known to be durable on remote storage. Just like we do at the end of
// this function, after we have created the timeline ourselves.
//
// We only really care that the initial version of `index_part.json` has
// been uploaded. That's enough to remember that the timeline
// exists. However, there is no function to wait specifically for that so
// we just wait for all in-progress uploads to finish.
remote_client
.wait_completion()
.await
.context("wait for timeline uploads to complete")?;
}
return Ok(existing);
}
@@ -1564,14 +1559,14 @@ impl Tenant {
// the timeline is visible in [`Self::timelines`], but it is _not_ durable yet. We must
// not send a success to the caller until it is. The same applies to handling retries,
// see the handling of [`TimelineExclusionError::AlreadyExists`] above.
let kind = ancestor_timeline_id
.map(|_| "branched")
.unwrap_or("bootstrapped");
loaded_timeline
.remote_client
.wait_completion()
.await
.with_context(|| format!("wait for {} timeline initial uploads to complete", kind))?;
if let Some(remote_client) = loaded_timeline.remote_client.as_ref() {
let kind = ancestor_timeline_id
.map(|_| "branched")
.unwrap_or("bootstrapped");
remote_client.wait_completion().await.with_context(|| {
format!("wait for {} timeline initial uploads to complete", kind)
})?;
}
loaded_timeline.activate(self.clone(), broker_client, None, ctx);
@@ -2166,26 +2161,32 @@ impl Tenant {
) -> anyhow::Result<()> {
let timelines = self.timelines.lock().unwrap().clone();
for timeline in timelines.values() {
let Some(tl_client) = &timeline.remote_client else {
anyhow::bail!("Remote storage is mandatory");
};
let Some(remote_storage) = &self.remote_storage else {
anyhow::bail!("Remote storage is mandatory");
};
// We do not block timeline creation/deletion during splits inside the pageserver: it is up to higher levels
// to ensure that they do not start a split if currently in the process of doing these.
// Upload an index from the parent: this is partly to provide freshness for the
// child tenants that will copy it, and partly for general ease-of-debugging: there will
// always be a parent shard index in the same generation as we wrote the child shard index.
timeline
.remote_client
.schedule_index_upload_for_file_changes()?;
timeline.remote_client.wait_completion().await?;
tl_client.schedule_index_upload_for_file_changes()?;
tl_client.wait_completion().await?;
// Shut down the timeline's remote client: this means that the indices we write
// for child shards will not be invalidated by the parent shard deleting layers.
timeline.remote_client.shutdown().await;
tl_client.shutdown().await;
// Download methods can still be used after shutdown, as they don't flow through the remote client's
// queue. In principal the RemoteTimelineClient could provide this without downloading it, but this
// operation is rare, so it's simpler to just download it (and robustly guarantees that the index
// we use here really is the remotely persistent one).
let result = timeline.remote_client
let result = tl_client
.download_index_file(&self.cancel)
.instrument(info_span!("download_index_file", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))
.await?;
@@ -2198,7 +2199,7 @@ impl Tenant {
for child_shard in child_shards {
upload_index_part(
&self.remote_storage,
remote_storage,
child_shard,
&timeline.timeline_id,
self.generation,
@@ -2211,31 +2212,6 @@ impl Tenant {
Ok(())
}
pub(crate) fn get_sizes(&self) -> TopTenantShardItem {
let mut result = TopTenantShardItem {
id: self.tenant_shard_id,
resident_size: 0,
physical_size: 0,
max_logical_size: 0,
};
for timeline in self.timelines.lock().unwrap().values() {
result.resident_size += timeline.metrics.resident_physical_size_gauge.get();
result.physical_size += timeline
.remote_client
.metrics
.remote_physical_size_gauge
.get();
result.max_logical_size = std::cmp::max(
result.max_logical_size,
timeline.metrics.current_logical_size_gauge.get(),
);
}
result
}
}
/// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id),
@@ -2455,7 +2431,6 @@ impl Tenant {
ancestor: Option<Arc<Timeline>>,
resources: TimelineResources,
cause: CreateTimelineCause,
last_aux_file_policy: Option<AuxFilePolicy>,
) -> anyhow::Result<Arc<Timeline>> {
let state = match cause {
CreateTimelineCause::Load => {
@@ -2484,7 +2459,6 @@ impl Tenant {
resources,
pg_version,
state,
last_aux_file_policy,
self.cancel.child_token(),
);
@@ -2501,7 +2475,7 @@ impl Tenant {
shard_identity: ShardIdentity,
walredo_mgr: Option<Arc<WalRedoManager>>,
tenant_shard_id: TenantShardId,
remote_storage: GenericRemoteStorage,
remote_storage: Option<GenericRemoteStorage>,
deletion_queue_client: DeletionQueueClient,
) -> Tenant {
let (state, mut rx) = watch::channel(state);
@@ -2826,7 +2800,7 @@ impl Tenant {
// See comments in [`Tenant::branch_timeline`] for more information about why branch
// creation task can run concurrently with timeline's GC iteration.
for timeline in gc_timelines {
if cancel.is_cancelled() {
if task_mgr::is_shutdown_requested() || cancel.is_cancelled() {
// We were requested to shut down. Stop and return with the progress we
// made.
break;
@@ -3135,7 +3109,6 @@ impl Tenant {
timeline_create_guard,
start_lsn + 1,
Some(Arc::clone(src_timeline)),
src_timeline.last_aux_file_policy.load(),
)
.await?;
@@ -3146,10 +3119,11 @@ impl Tenant {
// We still need to upload its metadata eagerly: if other nodes `attach` the tenant and miss this timeline, their GC
// could get incorrect information and remove more layers, than needed.
// See also https://github.com/neondatabase/neon/issues/3865
new_timeline
.remote_client
.schedule_index_upload_for_full_metadata_update(&metadata)
.context("branch initial metadata upload")?;
if let Some(remote_client) = new_timeline.remote_client.as_ref() {
remote_client
.schedule_index_upload_for_full_metadata_update(&metadata)
.context("branch initial metadata upload")?;
}
Ok(new_timeline)
}
@@ -3181,6 +3155,11 @@ impl Tenant {
pgdata_path: &Utf8PathBuf,
timeline_id: &TimelineId,
) -> anyhow::Result<()> {
let Some(storage) = &self.remote_storage else {
// No remote storage? No upload.
return Ok(());
};
let temp_path = timelines_path.join(format!(
"{INITDB_PATH}.upload-{timeline_id}.{TEMP_FILE_SUFFIX}"
));
@@ -3204,7 +3183,7 @@ impl Tenant {
backoff::retry(
|| async {
self::remote_timeline_client::upload_initdb_dir(
&self.remote_storage,
storage,
&self.tenant_shard_id.tenant_id,
timeline_id,
pgdata_zstd.try_clone().await?,
@@ -3261,6 +3240,9 @@ impl Tenant {
}
}
if let Some(existing_initdb_timeline_id) = load_existing_initdb {
let Some(storage) = &self.remote_storage else {
bail!("no storage configured but load_existing_initdb set to {existing_initdb_timeline_id}");
};
if existing_initdb_timeline_id != timeline_id {
let source_path = &remote_initdb_archive_path(
&self.tenant_shard_id.tenant_id,
@@ -3270,7 +3252,7 @@ impl Tenant {
&remote_initdb_archive_path(&self.tenant_shard_id.tenant_id, &timeline_id);
// if this fails, it will get retried by retried control plane requests
self.remote_storage
storage
.copy_object(source_path, dest_path, &self.cancel)
.await
.context("copy initdb tar")?;
@@ -3278,7 +3260,7 @@ impl Tenant {
let (initdb_tar_zst_path, initdb_tar_zst) =
self::remote_timeline_client::download_initdb_tar_zst(
self.conf,
&self.remote_storage,
storage,
&self.tenant_shard_id,
&existing_initdb_timeline_id,
&self.cancel,
@@ -3329,7 +3311,6 @@ impl Tenant {
timeline_create_guard,
pgdata_lsn,
None,
None,
)
.await?;
@@ -3374,14 +3355,20 @@ impl Tenant {
/// Call this before constructing a timeline, to build its required structures
fn build_timeline_resources(&self, timeline_id: TimelineId) -> TimelineResources {
let remote_client = RemoteTimelineClient::new(
self.remote_storage.clone(),
self.deletion_queue_client.clone(),
self.conf,
self.tenant_shard_id,
timeline_id,
self.generation,
);
let remote_client = if let Some(remote_storage) = self.remote_storage.as_ref() {
let remote_client = RemoteTimelineClient::new(
remote_storage.clone(),
self.deletion_queue_client.clone(),
self.conf,
self.tenant_shard_id,
timeline_id,
self.generation,
);
Some(remote_client)
} else {
None
};
TimelineResources {
remote_client,
deletion_queue_client: self.deletion_queue_client.clone(),
@@ -3401,14 +3388,13 @@ impl Tenant {
create_guard: TimelineCreateGuard<'a>,
start_lsn: Lsn,
ancestor: Option<Arc<Timeline>>,
last_aux_file_policy: Option<AuxFilePolicy>,
) -> anyhow::Result<UninitializedTimeline> {
let tenant_shard_id = self.tenant_shard_id;
let resources = self.build_timeline_resources(new_timeline_id);
resources
.remote_client
.init_upload_queue_for_empty_remote(new_metadata)?;
if let Some(remote_client) = &resources.remote_client {
remote_client.init_upload_queue_for_empty_remote(new_metadata)?;
}
let timeline_struct = self
.create_timeline_struct(
@@ -3417,7 +3403,6 @@ impl Tenant {
ancestor,
resources,
CreateTimelineCause::Load,
last_aux_file_policy,
)
.context("Failed to create timeline data structure")?;
@@ -3577,7 +3562,9 @@ impl Tenant {
tracing::info!(timeline_id=%timeline.timeline_id, "Flushing...");
timeline.freeze_and_flush().await?;
tracing::info!(timeline_id=%timeline.timeline_id, "Waiting for uploads...");
timeline.remote_client.wait_completion().await?;
if let Some(client) = &timeline.remote_client {
client.wait_completion().await?;
}
Ok(())
}
@@ -3891,7 +3878,7 @@ pub(crate) mod harness {
ShardIdentity::unsharded(),
Some(walredo_mgr),
self.tenant_shard_id,
self.remote_storage.clone(),
Some(self.remote_storage.clone()),
self.deletion_queue.new_client(),
));
@@ -5641,280 +5628,4 @@ mod tests {
Ok(())
}
#[tokio::test]
async fn test_branch_copies_dirty_aux_file_flag() {
let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag").unwrap();
// the default aux file policy to switch is v1 if not set by the admins
assert_eq!(
harness.tenant_conf.switch_aux_file_policy,
AuxFilePolicy::V1
);
let (tenant, ctx) = harness.load().await;
let mut lsn = Lsn(0x08);
let tline: Arc<Timeline> = tenant
.create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx)
.await
.unwrap();
// no aux file is written at this point, so the persistent flag should be unset
assert_eq!(tline.last_aux_file_policy.load(), None);
{
lsn += 8;
let mut modification = tline.begin_modification(lsn);
modification
.put_file("pg_logical/mappings/test1", b"first", &ctx)
.await
.unwrap();
modification.commit(&ctx).await.unwrap();
}
// there is no tenant manager to pass the configuration through, so lets mimic it
tenant.set_new_location_config(
AttachedTenantConf::try_from(LocationConf::attached_single(
TenantConfOpt {
switch_aux_file_policy: Some(AuxFilePolicy::V2),
..Default::default()
},
tenant.generation,
&pageserver_api::models::ShardParameters::default(),
))
.unwrap(),
);
assert_eq!(
tline.get_switch_aux_file_policy(),
AuxFilePolicy::V2,
"wanted state has been updated"
);
assert_eq!(
tline.last_aux_file_policy.load(),
Some(AuxFilePolicy::V1),
"aux file is written with switch_aux_file_policy unset (which is v1), so we should keep v1"
);
// we can read everything from the storage
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
assert_eq!(
files.get("pg_logical/mappings/test1"),
Some(&bytes::Bytes::from_static(b"first"))
);
{
lsn += 8;
let mut modification = tline.begin_modification(lsn);
modification
.put_file("pg_logical/mappings/test2", b"second", &ctx)
.await
.unwrap();
modification.commit(&ctx).await.unwrap();
}
assert_eq!(
tline.last_aux_file_policy.load(),
Some(AuxFilePolicy::V1),
"keep v1 storage format when new files are written"
);
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
assert_eq!(
files.get("pg_logical/mappings/test2"),
Some(&bytes::Bytes::from_static(b"second"))
);
let child = tenant
.branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx)
.await
.unwrap();
// child copies the last flag even if that is not on remote storage yet
assert_eq!(child.get_switch_aux_file_policy(), AuxFilePolicy::V2);
assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V1));
let files = child.list_aux_files(lsn, &ctx).await.unwrap();
assert_eq!(files.get("pg_logical/mappings/test1"), None);
assert_eq!(files.get("pg_logical/mappings/test2"), None);
// even if we crash here without flushing parent timeline with it's new
// last_aux_file_policy we are safe, because child was never meant to access ancestor's
// files. the ancestor can even switch back to V1 because of a migration safely.
}
#[tokio::test]
async fn aux_file_policy_switch() {
let mut harness = TenantHarness::create("aux_file_policy_switch").unwrap();
harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::CrossValidation; // set to cross-validation mode
let (tenant, ctx) = harness.load().await;
let mut lsn = Lsn(0x08);
let tline: Arc<Timeline> = tenant
.create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx)
.await
.unwrap();
assert_eq!(
tline.last_aux_file_policy.load(),
None,
"no aux file is written so it should be unset"
);
{
lsn += 8;
let mut modification = tline.begin_modification(lsn);
modification
.put_file("pg_logical/mappings/test1", b"first", &ctx)
.await
.unwrap();
modification.commit(&ctx).await.unwrap();
}
// there is no tenant manager to pass the configuration through, so lets mimic it
tenant.set_new_location_config(
AttachedTenantConf::try_from(LocationConf::attached_single(
TenantConfOpt {
switch_aux_file_policy: Some(AuxFilePolicy::V2),
..Default::default()
},
tenant.generation,
&pageserver_api::models::ShardParameters::default(),
))
.unwrap(),
);
assert_eq!(
tline.get_switch_aux_file_policy(),
AuxFilePolicy::V2,
"wanted state has been updated"
);
assert_eq!(
tline.last_aux_file_policy.load(),
Some(AuxFilePolicy::CrossValidation),
"dirty index_part.json reflected state is yet to be updated"
);
// we can still read the auxfile v1 before we ingest anything new
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
assert_eq!(
files.get("pg_logical/mappings/test1"),
Some(&bytes::Bytes::from_static(b"first"))
);
{
lsn += 8;
let mut modification = tline.begin_modification(lsn);
modification
.put_file("pg_logical/mappings/test2", b"second", &ctx)
.await
.unwrap();
modification.commit(&ctx).await.unwrap();
}
assert_eq!(
tline.last_aux_file_policy.load(),
Some(AuxFilePolicy::V2),
"ingesting a file should apply the wanted switch state when applicable"
);
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
assert_eq!(
files.get("pg_logical/mappings/test1"),
Some(&bytes::Bytes::from_static(b"first")),
"cross validation writes to both v1 and v2 so this should be available in v2"
);
assert_eq!(
files.get("pg_logical/mappings/test2"),
Some(&bytes::Bytes::from_static(b"second"))
);
// mimic again by trying to flip it from V2 to V1 (not switched to while ingesting a file)
tenant.set_new_location_config(
AttachedTenantConf::try_from(LocationConf::attached_single(
TenantConfOpt {
switch_aux_file_policy: Some(AuxFilePolicy::V1),
..Default::default()
},
tenant.generation,
&pageserver_api::models::ShardParameters::default(),
))
.unwrap(),
);
{
lsn += 8;
let mut modification = tline.begin_modification(lsn);
modification
.put_file("pg_logical/mappings/test2", b"third", &ctx)
.await
.unwrap();
modification.commit(&ctx).await.unwrap();
}
assert_eq!(
tline.get_switch_aux_file_policy(),
AuxFilePolicy::V1,
"wanted state has been updated again, even if invalid request"
);
assert_eq!(
tline.last_aux_file_policy.load(),
Some(AuxFilePolicy::V2),
"ingesting a file should apply the wanted switch state when applicable"
);
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
assert_eq!(
files.get("pg_logical/mappings/test1"),
Some(&bytes::Bytes::from_static(b"first"))
);
assert_eq!(
files.get("pg_logical/mappings/test2"),
Some(&bytes::Bytes::from_static(b"third"))
);
// mimic again by trying to flip it from from V1 to V2 (not switched to while ingesting a file)
tenant.set_new_location_config(
AttachedTenantConf::try_from(LocationConf::attached_single(
TenantConfOpt {
switch_aux_file_policy: Some(AuxFilePolicy::V2),
..Default::default()
},
tenant.generation,
&pageserver_api::models::ShardParameters::default(),
))
.unwrap(),
);
{
lsn += 8;
let mut modification = tline.begin_modification(lsn);
modification
.put_file("pg_logical/mappings/test3", b"last", &ctx)
.await
.unwrap();
modification.commit(&ctx).await.unwrap();
}
assert_eq!(tline.get_switch_aux_file_policy(), AuxFilePolicy::V2);
assert_eq!(tline.last_aux_file_policy.load(), Some(AuxFilePolicy::V2));
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
assert_eq!(
files.get("pg_logical/mappings/test1"),
Some(&bytes::Bytes::from_static(b"first"))
);
assert_eq!(
files.get("pg_logical/mappings/test2"),
Some(&bytes::Bytes::from_static(b"third"))
);
assert_eq!(
files.get("pg_logical/mappings/test3"),
Some(&bytes::Bytes::from_static(b"last"))
);
}
}

View File

@@ -299,7 +299,7 @@ mod tests {
// Write part (in block to drop the file)
let mut offsets = Vec::new();
{
let file = VirtualFile::create(pathbuf.as_path(), &ctx).await?;
let file = VirtualFile::create(pathbuf.as_path()).await?;
let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
for blob in blobs.iter() {
let (_, res) = wtr.write_blob(blob.clone(), &ctx).await;
@@ -314,7 +314,7 @@ mod tests {
wtr.flush_buffer(&ctx).await?;
}
let file = VirtualFile::open(pathbuf.as_path(), &ctx).await?;
let file = VirtualFile::open(pathbuf.as_path()).await?;
let rdr = BlockReaderRef::VirtualFile(&file);
let rdr = BlockCursor::new(rdr);
for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {

View File

@@ -102,7 +102,7 @@ impl<'a> BlockReaderRef<'a> {
#[cfg(test)]
TestDisk(r) => r.read_blk(blknum),
#[cfg(test)]
VirtualFile(r) => r.read_blk(blknum, ctx).await,
VirtualFile(r) => r.read_blk(blknum).await,
}
}
}
@@ -177,11 +177,10 @@ impl<'a> FileBlockReader<'a> {
&self,
buf: PageWriteGuard<'static>,
blkno: u32,
ctx: &RequestContext,
) -> Result<PageWriteGuard<'static>, std::io::Error> {
assert!(buf.len() == PAGE_SZ);
self.file
.read_exact_at_page(buf, blkno as u64 * PAGE_SZ as u64, ctx)
.read_exact_at_page(buf, blkno as u64 * PAGE_SZ as u64)
.await
}
/// Read a block.
@@ -207,7 +206,7 @@ impl<'a> FileBlockReader<'a> {
ReadBufResult::Found(guard) => Ok(guard.into()),
ReadBufResult::NotFound(write_guard) => {
// Read the page from disk into the buffer
let write_guard = self.fill_buffer(write_guard, blknum, ctx).await?;
let write_guard = self.fill_buffer(write_guard, blknum).await?;
Ok(write_guard.mark_valid().into())
}
}

View File

@@ -373,8 +373,6 @@ pub struct TenantConf {
/// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into
/// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions.
/// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux
/// file is written.
pub switch_aux_file_policy: AuxFilePolicy,
}
@@ -576,7 +574,7 @@ impl Default for TenantConf {
lazy_slru_download: false,
timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
switch_aux_file_policy: AuxFilePolicy::default_tenant_config(),
switch_aux_file_policy: AuxFilePolicy::V1,
}
}
}

View File

@@ -181,23 +181,25 @@ async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), Del
async fn remove_tenant_remote_delete_mark(
conf: &PageServerConf,
remote_storage: &GenericRemoteStorage,
remote_storage: Option<&GenericRemoteStorage>,
tenant_shard_id: &TenantShardId,
cancel: &CancellationToken,
) -> Result<(), DeleteTenantError> {
let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
backoff::retry(
|| async { remote_storage.delete(&path, cancel).await },
TimeoutOrCancel::caused_by_cancel,
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"remove_tenant_remote_delete_mark",
cancel,
)
.await
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
.and_then(|x| x)
.context("remove_tenant_remote_delete_mark")?;
if let Some(remote_storage) = remote_storage {
let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
backoff::retry(
|| async { remote_storage.delete(&path, cancel).await },
TimeoutOrCancel::caused_by_cancel,
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"remove_tenant_remote_delete_mark",
cancel,
)
.await
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
.and_then(|x| x)
.context("remove_tenant_remote_delete_mark")?;
}
Ok(())
}
@@ -295,7 +297,7 @@ impl DeleteTenantFlow {
#[instrument(skip_all)]
pub(crate) async fn run(
conf: &'static PageServerConf,
remote_storage: GenericRemoteStorage,
remote_storage: Option<GenericRemoteStorage>,
tenants: &'static std::sync::RwLock<TenantsMap>,
tenant: Arc<Tenant>,
cancel: &CancellationToken,
@@ -306,7 +308,9 @@ impl DeleteTenantFlow {
let mut guard = Self::prepare(&tenant).await?;
if let Err(e) = Self::run_inner(&mut guard, conf, &remote_storage, &tenant, cancel).await {
if let Err(e) =
Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant, cancel).await
{
tenant.set_broken(format!("{e:#}")).await;
return Err(e);
}
@@ -323,7 +327,7 @@ impl DeleteTenantFlow {
async fn run_inner(
guard: &mut OwnedMutexGuard<Self>,
conf: &'static PageServerConf,
remote_storage: &GenericRemoteStorage,
remote_storage: Option<&GenericRemoteStorage>,
tenant: &Tenant,
cancel: &CancellationToken,
) -> Result<(), DeleteTenantError> {
@@ -335,9 +339,14 @@ impl DeleteTenantFlow {
))?
});
create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel)
.await
.context("remote_mark")?;
// IDEA: implement detach as delete without remote storage. Then they would use the same lock (deletion_progress) so wont contend.
// Though sounds scary, different mark name?
// Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
if let Some(remote_storage) = &remote_storage {
create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel)
.await
.context("remote_mark")?
}
fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
Err(anyhow::anyhow!(
@@ -474,7 +483,7 @@ impl DeleteTenantFlow {
fn schedule_background(
guard: OwnedMutexGuard<Self>,
conf: &'static PageServerConf,
remote_storage: GenericRemoteStorage,
remote_storage: Option<GenericRemoteStorage>,
tenants: &'static std::sync::RwLock<TenantsMap>,
tenant: Arc<Tenant>,
) {
@@ -503,7 +512,7 @@ impl DeleteTenantFlow {
async fn background(
mut guard: OwnedMutexGuard<Self>,
conf: &PageServerConf,
remote_storage: GenericRemoteStorage,
remote_storage: Option<GenericRemoteStorage>,
tenants: &'static std::sync::RwLock<TenantsMap>,
tenant: &Arc<Tenant>,
) -> Result<(), DeleteTenantError> {
@@ -542,7 +551,7 @@ impl DeleteTenantFlow {
remove_tenant_remote_delete_mark(
conf,
&remote_storage,
remote_storage.as_ref(),
&tenant.tenant_shard_id,
&task_mgr::shutdown_token(),
)

View File

@@ -28,7 +28,6 @@ impl EphemeralFile {
conf: &PageServerConf,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
ctx: &RequestContext,
) -> Result<EphemeralFile, io::Error> {
static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1);
let filename_disambiguator =
@@ -46,7 +45,6 @@ impl EphemeralFile {
.read(true)
.write(true)
.create(true),
ctx,
)
.await?;
@@ -155,7 +153,7 @@ mod tests {
async fn test_ephemeral_blobs() -> Result<(), io::Error> {
let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?;
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &ctx).await?;
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id).await?;
let pos_foo = file.write_blob(b"foo", &ctx).await?;
assert_eq!(

View File

@@ -78,7 +78,7 @@ impl RW {
page_cache::ReadBufResult::NotFound(write_guard) => {
let write_guard = writer
.file
.read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64, ctx)
.read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64)
.await?;
let read_guard = write_guard.mark_valid();
return Ok(BlockLease::PageReadGuard(read_guard));

View File

@@ -214,12 +214,12 @@ impl TimelineMetadata {
self.body.ancestor_timeline = Some(*timeline);
}
pub fn detach_from_ancestor(&mut self, branchpoint: &(TimelineId, Lsn)) {
pub fn detach_from_ancestor(&mut self, timeline: &TimelineId, ancestor_lsn: &Lsn) {
if let Some(ancestor) = self.body.ancestor_timeline {
assert_eq!(ancestor, branchpoint.0);
assert_eq!(ancestor, *timeline);
}
if self.body.ancestor_lsn != Lsn(0) {
assert_eq!(self.body.ancestor_lsn, branchpoint.1);
assert_eq!(self.body.ancestor_lsn, *ancestor_lsn);
}
self.body.ancestor_timeline = None;
self.body.ancestor_lsn = Lsn(0);

View File

@@ -16,9 +16,10 @@ use std::cmp::Ordering;
use std::collections::{BTreeMap, HashMap};
use std::ops::Deref;
use std::sync::Arc;
use std::time::Duration;
use std::time::{Duration, Instant};
use sysinfo::SystemExt;
use tokio::fs;
use utils::timeout::{timeout_cancellable, TimeoutCancellableError};
use anyhow::Context;
use once_cell::sync::Lazy;
@@ -46,7 +47,7 @@ use crate::tenant::span::debug_assert_current_span_has_tenant_id;
use crate::tenant::storage_layer::inmemory_layer;
use crate::tenant::timeline::ShutdownMode;
use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX};
use utils::crashsafe::path_with_suffix_extension;
use utils::fs_ext::PathExt;
@@ -118,7 +119,6 @@ pub(crate) enum TenantsMapRemoveResult {
/// When resolving a TenantId to a shard, we may be looking for the 0th
/// shard, or we might be looking for whichever shard holds a particular page.
#[derive(Copy, Clone)]
pub(crate) enum ShardSelector {
/// Only return the 0th shard, if it is present. If a non-0th shard is present,
/// ignore it.
@@ -169,14 +169,6 @@ impl TenantStartupMode {
}
}
/// Result type for looking up a TenantId to a specific shard
pub(crate) enum ShardResolveResult {
NotFound,
Found(Arc<Tenant>),
// Wait for this barrrier, then query again
InProgress(utils::completion::Barrier),
}
impl TenantsMap {
/// Convenience function for typical usage, where we want to get a `Tenant` object, for
/// working with attached tenants. If the TenantId is in the map but in Secondary state,
@@ -190,6 +182,51 @@ impl TenantsMap {
}
}
/// A page service client sends a TenantId, and to look up the correct Tenant we must
/// resolve this to a fully qualified TenantShardId.
fn resolve_attached_shard(
&self,
tenant_id: &TenantId,
selector: ShardSelector,
) -> Option<TenantShardId> {
let mut want_shard = None;
match self {
TenantsMap::Initializing => None,
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
for slot in m.range(TenantShardId::tenant_range(*tenant_id)) {
// Ignore all slots that don't contain an attached tenant
let tenant = match &slot.1 {
TenantSlot::Attached(t) => t,
_ => continue,
};
match selector {
ShardSelector::First => return Some(*slot.0),
ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
return Some(*slot.0)
}
ShardSelector::Page(key) => {
// First slot we see for this tenant, calculate the expected shard number
// for the key: we will use this for checking if this and subsequent
// slots contain the key, rather than recalculating the hash each time.
if want_shard.is_none() {
want_shard = Some(tenant.shard_identity.get_shard_number(&key));
}
if Some(tenant.shard_identity.number) == want_shard {
return Some(*slot.0);
}
}
_ => continue,
}
}
// Fall through: we didn't find an acceptable shard
None
}
}
}
/// Only for use from DeleteTenantFlow. This method directly removes a TenantSlot from the map.
///
/// The normal way to remove a tenant is using a SlotGuard, which will gracefully remove the guarded
@@ -354,17 +391,22 @@ async fn init_load_generations(
// deletion list entries may still be valid. We provide that by pushing a recovery operation into
// the queue. Sequential processing of te queue ensures that recovery is done before any new tenant deletions
// are processed, even though we don't block on recovery completing here.
let attached_tenants = generations
.iter()
.flat_map(|(id, start_mode)| {
match start_mode {
TenantStartupMode::Attached((_mode, generation)) => Some(generation),
TenantStartupMode::Secondary => None,
}
.map(|gen| (*id, *gen))
})
.collect();
resources.deletion_queue_client.recover(attached_tenants)?;
//
// Must only do this if remote storage is enabled, otherwise deletion queue
// is not running and channel push will fail.
if resources.remote_storage.is_some() {
let attached_tenants = generations
.iter()
.flat_map(|(id, start_mode)| {
match start_mode {
TenantStartupMode::Attached((_mode, generation)) => Some(generation),
TenantStartupMode::Secondary => None,
}
.map(|gen| (*id, *gen))
})
.collect();
resources.deletion_queue_client.recover(attached_tenants)?;
}
Ok(Some(generations))
}
@@ -418,6 +460,53 @@ fn load_tenant_config(
}
};
// Clean up legacy `metadata` files.
// Doing it here because every single tenant directory is visited here.
// In any later code, there's different treatment of tenant dirs
// ... depending on whether the tenant is in re-attach response or not
// ... epending on whether the tenant is ignored or not
assert_eq!(
&conf.tenant_path(&tenant_shard_id),
&tenant_dir_path,
"later use of conf....path() methods would be dubious"
);
let timelines: Vec<TimelineId> = match conf.timelines_path(&tenant_shard_id).read_dir_utf8() {
Ok(iter) => {
let mut timelines = Vec::new();
for res in iter {
let p = res?;
let Some(timeline_id) = p.file_name().parse::<TimelineId>().ok() else {
// skip any entries that aren't TimelineId, such as
// - *.___temp dirs
// - unfinished initdb uploads (test_non_uploaded_root_timeline_is_deleted_after_restart)
continue;
};
timelines.push(timeline_id);
}
timelines
}
Err(e) if e.kind() == std::io::ErrorKind::NotFound => vec![],
Err(e) => return Err(anyhow::anyhow!(e)),
};
for timeline_id in timelines {
let timeline_path = &conf.timeline_path(&tenant_shard_id, &timeline_id);
let metadata_path = timeline_path.join(METADATA_FILE_NAME);
match std::fs::remove_file(&metadata_path) {
Ok(()) => {
crashsafe::fsync(timeline_path)
.context("fsync timeline dir after removing legacy metadata file")?;
info!("removed legacy metadata file at {metadata_path}");
}
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
// something removed the file earlier, or it was never there
// We don't care, this software version doesn't write it again, so, we're good.
}
Err(e) => {
anyhow::bail!("remove legacy metadata file: {e}: {metadata_path}");
}
}
}
let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
if tenant_ignore_mark_file.exists() {
info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
@@ -522,7 +611,6 @@ pub async fn init_tenant_mgr(
TenantSlot::Attached(Tenant::create_broken_tenant(
conf,
tenant_shard_id,
resources.remote_storage.clone(),
format!("{}", e),
)),
);
@@ -715,7 +803,6 @@ fn tenant_spawn(
"Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
);
let remote_storage = resources.remote_storage.clone();
let tenant = match Tenant::spawn(
conf,
tenant_shard_id,
@@ -730,7 +817,7 @@ fn tenant_spawn(
Ok(tenant) => tenant,
Err(e) => {
error!("Failed to spawn tenant {tenant_shard_id}, reason: {e:#}");
Tenant::create_broken_tenant(conf, tenant_shard_id, remote_storage, format!("{e:#}"))
Tenant::create_broken_tenant(conf, tenant_shard_id, format!("{e:#}"))
}
};
@@ -2016,72 +2103,6 @@ impl TenantManager {
Ok(reparented)
}
/// A page service client sends a TenantId, and to look up the correct Tenant we must
/// resolve this to a fully qualified TenantShardId.
///
/// During shard splits: we shall see parent shards in InProgress state and skip them, and
/// instead match on child shards which should appear in Attached state. Very early in a shard
/// split, or in other cases where a shard is InProgress, we will return our own InProgress result
/// to instruct the caller to wait for that to finish before querying again.
pub(crate) fn resolve_attached_shard(
&self,
tenant_id: &TenantId,
selector: ShardSelector,
) -> ShardResolveResult {
let tenants = self.tenants.read().unwrap();
let mut want_shard = None;
let mut any_in_progress = None;
match &*tenants {
TenantsMap::Initializing => ShardResolveResult::NotFound,
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
for slot in m.range(TenantShardId::tenant_range(*tenant_id)) {
// Ignore all slots that don't contain an attached tenant
let tenant = match &slot.1 {
TenantSlot::Attached(t) => t,
TenantSlot::InProgress(barrier) => {
// We might still find a usable shard, but in case we don't, remember that
// we saw at least one InProgress slot, so that we can distinguish this case
// from a simple NotFound in our return value.
any_in_progress = Some(barrier.clone());
continue;
}
_ => continue,
};
match selector {
ShardSelector::First => return ShardResolveResult::Found(tenant.clone()),
ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
return ShardResolveResult::Found(tenant.clone())
}
ShardSelector::Page(key) => {
// First slot we see for this tenant, calculate the expected shard number
// for the key: we will use this for checking if this and subsequent
// slots contain the key, rather than recalculating the hash each time.
if want_shard.is_none() {
want_shard = Some(tenant.shard_identity.get_shard_number(&key));
}
if Some(tenant.shard_identity.number) == want_shard {
return ShardResolveResult::Found(tenant.clone());
}
}
_ => continue,
}
}
// Fall through: we didn't find a slot that was in Attached state & matched our selector. If
// we found one or more InProgress slot, indicate to caller that they should retry later. Otherwise
// this requested shard simply isn't found.
if let Some(barrier) = any_in_progress {
ShardResolveResult::InProgress(barrier)
} else {
ShardResolveResult::NotFound
}
}
}
}
}
#[derive(Debug, thiserror::Error)]
@@ -2130,6 +2151,105 @@ pub(crate) enum GetActiveTenantError {
Broken(String),
}
/// Get a [`Tenant`] in its active state. If the tenant_id is currently in [`TenantSlot::InProgress`]
/// state, then wait for up to `timeout`. If the [`Tenant`] is not currently in [`TenantState::Active`],
/// then wait for up to `timeout` (minus however long we waited for the slot).
pub(crate) async fn get_active_tenant_with_timeout(
tenant_id: TenantId,
shard_selector: ShardSelector,
timeout: Duration,
cancel: &CancellationToken,
) -> Result<Arc<Tenant>, GetActiveTenantError> {
enum WaitFor {
Barrier(utils::completion::Barrier),
Tenant(Arc<Tenant>),
}
let wait_start = Instant::now();
let deadline = wait_start + timeout;
let (wait_for, tenant_shard_id) = {
let locked = TENANTS.read().unwrap();
// Resolve TenantId to TenantShardId
let tenant_shard_id = locked
.resolve_attached_shard(&tenant_id, shard_selector)
.ok_or(GetActiveTenantError::NotFound(GetTenantError::NotFound(
tenant_id,
)))?;
let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
.map_err(GetTenantError::MapState)?;
match peek_slot {
Some(TenantSlot::Attached(tenant)) => {
match tenant.current_state() {
TenantState::Active => {
// Fast path: we don't need to do any async waiting.
return Ok(tenant.clone());
}
_ => {
tenant.activate_now();
(WaitFor::Tenant(tenant.clone()), tenant_shard_id)
}
}
}
Some(TenantSlot::Secondary(_)) => {
return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive(
tenant_shard_id,
)))
}
Some(TenantSlot::InProgress(barrier)) => {
(WaitFor::Barrier(barrier.clone()), tenant_shard_id)
}
None => {
return Err(GetActiveTenantError::NotFound(GetTenantError::NotFound(
tenant_id,
)))
}
}
};
let tenant = match wait_for {
WaitFor::Barrier(barrier) => {
tracing::debug!("Waiting for tenant InProgress state to pass...");
timeout_cancellable(
deadline.duration_since(Instant::now()),
cancel,
barrier.wait(),
)
.await
.map_err(|e| match e {
TimeoutCancellableError::Timeout => GetActiveTenantError::WaitForActiveTimeout {
latest_state: None,
wait_time: wait_start.elapsed(),
},
TimeoutCancellableError::Cancelled => GetActiveTenantError::Cancelled,
})?;
{
let locked = TENANTS.read().unwrap();
let peek_slot =
tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
.map_err(GetTenantError::MapState)?;
match peek_slot {
Some(TenantSlot::Attached(tenant)) => tenant.clone(),
_ => {
return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive(
tenant_shard_id,
)))
}
}
}
}
WaitFor::Tenant(tenant) => tenant,
};
tracing::debug!("Waiting for tenant to enter active state...");
tenant
.wait_to_become_active(deadline.duration_since(Instant::now()))
.await?;
Ok(tenant)
}
#[derive(Debug, thiserror::Error)]
pub(crate) enum DeleteTimelineError {
#[error("Tenant {0}")]
@@ -2156,7 +2276,7 @@ pub(crate) async fn load_tenant(
tenant_id: TenantId,
generation: Generation,
broker_client: storage_broker::BrokerClientChannel,
remote_storage: GenericRemoteStorage,
remote_storage: Option<GenericRemoteStorage>,
deletion_queue_client: DeletionQueueClient,
ctx: &RequestContext,
) -> Result<(), TenantMapInsertError> {
@@ -2760,73 +2880,86 @@ use {
utils::http::error::ApiError,
};
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id))]
pub(crate) async fn immediate_gc(
pub(crate) fn immediate_gc(
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
gc_req: TimelineGcRequest,
cancel: CancellationToken,
ctx: &RequestContext,
) -> Result<GcResult, ApiError> {
let tenant = {
let guard = TENANTS.read().unwrap();
guard
.get(&tenant_shard_id)
.cloned()
.with_context(|| format!("tenant {tenant_shard_id}"))
.map_err(|e| ApiError::NotFound(e.into()))?
};
) -> Result<tokio::sync::oneshot::Receiver<Result<GcResult, anyhow::Error>>, ApiError> {
let guard = TENANTS.read().unwrap();
let tenant = guard
.get(&tenant_shard_id)
.cloned()
.with_context(|| format!("tenant {tenant_shard_id}"))
.map_err(|e| ApiError::NotFound(e.into()))?;
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
// Use tenant's pitr setting
let pitr = tenant.get_pitr_interval();
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
// Run in task_mgr to avoid race with tenant_detach operation
let ctx: RequestContext =
ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
let ctx = ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
let span = info_span!("manual_gc", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id);
let _gate_guard = tenant.gate.enter().map_err(|_| ApiError::ShuttingDown)?;
// TODO: spawning is redundant now, need to hold the gate
task_mgr::spawn(
&tokio::runtime::Handle::current(),
TaskKind::GarbageCollector,
Some(tenant_shard_id),
Some(timeline_id),
&format!("timeline_gc_handler garbage collection run for tenant {tenant_shard_id} timeline {timeline_id}"),
false,
async move {
fail::fail_point!("immediate_gc_task_pre");
fail::fail_point!("immediate_gc_task_pre");
#[allow(unused_mut)]
let mut result = tenant
.gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx)
.await;
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
// better once the types support it.
#[allow(unused_mut)]
let mut result = tenant
.gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx)
.await;
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
// better once the types support it.
#[cfg(feature = "testing")]
{
// we need to synchronize with drop completion for python tests without polling for
// log messages
if let Ok(result) = result.as_mut() {
let mut js = tokio::task::JoinSet::new();
for layer in std::mem::take(&mut result.doomed_layers) {
js.spawn(layer.wait_drop());
}
tracing::info!(total = js.len(), "starting to wait for the gc'd layers to be dropped");
while let Some(res) = js.join_next().await {
res.expect("wait_drop should not panic");
}
}
#[cfg(feature = "testing")]
{
// we need to synchronize with drop completion for python tests without polling for
// log messages
if let Ok(result) = result.as_mut() {
let mut js = tokio::task::JoinSet::new();
for layer in std::mem::take(&mut result.doomed_layers) {
js.spawn(layer.wait_drop());
let timeline = tenant.get_timeline(timeline_id, false).ok();
let rtc = timeline.as_ref().and_then(|x| x.remote_client.as_ref());
if let Some(rtc) = rtc {
// layer drops schedule actions on remote timeline client to actually do the
// deletions; don't care about the shutdown error, just exit fast
drop(rtc.wait_completion().await);
}
}
tracing::info!(
total = js.len(),
"starting to wait for the gc'd layers to be dropped"
);
while let Some(res) = js.join_next().await {
res.expect("wait_drop should not panic");
match task_done.send(result) {
Ok(_) => (),
Err(result) => error!("failed to send gc result: {result:?}"),
}
Ok(())
}
.instrument(span)
);
let timeline = tenant.get_timeline(timeline_id, false).ok();
let rtc = timeline.as_ref().map(|x| &x.remote_client);
// drop the guard until after we've spawned the task so that timeline shutdown will wait for the task
drop(guard);
if let Some(rtc) = rtc {
// layer drops schedule actions on remote timeline client to actually do the
// deletions; don't care about the shutdown error, just exit fast
drop(rtc.wait_completion().await);
}
}
result.map_err(ApiError::InternalServerError)
Ok(wait_task_done)
}
#[cfg(test)]

View File

@@ -189,7 +189,6 @@ use camino::Utf8Path;
use chrono::{NaiveDateTime, Utc};
pub(crate) use download::download_initdb_tar_zst;
use pageserver_api::models::AuxFilePolicy;
use pageserver_api::shard::{ShardIndex, TenantShardId};
use scopeguard::ScopeGuard;
use tokio_util::sync::CancellationToken;
@@ -241,7 +240,7 @@ use utils::id::{TenantId, TimelineId};
use self::index::IndexPart;
use super::metadata::MetadataUpdate;
use super::storage_layer::{Layer, LayerName, ResidentLayer};
use super::storage_layer::{Layer, LayerFileName, ResidentLayer};
use super::upload_queue::SetDeletedFlagProgress;
use super::Generation;
@@ -318,7 +317,7 @@ pub struct RemoteTimelineClient {
upload_queue: Mutex<UploadQueue>,
pub(crate) metrics: Arc<RemoteTimelineClientMetrics>,
metrics: Arc<RemoteTimelineClientMetrics>,
storage_impl: GenericRemoteStorage,
@@ -438,19 +437,6 @@ impl RemoteTimelineClient {
}
}
/// Returns true if this timeline was previously detached at this Lsn and the remote timeline
/// client is currently initialized.
pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool {
// technically this is a dirty read, but given how timeline detach ancestor is implemented
// via tenant restart, the lineage has always been uploaded.
self.upload_queue
.lock()
.unwrap()
.initialized_mut()
.map(|uq| uq.latest_lineage.is_previous_ancestor_lsn(lsn))
.unwrap_or(false)
}
fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) {
let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part {
current_remote_index_part
@@ -462,11 +448,11 @@ impl RemoteTimelineClient {
} else {
0
};
self.metrics.remote_physical_size_gauge.set(size);
self.metrics.remote_physical_size_set(size);
}
pub fn get_remote_physical_size(&self) -> u64 {
self.metrics.remote_physical_size_gauge.get()
self.metrics.remote_physical_size_get()
}
//
@@ -517,9 +503,8 @@ impl RemoteTimelineClient {
/// On success, returns the size of the downloaded file.
pub async fn download_layer_file(
&self,
layer_file_name: &LayerName,
layer_file_name: &LayerFileName,
layer_metadata: &LayerFileMetadata,
local_path: &Utf8Path,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> anyhow::Result<u64> {
@@ -538,7 +523,6 @@ impl RemoteTimelineClient {
self.timeline_id,
layer_file_name,
layer_metadata,
local_path,
cancel,
ctx,
)
@@ -612,17 +596,6 @@ impl RemoteTimelineClient {
Ok(())
}
/// Launch an index-file upload operation in the background, with only aux_file_policy flag updated.
pub(crate) fn schedule_index_upload_for_aux_file_policy_update(
self: &Arc<Self>,
last_aux_file_policy: Option<AuxFilePolicy>,
) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
upload_queue.last_aux_file_policy = last_aux_file_policy;
self.schedule_index_upload(upload_queue);
Ok(())
}
///
/// Launch an index-file upload operation in the background, if necessary.
///
@@ -655,7 +628,7 @@ impl RemoteTimelineClient {
);
let index_part = IndexPart::from(&*upload_queue);
let op = UploadOp::UploadMetadata(Box::new(index_part), disk_consistent_lsn);
let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
self.metric_begin(&op);
upload_queue.queued_operations.push_back(op);
upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0;
@@ -674,14 +647,7 @@ impl RemoteTimelineClient {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
let Some(prev) = upload_queue.latest_metadata.ancestor_timeline() else {
return Err(anyhow::anyhow!(
"cannot reparent without a current ancestor"
));
};
upload_queue.latest_metadata.reparent(new_parent);
upload_queue.latest_lineage.record_previous_ancestor(&prev);
self.schedule_index_upload(upload_queue);
@@ -704,13 +670,14 @@ impl RemoteTimelineClient {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
upload_queue.latest_metadata.detach_from_ancestor(&adopted);
upload_queue.latest_lineage.record_detaching(&adopted);
upload_queue
.latest_metadata
.detach_from_ancestor(&adopted.0, &adopted.1);
for layer in layers {
upload_queue
.latest_files
.insert(layer.layer_desc().layer_name(), layer.metadata());
.insert(layer.layer_desc().filename(), layer.metadata());
}
self.schedule_index_upload(upload_queue);
@@ -746,7 +713,7 @@ impl RemoteTimelineClient {
upload_queue
.latest_files
.insert(layer.layer_desc().layer_name(), metadata.clone());
.insert(layer.layer_desc().filename(), metadata.clone());
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
info!(
@@ -770,7 +737,7 @@ impl RemoteTimelineClient {
/// successfully.
pub fn schedule_layer_file_deletion(
self: &Arc<Self>,
names: &[LayerName],
names: &[LayerFileName],
) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
@@ -798,7 +765,7 @@ impl RemoteTimelineClient {
// the layer files as "dangling". this is fine, at worst case we create work for the
// scrubber.
let names = gc_layers.iter().map(|x| x.layer_desc().layer_name());
let names = gc_layers.iter().map(|x| x.layer_desc().filename());
self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);
@@ -813,9 +780,9 @@ impl RemoteTimelineClient {
self: &Arc<Self>,
upload_queue: &mut UploadQueueInitialized,
names: I,
) -> Vec<(LayerName, LayerFileMetadata)>
) -> Vec<(LayerFileName, LayerFileMetadata)>
where
I: IntoIterator<Item = LayerName>,
I: IntoIterator<Item = LayerFileName>,
{
// Decorate our list of names with each name's metadata, dropping
// names that are unexpectedly missing from our metadata. This metadata
@@ -865,7 +832,7 @@ impl RemoteTimelineClient {
/// `index_part.json` with [`Self::schedule_gc_update`] or [`Self::schedule_compaction_update`].
pub(crate) fn schedule_deletion_of_unlinked(
self: &Arc<Self>,
layers: Vec<(LayerName, LayerFileMetadata)>,
layers: Vec<(LayerFileName, LayerFileMetadata)>,
) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
@@ -878,7 +845,7 @@ impl RemoteTimelineClient {
fn schedule_deletion_of_unlinked0(
self: &Arc<Self>,
upload_queue: &mut UploadQueueInitialized,
mut with_metadata: Vec<(LayerName, LayerFileMetadata)>,
mut with_metadata: Vec<(LayerFileName, LayerFileMetadata)>,
) {
// Filter out any layers which were not created by this tenant shard. These are
// layers that originate from some ancestor shard after a split, and may still
@@ -947,7 +914,7 @@ impl RemoteTimelineClient {
self.schedule_layer_file_upload0(upload_queue, layer.clone());
}
let names = compacted_from.iter().map(|x| x.layer_desc().layer_name());
let names = compacted_from.iter().map(|x| x.layer_desc().filename());
self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);
self.launch_queued_tasks(upload_queue);
@@ -1141,11 +1108,6 @@ impl RemoteTimelineClient {
Ok(())
}
pub(crate) fn is_deleting(&self) -> bool {
let mut locked = self.upload_queue.lock().unwrap();
locked.stopped_mut().is_ok()
}
pub(crate) async fn preserve_initdb_archive(
self: &Arc<Self>,
tenant_id: &TenantId,
@@ -1182,7 +1144,7 @@ impl RemoteTimelineClient {
&self.tenant_shard_id.tenant_id,
&self.timeline_id,
self.tenant_shard_id.to_index(),
&uploaded.layer_desc().layer_name(),
&uploaded.layer_desc().filename(),
uploaded.metadata().generation,
);
@@ -1223,7 +1185,7 @@ impl RemoteTimelineClient {
.get_timeline_id()
.expect("Source timeline should be alive"),
self.tenant_shard_id.to_index(),
&adopted.layer_desc().layer_name(),
&adopted.layer_desc().filename(),
adopted.metadata().generation,
);
@@ -1231,7 +1193,7 @@ impl RemoteTimelineClient {
&self.tenant_shard_id.tenant_id,
&self.timeline_id,
self.tenant_shard_id.to_index(),
&adopted_as.layer_desc().layer_name(),
&adopted_as.layer_desc().filename(),
adopted_as.metadata().generation,
);
@@ -1565,7 +1527,7 @@ impl RemoteTimelineClient {
&self.tenant_shard_id.tenant_id,
&self.timeline_id,
layer_metadata.shard,
&layer.layer_desc().layer_name(),
&layer.layer_desc().filename(),
layer_metadata.generation,
);
@@ -1849,7 +1811,6 @@ impl RemoteTimelineClient {
latest_files: initialized.latest_files.clone(),
latest_files_changes_since_metadata_upload_scheduled: 0,
latest_metadata: initialized.latest_metadata.clone(),
latest_lineage: initialized.latest_lineage.clone(),
projected_remote_consistent_lsn: None,
visible_remote_consistent_lsn: initialized
.visible_remote_consistent_lsn
@@ -1863,7 +1824,6 @@ impl RemoteTimelineClient {
dangling_files: HashMap::default(),
shutting_down: false,
shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
last_aux_file_policy: initialized.last_aux_file_policy,
};
let upload_queue = std::mem::replace(
@@ -1936,14 +1896,14 @@ pub fn remote_layer_path(
tenant_id: &TenantId,
timeline_id: &TimelineId,
shard: ShardIndex,
layer_file_name: &LayerName,
layer_file_name: &LayerFileName,
generation: Generation,
) -> RemotePath {
// Generation-aware key format
let path = format!(
"tenants/{tenant_id}{0}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{1}{2}",
shard.get_suffix(),
layer_file_name,
layer_file_name.file_name(),
generation.get_suffix()
);
@@ -2040,8 +2000,8 @@ mod tests {
TimelineMetadata::from_bytes(&metadata.to_bytes().unwrap()).unwrap()
}
fn assert_file_list(a: &HashSet<LayerName>, b: &[&str]) {
let mut avec: Vec<String> = a.iter().map(|x| x.to_string()).collect();
fn assert_file_list(a: &HashSet<LayerFileName>, b: &[&str]) {
let mut avec: Vec<String> = a.iter().map(|x| x.file_name()).collect();
avec.sort();
let mut bvec = b.to_vec();
@@ -2152,7 +2112,7 @@ mod tests {
tenant_ctx: _tenant_ctx,
} = test_setup;
let client = &timeline.remote_client;
let client = timeline.remote_client.as_ref().unwrap();
// Download back the index.json, and check that the list of files is correct
let initial_index_part = match client
@@ -2167,7 +2127,7 @@ mod tests {
.layer_metadata
.keys()
.map(|f| f.to_owned())
.collect::<HashSet<LayerName>>();
.collect::<HashSet<LayerFileName>>();
let initial_layer = {
assert!(initial_layers.len() == 1);
initial_layers.into_iter().next().unwrap()
@@ -2193,7 +2153,7 @@ mod tests {
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap(), dummy_contents("baz"))
]
.into_iter()
.map(|(name, contents): (LayerName, Vec<u8>)| {
.map(|(name, contents): (LayerFileName, Vec<u8>)| {
let local_path = local_layer_path(
harness.conf,
@@ -2274,9 +2234,9 @@ mod tests {
.map(|f| f.to_owned())
.collect(),
&[
&initial_layer.to_string(),
&layers[0].layer_desc().layer_name().to_string(),
&layers[1].layer_desc().layer_name().to_string(),
&initial_layer.file_name(),
&layers[0].layer_desc().filename().file_name(),
&layers[1].layer_desc().filename().file_name(),
],
);
assert_eq!(index_part.metadata, metadata);
@@ -2290,7 +2250,7 @@ mod tests {
// keep using schedule_layer_file_deletion because we don't have a way to wait for the
// spawn_blocking started by the drop.
client
.schedule_layer_file_deletion(&[layers[0].layer_desc().layer_name()])
.schedule_layer_file_deletion(&[layers[0].layer_desc().filename()])
.unwrap();
{
let mut guard = client.upload_queue.lock().unwrap();
@@ -2308,9 +2268,9 @@ mod tests {
}
assert_remote_files(
&[
&initial_layer.to_string(),
&layers[0].layer_desc().layer_name().to_string(),
&layers[1].layer_desc().layer_name().to_string(),
&initial_layer.file_name(),
&layers[0].layer_desc().filename().file_name(),
&layers[1].layer_desc().filename().file_name(),
"index_part.json",
],
&remote_timeline_dir,
@@ -2323,9 +2283,9 @@ mod tests {
assert_remote_files(
&[
&initial_layer.to_string(),
&layers[1].layer_desc().layer_name().to_string(),
&layers[2].layer_desc().layer_name().to_string(),
&initial_layer.file_name(),
&layers[1].layer_desc().filename().file_name(),
&layers[2].layer_desc().filename().file_name(),
"index_part.json",
],
&remote_timeline_dir,
@@ -2343,9 +2303,9 @@ mod tests {
timeline,
..
} = TestSetup::new("metrics").await.unwrap();
let client = &timeline.remote_client;
let client = timeline.remote_client.as_ref().unwrap();
let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
let local_path = local_layer_path(
harness.conf,
&timeline.tenant_shard_id,

View File

@@ -21,7 +21,8 @@ use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
use crate::tenant::storage_layer::LayerName;
use crate::tenant::storage_layer::layer::local_layer_path;
use crate::tenant::storage_layer::LayerFileName;
use crate::tenant::Generation;
use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
use crate::TEMP_FILE_SUFFIX;
@@ -47,15 +48,21 @@ pub async fn download_layer_file<'a>(
storage: &'a GenericRemoteStorage,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
layer_file_name: &'a LayerName,
layer_file_name: &'a LayerFileName,
layer_metadata: &'a LayerFileMetadata,
local_path: &Utf8Path,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<u64, DownloadError> {
debug_assert_current_span_has_tenant_and_timeline_id();
let timeline_path = conf.timeline_path(&tenant_shard_id, &timeline_id);
let local_path = local_layer_path(
conf,
&tenant_shard_id,
&timeline_id,
layer_file_name,
&layer_metadata.generation,
);
let remote_path = remote_layer_path(
&tenant_shard_id.tenant_id,
@@ -75,7 +82,7 @@ pub async fn download_layer_file<'a>(
// For more context about durable_rename check this email from postgres mailing list:
// https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com
// If pageserver crashes the temp file will be deleted on startup and re-downloaded.
let temp_file_path = path_with_suffix_extension(local_path, TEMP_DOWNLOAD_EXTENSION);
let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);
let bytes_amount = download_retry(
|| async { download_object(storage, &remote_path, &temp_file_path, cancel, ctx).await },
@@ -105,17 +112,14 @@ pub async fn download_layer_file<'a>(
// We use fatal_err() below because the after the rename above,
// the in-memory state of the filesystem already has the layer file in its final place,
// and subsequent pageserver code could think it's durable while it really isn't.
let work = {
let ctx = ctx.detached_child(ctx.task_kind(), ctx.download_behavior());
async move {
let timeline_dir = VirtualFile::open(&timeline_path, &ctx)
.await
.fatal_err("VirtualFile::open for timeline dir fsync");
timeline_dir
.sync_all()
.await
.fatal_err("VirtualFile::sync_all timeline dir");
}
let work = async move {
let timeline_dir = VirtualFile::open(&timeline_path)
.await
.fatal_err("VirtualFile::open for timeline dir fsync");
timeline_dir
.sync_all()
.await
.fatal_err("VirtualFile::sync_all timeline dir");
};
crate::virtual_file::io_engine::get()
.spawn_blocking_and_block_on_if_std(work)
@@ -192,7 +196,7 @@ async fn download_object<'a>(
use crate::virtual_file::owned_buffers_io::{self, util::size_tracking_writer};
use bytes::BytesMut;
async {
let destination_file = VirtualFile::create(dst_path, ctx)
let destination_file = VirtualFile::create(dst_path)
.await
.with_context(|| format!("create a destination file for layer '{dst_path}'"))
.map_err(DownloadError::Other)?;

View File

@@ -5,12 +5,10 @@
use std::collections::HashMap;
use chrono::NaiveDateTime;
use pageserver_api::models::AuxFilePolicy;
use serde::{Deserialize, Serialize};
use utils::id::TimelineId;
use crate::tenant::metadata::TimelineMetadata;
use crate::tenant::storage_layer::LayerName;
use crate::tenant::storage_layer::LayerFileName;
use crate::tenant::upload_queue::UploadQueueInitialized;
use crate::tenant::Generation;
use pageserver_api::shard::ShardIndex;
@@ -77,7 +75,7 @@ pub struct IndexPart {
///
/// Older versions of `IndexPart` will not have this property or have only a part of metadata
/// that latest version stores.
pub layer_metadata: HashMap<LayerName, IndexLayerMetadata>,
pub layer_metadata: HashMap<LayerFileName, IndexLayerMetadata>,
// 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
// It's duplicated for convenience when reading the serialized structure, but is
@@ -86,19 +84,6 @@ pub struct IndexPart {
#[serde(rename = "metadata_bytes")]
pub metadata: TimelineMetadata,
#[serde(default)]
pub(crate) lineage: Lineage,
/// Describes the kind of aux files stored in the timeline.
///
/// The value is modified during file ingestion when the latest wanted value communicated via tenant config is applied if it is acceptable.
/// A V1 setting after V2 files have been committed is not accepted.
///
/// None means no aux files have been written to the storage before the point
/// when this flag is introduced.
#[serde(skip_serializing_if = "Option::is_none", default)]
pub(crate) last_aux_file_policy: Option<AuxFilePolicy>,
}
impl IndexPart {
@@ -111,21 +96,17 @@ impl IndexPart {
/// - 3: no longer deserialize `timeline_layers` (serialized format is the same, but timeline_layers
/// is always generated from the keys of `layer_metadata`)
/// - 4: timeline_layers is fully removed.
/// - 5: lineage was added
/// - 6: last_aux_file_policy is added.
const LATEST_VERSION: usize = 6;
const LATEST_VERSION: usize = 4;
// Versions we may see when reading from a bucket.
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6];
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4];
pub const FILE_NAME: &'static str = "index_part.json";
fn new(
layers_and_metadata: &HashMap<LayerName, LayerFileMetadata>,
layers_and_metadata: &HashMap<LayerFileName, LayerFileMetadata>,
disk_consistent_lsn: Lsn,
metadata: TimelineMetadata,
lineage: Lineage,
last_aux_file_policy: Option<AuxFilePolicy>,
) -> Self {
let layer_metadata = layers_and_metadata
.iter()
@@ -138,8 +119,6 @@ impl IndexPart {
disk_consistent_lsn,
metadata,
deleted_at: None,
lineage,
last_aux_file_policy,
}
}
@@ -168,29 +147,16 @@ impl IndexPart {
&HashMap::new(),
example_metadata.disk_consistent_lsn(),
example_metadata,
Default::default(),
Some(AuxFilePolicy::V1),
)
}
pub(crate) fn last_aux_file_policy(&self) -> Option<AuxFilePolicy> {
self.last_aux_file_policy
}
}
impl From<&UploadQueueInitialized> for IndexPart {
fn from(uq: &UploadQueueInitialized) -> Self {
let disk_consistent_lsn = uq.latest_metadata.disk_consistent_lsn();
let metadata = uq.latest_metadata.clone();
let lineage = uq.latest_lineage.clone();
Self::new(
&uq.latest_files,
disk_consistent_lsn,
metadata,
lineage,
uq.last_aux_file_policy,
)
Self::new(&uq.latest_files, disk_consistent_lsn, metadata)
}
}
@@ -218,76 +184,8 @@ impl From<&LayerFileMetadata> for IndexLayerMetadata {
}
}
/// Limited history of earlier ancestors.
///
/// A timeline can have more than 1 earlier ancestor, in the rare case that it was repeatedly
/// reparented by having an later timeline be detached from it's ancestor.
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)]
pub(crate) struct Lineage {
/// Has the `reparenting_history` been truncated to [`Lineage::REMEMBER_AT_MOST`].
#[serde(skip_serializing_if = "is_false", default)]
reparenting_history_truncated: bool,
/// Earlier ancestors, truncated when [`Self::reparenting_history_truncated`]
///
/// These are stored in case we want to support WAL based DR on the timeline. There can be many
/// of these and at most one [`Self::original_ancestor`]. There cannot be more reparentings
/// after [`Self::original_ancestor`] has been set.
#[serde(skip_serializing_if = "Vec::is_empty", default)]
reparenting_history: Vec<TimelineId>,
/// The ancestor from which this timeline has been detached from and when.
///
/// If you are adding support for detaching from a hierarchy, consider changing the ancestry
/// into a `Vec<(TimelineId, Lsn)>` to be a path instead.
#[serde(skip_serializing_if = "Option::is_none", default)]
original_ancestor: Option<(TimelineId, Lsn, NaiveDateTime)>,
}
fn is_false(b: &bool) -> bool {
!b
}
impl Lineage {
const REMEMBER_AT_MOST: usize = 100;
pub(crate) fn record_previous_ancestor(&mut self, old_ancestor: &TimelineId) {
if self.reparenting_history.last() == Some(old_ancestor) {
// do not re-record it
return;
}
let drop_oldest = self.reparenting_history.len() + 1 >= Self::REMEMBER_AT_MOST;
self.reparenting_history_truncated |= drop_oldest;
if drop_oldest {
self.reparenting_history.remove(0);
}
self.reparenting_history.push(*old_ancestor);
}
pub(crate) fn record_detaching(&mut self, branchpoint: &(TimelineId, Lsn)) {
assert!(self.original_ancestor.is_none());
self.original_ancestor =
Some((branchpoint.0, branchpoint.1, chrono::Utc::now().naive_utc()));
}
/// The queried lsn is most likely the basebackup lsn, and this answers question "is it allowed
/// to start a read/write primary at this lsn".
///
/// Returns true if the Lsn was previously a branch point.
pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool {
self.original_ancestor
.as_ref()
.is_some_and(|(_, ancestor_lsn, _)| lsn == *ancestor_lsn)
}
}
#[cfg(test)]
mod tests {
use std::str::FromStr;
use super::*;
#[test]
@@ -323,8 +221,6 @@ mod tests {
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
deleted_at: None,
lineage: Lineage::default(),
last_aux_file_policy: None,
};
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
@@ -365,8 +261,6 @@ mod tests {
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
deleted_at: None,
lineage: Lineage::default(),
last_aux_file_policy: None,
};
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
@@ -408,9 +302,7 @@ mod tests {
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
lineage: Lineage::default(),
last_aux_file_policy: None,
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
};
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
@@ -455,8 +347,6 @@ mod tests {
])
.unwrap(),
deleted_at: None,
lineage: Lineage::default(),
last_aux_file_policy: None,
};
let empty_layers_parsed = IndexPart::from_s3_bytes(empty_layers_json.as_bytes()).unwrap();
@@ -495,110 +385,11 @@ mod tests {
]),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
lineage: Lineage::default(),
last_aux_file_policy: None,
};
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
assert_eq!(part, expected);
}
#[test]
fn v5_indexpart_is_parsed() {
let example = r#"{
"version":5,
"layer_metadata":{
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499":{"file_size":23289856,"generation":1},
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619":{"file_size":1015808,"generation":1}},
"disk_consistent_lsn":"0/15A7618",
"metadata_bytes":[226,88,25,241,0,46,0,4,0,0,0,0,1,90,118,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,78,244,32,0,0,0,0,1,78,244,32,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
"lineage":{
"original_ancestor":["e2bfd8c633d713d279e6fcd2bcc15b6d","0/15A7618","2024-05-07T18:52:36.322426563"],
"reparenting_history":["e1bfd8c633d713d279e6fcd2bcc15b6d"]
}
}"#;
let expected = IndexPart {
version: 5,
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), IndexLayerMetadata {
file_size: 23289856,
generation: Generation::new(1),
shard: ShardIndex::unsharded(),
}),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), IndexLayerMetadata {
file_size: 1015808,
generation: Generation::new(1),
shard: ShardIndex::unsharded(),
})
]),
disk_consistent_lsn: Lsn::from_str("0/15A7618").unwrap(),
metadata: TimelineMetadata::from_bytes(&[226,88,25,241,0,46,0,4,0,0,0,0,1,90,118,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,78,244,32,0,0,0,0,1,78,244,32,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
deleted_at: None,
lineage: Lineage {
reparenting_history_truncated: false,
reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
},
last_aux_file_policy: None,
};
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
assert_eq!(part, expected);
}
#[test]
fn v6_indexpart_is_parsed() {
let example = r#"{
"version":6,
"layer_metadata":{
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
},
"disk_consistent_lsn":"0/16960E8",
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
"deleted_at": "2023-07-31T09:00:00.123",
"lineage":{
"original_ancestor":["e2bfd8c633d713d279e6fcd2bcc15b6d","0/15A7618","2024-05-07T18:52:36.322426563"],
"reparenting_history":["e1bfd8c633d713d279e6fcd2bcc15b6d"]
},
"last_aux_file_policy": "V2"
}"#;
let expected = IndexPart {
version: 6,
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
file_size: 25600000,
generation: Generation::none(),
shard: ShardIndex::unsharded()
}),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
// serde_json should always parse this but this might be a double with jq for
// example.
file_size: 9007199254741001,
generation: Generation::none(),
shard: ShardIndex::unsharded()
})
]),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
lineage: Lineage {
reparenting_history_truncated: false,
reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
},
last_aux_file_policy: Some(AuxFilePolicy::V2),
};
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
assert_eq!(part, expected);
}
fn parse_naive_datetime(s: &str) -> NaiveDateTime {
chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f").unwrap()
}
}

View File

@@ -6,9 +6,11 @@ mod scheduler;
use std::{sync::Arc, time::SystemTime};
use crate::{
config::PageServerConf,
context::RequestContext,
disk_usage_eviction_task::DiskUsageEvictionInfo,
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
virtual_file::MaybeFatalIo,
};
use self::{
@@ -19,8 +21,9 @@ use self::{
use super::{
config::{SecondaryLocationConfig, TenantConfOpt},
mgr::TenantManager,
remote_timeline_client::LayerFileMetadata,
span::debug_assert_current_span_has_tenant_id,
storage_layer::LayerName,
storage_layer::{layer::local_layer_path, LayerFileName},
};
use pageserver_api::{
@@ -175,7 +178,13 @@ impl SecondaryTenant {
/// Cancellation safe, but on cancellation the eviction will go through
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline_id, name=%name))]
pub(crate) async fn evict_layer(self: &Arc<Self>, timeline_id: TimelineId, name: LayerName) {
pub(crate) async fn evict_layer(
self: &Arc<Self>,
conf: &PageServerConf,
timeline_id: TimelineId,
name: LayerFileName,
metadata: LayerFileMetadata,
) {
debug_assert_current_span_has_tenant_id();
let guard = match self.gate.enter() {
@@ -188,11 +197,41 @@ impl SecondaryTenant {
let now = SystemTime::now();
let local_path = local_layer_path(
conf,
&self.tenant_shard_id,
&timeline_id,
&name,
&metadata.generation,
);
let this = self.clone();
// spawn it to be cancellation safe
tokio::task::spawn_blocking(move || {
let _guard = guard;
// We tolerate ENOENT, because between planning eviction and executing
// it, the secondary downloader could have seen an updated heatmap that
// resulted in a layer being deleted.
// Other local I/O errors are process-fatal: these should never happen.
let deleted = std::fs::remove_file(local_path);
let not_found = deleted
.as_ref()
.is_err_and(|x| x.kind() == std::io::ErrorKind::NotFound);
let deleted = if not_found {
false
} else {
deleted
.map(|()| true)
.fatal_err("Deleting layer during eviction")
};
if !deleted {
// skip updating accounting and putting perhaps later timestamp
return;
}
// Update the timeline's state. This does not have to be synchronized with
// the download process, because:
@@ -211,15 +250,8 @@ impl SecondaryTenant {
// of the cache.
let mut detail = this.detail.lock().unwrap();
if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) {
let removed = timeline_detail.on_disk_layers.remove(&name);
// We might race with removal of the same layer during downloads, if it was removed
// from the heatmap. If we see that the OnDiskState is gone, then no need to
// do a physical deletion or store in evicted_at.
if let Some(removed) = removed {
removed.remove_blocking();
timeline_detail.evicted_at.insert(name, now);
}
timeline_detail.on_disk_layers.remove(&name);
timeline_detail.evicted_at.insert(name, now);
}
})
.await

View File

@@ -22,11 +22,11 @@ use crate::{
FAILED_REMOTE_OP_RETRIES,
},
span::debug_assert_current_span_has_tenant_id,
storage_layer::{layer::local_layer_path, LayerName},
storage_layer::{layer::local_layer_path, LayerFileName},
tasks::{warn_when_period_overrun, BackgroundLoopKind},
},
virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile},
TEMP_FILE_SUFFIX,
METADATA_FILE_NAME, TEMP_FILE_SUFFIX,
};
use super::{
@@ -45,10 +45,10 @@ use crate::tenant::{
use camino::Utf8PathBuf;
use chrono::format::{DelayedFormat, StrftimeItems};
use futures::{Future, StreamExt};
use futures::Future;
use pageserver_api::models::SecondaryProgress;
use pageserver_api::shard::TenantShardId;
use remote_storage::{DownloadError, Etag, GenericRemoteStorage, RemoteStorageActivity};
use remote_storage::{DownloadError, Etag, GenericRemoteStorage};
use tokio_util::sync::CancellationToken;
use tracing::{info_span, instrument, warn, Instrument};
@@ -71,12 +71,6 @@ use super::{
/// `<ttps://github.com/neondatabase/neon/issues/6200>`
const DOWNLOAD_FRESHEN_INTERVAL: Duration = Duration::from_millis(60000);
/// Range of concurrency we may use when downloading layers within a timeline. This is independent
/// for each tenant we're downloading: the concurrency of _tenants_ is defined separately in
/// `PageServerConf::secondary_download_concurrency`
const MAX_LAYER_CONCURRENCY: usize = 16;
const MIN_LAYER_CONCURRENCY: usize = 1;
pub(super) async fn downloader_task(
tenant_manager: Arc<TenantManager>,
remote_storage: GenericRemoteStorage,
@@ -85,15 +79,14 @@ pub(super) async fn downloader_task(
cancel: CancellationToken,
root_ctx: RequestContext,
) {
// How many tenants' secondary download operations we will run concurrently
let tenant_concurrency = tenant_manager.get_conf().secondary_download_concurrency;
let concurrency = tenant_manager.get_conf().secondary_download_concurrency;
let generator = SecondaryDownloader {
tenant_manager,
remote_storage,
root_ctx,
};
let mut scheduler = Scheduler::new(generator, tenant_concurrency);
let mut scheduler = Scheduler::new(generator, concurrency);
scheduler
.run(command_queue, background_jobs_can_start, cancel)
@@ -111,7 +104,6 @@ struct SecondaryDownloader {
pub(super) struct OnDiskState {
metadata: LayerFileMetadata,
access_time: SystemTime,
local_path: Utf8PathBuf,
}
impl OnDiskState {
@@ -119,37 +111,23 @@ impl OnDiskState {
_conf: &'static PageServerConf,
_tenant_shard_id: &TenantShardId,
_imeline_id: &TimelineId,
_ame: LayerName,
_ame: LayerFileName,
metadata: LayerFileMetadata,
access_time: SystemTime,
local_path: Utf8PathBuf,
) -> Self {
Self {
metadata,
access_time,
local_path,
}
}
// This is infallible, because all errors are either acceptable (ENOENT), or totally
// unexpected (fatal).
pub(super) fn remove_blocking(&self) {
// We tolerate ENOENT, because between planning eviction and executing
// it, the secondary downloader could have seen an updated heatmap that
// resulted in a layer being deleted.
// Other local I/O errors are process-fatal: these should never happen.
std::fs::remove_file(&self.local_path)
.or_else(fs_ext::ignore_not_found)
.fatal_err("Deleting secondary layer")
}
}
#[derive(Debug, Clone, Default)]
pub(super) struct SecondaryDetailTimeline {
pub(super) on_disk_layers: HashMap<LayerName, OnDiskState>,
pub(super) on_disk_layers: HashMap<LayerFileName, OnDiskState>,
/// We remember when layers were evicted, to prevent re-downloading them.
pub(super) evicted_at: HashMap<LayerName, SystemTime>,
pub(super) evicted_at: HashMap<LayerFileName, SystemTime>,
}
/// This state is written by the secondary downloader, it is opaque
@@ -814,8 +792,6 @@ impl<'a> TenantDownloader<'a> {
tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
let mut download_futs = Vec::new();
// Download heatmap layers that are not present on local disk, or update their
// access time if they are already present.
for layer in timeline.layers {
@@ -831,12 +807,20 @@ impl<'a> TenantDownloader<'a> {
if cfg!(debug_assertions) {
// Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think
// are already present on disk are really there.
match tokio::fs::metadata(&on_disk.local_path).await {
let local_path = local_layer_path(
self.conf,
tenant_shard_id,
&timeline.timeline_id,
&layer.name,
&layer.metadata.generation,
);
match tokio::fs::metadata(&local_path).await {
Ok(meta) => {
tracing::debug!(
"Layer {} present at {}, size {}",
layer.name,
on_disk.local_path,
local_path,
meta.len(),
);
}
@@ -844,7 +828,7 @@ impl<'a> TenantDownloader<'a> {
tracing::warn!(
"Layer {} not found at {} ({})",
layer.name,
on_disk.local_path,
local_path,
e
);
debug_assert!(false);
@@ -890,33 +874,67 @@ impl<'a> TenantDownloader<'a> {
}
}
download_futs.push(self.download_layer(
tenant_shard_id,
&timeline.timeline_id,
layer,
// Failpoint for simulating slow remote storage
failpoint_support::sleep_millis_async!(
"secondary-layer-download-sleep",
&self.secondary_state.cancel
);
// Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
let downloaded_bytes = match download_layer_file(
self.conf,
self.remote_storage,
*tenant_shard_id,
timeline.timeline_id,
&layer.name,
&LayerFileMetadata::from(&layer.metadata),
&self.secondary_state.cancel,
ctx,
));
}
// Break up layer downloads into chunks, so that for each chunk we can re-check how much
// concurrency to use based on activity level of remote storage.
while !download_futs.is_empty() {
let chunk =
download_futs.split_off(download_futs.len().saturating_sub(MAX_LAYER_CONCURRENCY));
let concurrency = Self::layer_concurrency(self.remote_storage.activity());
let mut result_stream = futures::stream::iter(chunk).buffered(concurrency);
let mut result_stream = std::pin::pin!(result_stream);
while let Some(result) = result_stream.next().await {
match result {
Err(e) => return Err(e),
Ok(None) => {
// No error, but we didn't download the layer. Don't mark it touched
}
Ok(Some(layer)) => touched.push(layer),
)
.await
{
Ok(bytes) => bytes,
Err(DownloadError::NotFound) => {
// A heatmap might be out of date and refer to a layer that doesn't exist any more.
// This is harmless: continue to download the next layer. It is expected during compaction
// GC.
tracing::debug!(
"Skipped downloading missing layer {}, raced with compaction/gc?",
layer.name
);
continue;
}
Err(e) => return Err(e.into()),
};
if downloaded_bytes != layer.metadata.file_size {
let local_path = local_layer_path(
self.conf,
tenant_shard_id,
&timeline.timeline_id,
&layer.name,
&layer.metadata.generation,
);
tracing::warn!(
"Downloaded layer {} with unexpected size {} != {}. Removing download.",
layer.name,
downloaded_bytes,
layer.metadata.file_size
);
tokio::fs::remove_file(&local_path)
.await
.or_else(fs_ext::ignore_not_found)?;
} else {
tracing::info!("Downloaded layer {}, size {}", layer.name, downloaded_bytes);
let mut progress = self.secondary_state.progress.lock().unwrap();
progress.bytes_downloaded += downloaded_bytes;
progress.layers_downloaded += 1;
}
SECONDARY_MODE.download_layer.inc();
touched.push(layer)
}
// Write updates to state to record layers we just downloaded or touched.
@@ -933,13 +951,6 @@ impl<'a> TenantDownloader<'a> {
v.get_mut().access_time = t.access_time;
}
Entry::Vacant(e) => {
let local_path = local_layer_path(
self.conf,
tenant_shard_id,
&timeline.timeline_id,
&t.name,
&t.metadata.generation,
);
e.insert(OnDiskState::new(
self.conf,
tenant_shard_id,
@@ -947,7 +958,6 @@ impl<'a> TenantDownloader<'a> {
t.name,
LayerFileMetadata::from(&t.metadata),
t.access_time,
local_path,
));
}
}
@@ -956,99 +966,6 @@ impl<'a> TenantDownloader<'a> {
Ok(())
}
async fn download_layer(
&self,
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
layer: HeatMapLayer,
ctx: &RequestContext,
) -> Result<Option<HeatMapLayer>, UpdateError> {
// Failpoint for simulating slow remote storage
failpoint_support::sleep_millis_async!(
"secondary-layer-download-sleep",
&self.secondary_state.cancel
);
let local_path = local_layer_path(
self.conf,
tenant_shard_id,
timeline_id,
&layer.name,
&layer.metadata.generation,
);
// Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
let downloaded_bytes = match download_layer_file(
self.conf,
self.remote_storage,
*tenant_shard_id,
*timeline_id,
&layer.name,
&LayerFileMetadata::from(&layer.metadata),
&local_path,
&self.secondary_state.cancel,
ctx,
)
.await
{
Ok(bytes) => bytes,
Err(DownloadError::NotFound) => {
// A heatmap might be out of date and refer to a layer that doesn't exist any more.
// This is harmless: continue to download the next layer. It is expected during compaction
// GC.
tracing::debug!(
"Skipped downloading missing layer {}, raced with compaction/gc?",
layer.name
);
return Ok(None);
}
Err(e) => return Err(e.into()),
};
if downloaded_bytes != layer.metadata.file_size {
let local_path = local_layer_path(
self.conf,
tenant_shard_id,
timeline_id,
&layer.name,
&layer.metadata.generation,
);
tracing::warn!(
"Downloaded layer {} with unexpected size {} != {}. Removing download.",
layer.name,
downloaded_bytes,
layer.metadata.file_size
);
tokio::fs::remove_file(&local_path)
.await
.or_else(fs_ext::ignore_not_found)?;
} else {
tracing::info!("Downloaded layer {}, size {}", layer.name, downloaded_bytes);
let mut progress = self.secondary_state.progress.lock().unwrap();
progress.bytes_downloaded += downloaded_bytes;
progress.layers_downloaded += 1;
}
SECONDARY_MODE.download_layer.inc();
Ok(Some(layer))
}
/// Calculate the currently allowed parallelism of layer download tasks, based on activity level of the remote storage
fn layer_concurrency(activity: RemoteStorageActivity) -> usize {
// When less than 75% of units are available, use minimum concurrency. Else, do a linear mapping
// of our concurrency range to the units available within the remaining 25%.
let clamp_at = (activity.read_total * 3) / 4;
if activity.read_available > clamp_at {
(MAX_LAYER_CONCURRENCY * (activity.read_available - clamp_at))
/ (activity.read_total - clamp_at)
} else {
MIN_LAYER_CONCURRENCY
}
}
}
/// Scan local storage and build up Layer objects based on the metadata in a HeatMapTimeline
@@ -1080,7 +997,7 @@ async fn init_timeline_state(
// As we iterate through layers found on disk, we will look up their metadata from this map.
// Layers not present in metadata will be discarded.
let heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> =
let heatmap_metadata: HashMap<&LayerFileName, &HeatMapLayer> =
heatmap.layers.iter().map(|l| (&l.name, l)).collect();
while let Some(dentry) = dir
@@ -1098,7 +1015,11 @@ async fn init_timeline_state(
.fatal_err(&format!("Read metadata on {}", file_path));
let file_name = file_path.file_name().expect("created it from the dentry");
if crate::is_temporary(&file_path)
if file_name == METADATA_FILE_NAME {
// Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config");
continue;
} else if crate::is_temporary(&file_path)
|| is_temp_download_file(&file_path)
|| is_ephemeral_file(file_name)
{
@@ -1113,7 +1034,7 @@ async fn init_timeline_state(
continue;
}
match LayerName::from_str(file_name) {
match LayerFileName::from_str(file_name) {
Ok(name) => {
let remote_meta = heatmap_metadata.get(&name);
match remote_meta {
@@ -1140,7 +1061,6 @@ async fn init_timeline_state(
name,
LayerFileMetadata::from(&remote_meta.metadata),
remote_meta.access_time,
file_path,
),
);
}
@@ -1172,58 +1092,3 @@ async fn init_timeline_state(
detail
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn layer_concurrency() {
// Totally idle
assert_eq!(
TenantDownloader::layer_concurrency(RemoteStorageActivity {
read_available: 16,
read_total: 16,
write_available: 16,
write_total: 16
}),
MAX_LAYER_CONCURRENCY
);
// Totally busy
assert_eq!(
TenantDownloader::layer_concurrency(RemoteStorageActivity {
read_available: 0,
read_total: 16,
write_available: 16,
write_total: 16
}),
MIN_LAYER_CONCURRENCY
);
// Edge of the range at which we interpolate
assert_eq!(
TenantDownloader::layer_concurrency(RemoteStorageActivity {
read_available: 12,
read_total: 16,
write_available: 16,
write_total: 16
}),
MIN_LAYER_CONCURRENCY
);
// Midpoint of the range in which we interpolate
assert_eq!(
TenantDownloader::layer_concurrency(RemoteStorageActivity {
read_available: 14,
read_total: 16,
write_available: 16,
write_total: 16
}),
MAX_LAYER_CONCURRENCY / 2
);
}
}

View File

@@ -1,6 +1,8 @@
use std::time::SystemTime;
use crate::tenant::{remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerName};
use crate::tenant::{
remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerFileName,
};
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr, TimestampSeconds};
@@ -15,14 +17,6 @@ pub(super) struct HeatMapTenant {
pub(super) generation: Generation,
pub(super) timelines: Vec<HeatMapTimeline>,
/// Uploaders provide their own upload period in the heatmap, as a hint to downloaders
/// of how frequently it is worthwhile to check for updates.
///
/// This is optional for backward compat, and because we sometimes might upload
/// a heatmap explicitly via API for a tenant that has no periodic upload configured.
#[serde(default)]
pub(super) upload_period_ms: Option<u128>,
}
#[serde_as]
@@ -37,7 +31,7 @@ pub(crate) struct HeatMapTimeline {
#[serde_as]
#[derive(Serialize, Deserialize)]
pub(crate) struct HeatMapLayer {
pub(super) name: LayerName,
pub(super) name: LayerFileName,
pub(super) metadata: IndexLayerMetadata,
#[serde_as(as = "TimestampSeconds<i64>")]
@@ -48,7 +42,7 @@ pub(crate) struct HeatMapLayer {
impl HeatMapLayer {
pub(crate) fn new(
name: LayerName,
name: LayerFileName,
metadata: IndexLayerMetadata,
access_time: SystemTime,
) -> Self {
@@ -89,21 +83,4 @@ impl HeatMapTenant {
stats
}
pub(crate) fn strip_atimes(self) -> Self {
Self {
timelines: self
.timelines
.into_iter()
.map(|mut tl| {
for layer in &mut tl.layers {
layer.access_time = SystemTime::UNIX_EPOCH;
}
tl
})
.collect(),
generation: self.generation,
upload_period_ms: self.upload_period_ms,
}
}
}

View File

@@ -80,7 +80,7 @@ impl RunningJob for WriteInProgress {
struct UploadPending {
tenant: Arc<Tenant>,
last_upload: Option<LastUploadState>,
last_digest: Option<md5::Digest>,
target_time: Option<Instant>,
period: Option<Duration>,
}
@@ -94,7 +94,7 @@ impl scheduler::PendingJob for UploadPending {
struct WriteComplete {
tenant_shard_id: TenantShardId,
completed_at: Instant,
uploaded: Option<LastUploadState>,
digest: Option<md5::Digest>,
next_upload: Option<Instant>,
}
@@ -115,7 +115,10 @@ struct UploaderTenantState {
tenant: Weak<Tenant>,
/// Digest of the serialized heatmap that we last successfully uploaded
last_upload_state: Option<LastUploadState>,
///
/// md5 is generally a bad hash. We use it because it's convenient for interop with AWS S3's ETag,
/// which is also an md5sum.
last_digest: Option<md5::Digest>,
/// When the last upload attempt completed (may have been successful or failed)
last_upload: Option<Instant>,
@@ -184,7 +187,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
tenant: Arc::downgrade(&tenant),
last_upload: None,
next_upload: Some(now.checked_add(period_warmup(period)).unwrap_or(now)),
last_upload_state: None,
last_digest: None,
});
// Decline to do the upload if insufficient time has passed
@@ -192,10 +195,10 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
return;
}
let last_upload = state.last_upload_state.clone();
let last_digest = state.last_digest;
result.jobs.push(UploadPending {
tenant,
last_upload,
last_digest,
target_time: state.next_upload,
period: Some(period),
});
@@ -215,7 +218,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
) {
let UploadPending {
tenant,
last_upload,
last_digest,
target_time,
period,
} = job;
@@ -228,16 +231,16 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
let _completion = completion;
let started_at = Instant::now();
let uploaded = match upload_tenant_heatmap(remote_storage, &tenant, last_upload.clone()).await {
Ok(UploadHeatmapOutcome::Uploaded(uploaded)) => {
let digest = match upload_tenant_heatmap(remote_storage, &tenant, last_digest).await {
Ok(UploadHeatmapOutcome::Uploaded(digest)) => {
let duration = Instant::now().duration_since(started_at);
SECONDARY_MODE
.upload_heatmap_duration
.observe(duration.as_secs_f64());
SECONDARY_MODE.upload_heatmap.inc();
Some(uploaded)
Some(digest)
}
Ok(UploadHeatmapOutcome::NoChange | UploadHeatmapOutcome::Skipped) => last_upload,
Ok(UploadHeatmapOutcome::NoChange | UploadHeatmapOutcome::Skipped) => last_digest,
Err(UploadHeatmapError::Upload(e)) => {
tracing::warn!(
"Failed to upload heatmap for tenant {}: {e:#}",
@@ -248,11 +251,11 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
.upload_heatmap_duration
.observe(duration.as_secs_f64());
SECONDARY_MODE.upload_heatmap_errors.inc();
last_upload
last_digest
}
Err(UploadHeatmapError::Cancelled) => {
tracing::info!("Cancelled heatmap upload, shutting down");
last_upload
last_digest
}
};
@@ -274,7 +277,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
WriteComplete {
tenant_shard_id: *tenant.get_tenant_shard_id(),
completed_at: now,
uploaded,
digest,
next_upload,
}
}.instrument(info_span!(parent: None, "heatmap_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))))
@@ -296,7 +299,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
Ok(UploadPending {
// Ignore our state for last digest: this forces an upload even if nothing has changed
last_upload: None,
last_digest: None,
tenant,
target_time: None,
period: None,
@@ -309,7 +312,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
let WriteComplete {
tenant_shard_id,
completed_at,
uploaded,
digest,
next_upload,
} = completion;
use std::collections::hash_map::Entry;
@@ -319,7 +322,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
}
Entry::Occupied(mut entry) => {
entry.get_mut().last_upload = Some(completed_at);
entry.get_mut().last_upload_state = uploaded;
entry.get_mut().last_digest = digest;
entry.get_mut().next_upload = next_upload
}
}
@@ -328,7 +331,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
enum UploadHeatmapOutcome {
/// We successfully wrote to remote storage, with this digest.
Uploaded(LastUploadState),
Uploaded(md5::Digest),
/// We did not upload because the heatmap digest was unchanged since the last upload
NoChange,
/// We skipped the upload for some reason, such as tenant/timeline not ready
@@ -344,25 +347,12 @@ enum UploadHeatmapError {
Upload(#[from] anyhow::Error),
}
/// Digests describing the heatmap we most recently uploaded successfully.
///
/// md5 is generally a bad hash. We use it because it's convenient for interop with AWS S3's ETag,
/// which is also an md5sum.
#[derive(Clone)]
struct LastUploadState {
// Digest of json-encoded HeatMapTenant
uploaded_digest: md5::Digest,
// Digest without atimes set.
layers_only_digest: md5::Digest,
}
/// The inner upload operation. This will skip if `last_digest` is Some and matches the digest
/// of the object we would have uploaded.
async fn upload_tenant_heatmap(
remote_storage: GenericRemoteStorage,
tenant: &Arc<Tenant>,
last_upload: Option<LastUploadState>,
last_digest: Option<md5::Digest>,
) -> Result<UploadHeatmapOutcome, UploadHeatmapError> {
debug_assert_current_span_has_tenant_id();
@@ -378,7 +368,6 @@ async fn upload_tenant_heatmap(
let mut heatmap = HeatMapTenant {
timelines: Vec::new(),
generation,
upload_period_ms: tenant.get_heatmap_period().map(|p| p.as_millis()),
};
let timelines = tenant.timelines.lock().unwrap().clone();
@@ -407,31 +396,15 @@ async fn upload_tenant_heatmap(
// Serialize the heatmap
let bytes = serde_json::to_vec(&heatmap).map_err(|e| anyhow::anyhow!(e))?;
let bytes = bytes::Bytes::from(bytes);
let size = bytes.len();
// Drop out early if nothing changed since our last upload
let digest = md5::compute(&bytes);
if Some(&digest) == last_upload.as_ref().map(|d| &d.uploaded_digest) {
if Some(digest) == last_digest {
return Ok(UploadHeatmapOutcome::NoChange);
}
// Calculate a digest that omits atimes, so that we can distinguish actual changes in
// layers from changes only in atimes.
let heatmap_size_bytes = heatmap.get_stats().bytes;
let layers_only_bytes =
serde_json::to_vec(&heatmap.strip_atimes()).map_err(|e| anyhow::anyhow!(e))?;
let layers_only_digest = md5::compute(&layers_only_bytes);
if heatmap_size_bytes < tenant.get_checkpoint_distance() {
// For small tenants, skip upload if only atimes changed. This avoids doing frequent
// uploads from long-idle tenants whose atimes are just incremented by periodic
// size calculations.
if Some(&layers_only_digest) == last_upload.as_ref().map(|d| &d.layers_only_digest) {
return Ok(UploadHeatmapOutcome::NoChange);
}
}
let bytes = bytes::Bytes::from(bytes);
let size = bytes.len();
let path = remote_heatmap_path(tenant.get_tenant_shard_id());
let cancel = &tenant.cancel;
@@ -463,8 +436,5 @@ async fn upload_tenant_heatmap(
tracing::info!("Successfully uploaded {size} byte heatmap to {path}");
Ok(UploadHeatmapOutcome::Uploaded(LastUploadState {
uploaded_digest: digest,
layers_only_digest,
}))
Ok(UploadHeatmapOutcome::Uploaded(digest))
}

View File

@@ -1,11 +1,11 @@
//! Common traits and structs for layers
pub mod delta_layer;
mod filename;
pub mod image_layer;
pub(crate) mod inmemory_layer;
pub(crate) mod layer;
mod layer_desc;
mod layer_name;
use crate::context::{AccessStatsBehavior, RequestContext};
use crate::repository::Value;
@@ -34,10 +34,10 @@ use utils::rate_limit::RateLimit;
use utils::{id::TimelineId, lsn::Lsn};
pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef};
pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
pub use image_layer::{ImageLayer, ImageLayerWriter};
pub use inmemory_layer::InMemoryLayer;
pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
pub use layer_name::{DeltaLayerName, ImageLayerName, LayerName};
pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
@@ -646,8 +646,8 @@ pub mod tests {
use super::*;
impl From<DeltaLayerName> for PersistentLayerDesc {
fn from(value: DeltaLayerName) -> Self {
impl From<DeltaFileName> for PersistentLayerDesc {
fn from(value: DeltaFileName) -> Self {
PersistentLayerDesc::new_delta(
TenantShardId::from([0; 18]),
TimelineId::from_array([0; 16]),
@@ -658,8 +658,8 @@ pub mod tests {
}
}
impl From<ImageLayerName> for PersistentLayerDesc {
fn from(value: ImageLayerName) -> Self {
impl From<ImageFileName> for PersistentLayerDesc {
fn from(value: ImageFileName) -> Self {
PersistentLayerDesc::new_img(
TenantShardId::from([0; 18]),
TimelineId::from_array([0; 16]),
@@ -670,11 +670,11 @@ pub mod tests {
}
}
impl From<LayerName> for PersistentLayerDesc {
fn from(value: LayerName) -> Self {
impl From<LayerFileName> for PersistentLayerDesc {
fn from(value: LayerFileName) -> Self {
match value {
LayerName::Delta(d) => Self::from(d),
LayerName::Image(i) => Self::from(i),
LayerFileName::Delta(d) => Self::from(d),
LayerFileName::Image(i) => Self::from(i),
}
}
}

View File

@@ -57,7 +57,6 @@ use std::fs::File;
use std::io::SeekFrom;
use std::ops::Range;
use std::os::unix::fs::FileExt;
use std::str::FromStr;
use std::sync::Arc;
use tokio::sync::OnceCell;
use tracing::*;
@@ -69,8 +68,7 @@ use utils::{
};
use super::{
AsLayerDesc, LayerAccessStats, LayerName, PersistentLayerDesc, ResidentLayer,
ValuesReconstructState,
AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer, ValuesReconstructState,
};
///
@@ -311,13 +309,13 @@ impl DeltaLayer {
.and_then(|res| res)?;
// not production code
let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap();
let expected_layer_name = self.layer_desc().layer_name();
let actual_filename = path.file_name().unwrap().to_owned();
let expected_filename = self.layer_desc().filename().file_name();
if actual_layer_name != expected_layer_name {
if actual_filename != expected_filename {
println!("warning: filename does not match what is expected from in-file summary");
println!("actual: {:?}", actual_layer_name.to_string());
println!("expected: {:?}", expected_layer_name.to_string());
println!("actual: {:?}", actual_filename);
println!("expected: {:?}", expected_filename);
}
Ok(Arc::new(loaded))
@@ -394,7 +392,6 @@ impl DeltaLayerWriterInner {
tenant_shard_id: TenantShardId,
key_start: Key,
lsn_range: Range<Lsn>,
ctx: &RequestContext,
) -> anyhow::Result<Self> {
// Create the file initially with a temporary filename. We don't know
// the end key yet, so we cannot form the final filename yet. We will
@@ -405,7 +402,7 @@ impl DeltaLayerWriterInner {
let path =
DeltaLayer::temp_path_for(conf, &tenant_shard_id, &timeline_id, key_start, &lsn_range);
let mut file = VirtualFile::create(&path, ctx).await?;
let mut file = VirtualFile::create(&path).await?;
// make room for the header block
file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
let blob_writer = BlobWriter::new(file, PAGE_SZ as u64);
@@ -587,7 +584,6 @@ impl DeltaLayerWriter {
tenant_shard_id: TenantShardId,
key_start: Key,
lsn_range: Range<Lsn>,
ctx: &RequestContext,
) -> anyhow::Result<Self> {
Ok(Self {
inner: Some(
@@ -597,7 +593,6 @@ impl DeltaLayerWriter {
tenant_shard_id,
key_start,
lsn_range,
ctx,
)
.await?,
),
@@ -704,7 +699,6 @@ impl DeltaLayer {
let mut file = VirtualFile::open_with_options(
path,
virtual_file::OpenOptions::new().read(true).write(true),
ctx,
)
.await
.with_context(|| format!("Failed to open file '{}'", path))?;
@@ -738,7 +732,7 @@ impl DeltaLayerInner {
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
ctx: &RequestContext,
) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
let file = match VirtualFile::open(path, ctx).await {
let file = match VirtualFile::open(path).await {
Ok(file) => file,
Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
};
@@ -912,7 +906,7 @@ impl DeltaLayerInner {
.await
.map_err(GetVectoredError::Other)?;
self.do_reads_and_update_state(reads, reconstruct_state, ctx)
self.do_reads_and_update_state(reads, reconstruct_state)
.await;
reconstruct_state.on_lsn_advanced(&keyspace, self.lsn_range.start);
@@ -1016,7 +1010,6 @@ impl DeltaLayerInner {
&self,
reads: Vec<VectoredRead>,
reconstruct_state: &mut ValuesReconstructState,
ctx: &RequestContext,
) {
let vectored_blob_reader = VectoredBlobReader::new(&self.file);
let mut ignore_key_with_err = None;
@@ -1034,7 +1027,7 @@ impl DeltaLayerInner {
// track when a key is done.
for read in reads.into_iter().rev() {
let res = vectored_blob_reader
.read_blobs(&read, buf.take().expect("Should have a buffer"), ctx)
.read_blobs(&read, buf.take().expect("Should have a buffer"))
.await;
let blobs_buf = match res {
@@ -1279,7 +1272,7 @@ impl DeltaLayerInner {
buf.clear();
buf.reserve(read.size());
let res = reader.read_blobs(&read, buf, ctx).await?;
let res = reader.read_blobs(&read, buf).await?;
for blob in res.blobs {
let key = blob.meta.key;
@@ -1796,7 +1789,6 @@ mod test {
harness.tenant_shard_id,
entries_meta.key_range.start,
entries_meta.lsn_range.clone(),
&ctx,
)
.await?;
@@ -1854,7 +1846,7 @@ mod test {
for read in vectored_reads {
let blobs_buf = vectored_blob_reader
.read_blobs(&read, buf.take().expect("Should have a buffer"), &ctx)
.read_blobs(&read, buf.take().expect("Should have a buffer"))
.await?;
for meta in blobs_buf.blobs.iter() {
let value = &blobs_buf.buf[meta.start..meta.end];
@@ -1984,7 +1976,6 @@ mod test {
tenant.tenant_shard_id,
Key::MIN,
Lsn(0x11)..truncate_at,
ctx,
)
.await
.unwrap();

View File

@@ -15,29 +15,29 @@ use super::PersistentLayerDesc;
// Note: Timeline::load_layer_map() relies on this sort order
#[derive(PartialEq, Eq, Clone, Hash)]
pub struct DeltaLayerName {
pub struct DeltaFileName {
pub key_range: Range<Key>,
pub lsn_range: Range<Lsn>,
}
impl std::fmt::Debug for DeltaLayerName {
impl std::fmt::Debug for DeltaFileName {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
use super::RangeDisplayDebug;
f.debug_struct("DeltaLayerName")
f.debug_struct("DeltaFileName")
.field("key_range", &RangeDisplayDebug(&self.key_range))
.field("lsn_range", &self.lsn_range)
.finish()
}
}
impl PartialOrd for DeltaLayerName {
impl PartialOrd for DeltaFileName {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl Ord for DeltaLayerName {
impl Ord for DeltaFileName {
fn cmp(&self, other: &Self) -> Ordering {
let mut cmp = self.key_range.start.cmp(&other.key_range.start);
if cmp != Ordering::Equal {
@@ -57,14 +57,16 @@ impl Ord for DeltaLayerName {
}
}
/// Represents the region of the LSN-Key space covered by a DeltaLayer
/// Represents the filename of a DeltaLayer
///
/// ```text
/// <key start>-<key end>__<LSN start>-<LSN end>
/// ```
impl DeltaLayerName {
/// Parse the part of a delta layer's file name that represents the LayerName. Returns None
/// if the filename does not match the expected pattern.
impl DeltaFileName {
///
/// Parse a string as a delta file name. Returns None if the filename does not
/// match the expected pattern.
///
pub fn parse_str(fname: &str) -> Option<Self> {
let mut parts = fname.split("__");
let mut key_parts = parts.next()?.split('-');
@@ -103,14 +105,14 @@ impl DeltaLayerName {
// or panic?
}
Some(DeltaLayerName {
Some(DeltaFileName {
key_range: key_start..key_end,
lsn_range: start_lsn..end_lsn,
})
}
}
impl fmt::Display for DeltaLayerName {
impl fmt::Display for DeltaFileName {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
@@ -124,29 +126,29 @@ impl fmt::Display for DeltaLayerName {
}
#[derive(PartialEq, Eq, Clone, Hash)]
pub struct ImageLayerName {
pub struct ImageFileName {
pub key_range: Range<Key>,
pub lsn: Lsn,
}
impl std::fmt::Debug for ImageLayerName {
impl std::fmt::Debug for ImageFileName {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
use super::RangeDisplayDebug;
f.debug_struct("ImageLayerName")
f.debug_struct("ImageFileName")
.field("key_range", &RangeDisplayDebug(&self.key_range))
.field("lsn", &self.lsn)
.finish()
}
}
impl PartialOrd for ImageLayerName {
impl PartialOrd for ImageFileName {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl Ord for ImageLayerName {
impl Ord for ImageFileName {
fn cmp(&self, other: &Self) -> Ordering {
let mut cmp = self.key_range.start.cmp(&other.key_range.start);
if cmp != Ordering::Equal {
@@ -162,7 +164,7 @@ impl Ord for ImageLayerName {
}
}
impl ImageLayerName {
impl ImageFileName {
pub fn lsn_as_range(&self) -> Range<Lsn> {
// Saves from having to copypaste this all over
PersistentLayerDesc::image_layer_lsn_range(self.lsn)
@@ -170,14 +172,16 @@ impl ImageLayerName {
}
///
/// Represents the part of the Key-LSN space covered by an ImageLayer
/// Represents the filename of an ImageLayer
///
/// ```text
/// <key start>-<key end>__<LSN>
/// ```
impl ImageLayerName {
/// Parse a string as then LayerName part of an image layer file name. Returns None if the
/// filename does not match the expected pattern.
impl ImageFileName {
///
/// Parse a string as an image file name. Returns None if the filename does not
/// match the expected pattern.
///
pub fn parse_str(fname: &str) -> Option<Self> {
let mut parts = fname.split("__");
let mut key_parts = parts.next()?.split('-');
@@ -198,14 +202,14 @@ impl ImageLayerName {
let lsn = Lsn::from_hex(lsn_str).ok()?;
Some(ImageLayerName {
Some(ImageFileName {
key_range: key_start..key_end,
lsn,
})
}
}
impl fmt::Display for ImageLayerName {
impl fmt::Display for ImageFileName {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
@@ -216,24 +220,21 @@ impl fmt::Display for ImageLayerName {
)
}
}
/// LayerName is the logical identity of a layer within a LayerMap at a moment in time. The
/// LayerName is not a unique filename, as the same LayerName may have multiple physical incarnations
/// over time (e.g. across shard splits or compression). The physical filenames of layers in local
/// storage and object names in remote storage consist of the LayerName plus some extra qualifiers
/// that uniquely identify the physical incarnation of a layer (see [crate::tenant::remote_timeline_client::remote_layer_path])
/// and [`crate::tenant::storage_layer::layer::local_layer_path`])
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
pub enum LayerName {
Image(ImageLayerName),
Delta(DeltaLayerName),
pub enum LayerFileName {
Image(ImageFileName),
Delta(DeltaFileName),
}
impl LayerName {
impl LayerFileName {
pub fn file_name(&self) -> String {
self.to_string()
}
/// Determines if this layer file is considered to be in future meaning we will discard these
/// layers during timeline initialization from the given disk_consistent_lsn.
pub(crate) fn is_in_future(&self, disk_consistent_lsn: Lsn) -> bool {
use LayerName::*;
use LayerFileName::*;
match self {
Image(file_name) if file_name.lsn > disk_consistent_lsn => true,
Delta(file_name) if file_name.lsn_range.end > disk_consistent_lsn + 1 => true,
@@ -242,7 +243,7 @@ impl LayerName {
}
pub(crate) fn kind(&self) -> &'static str {
use LayerName::*;
use LayerFileName::*;
match self {
Delta(_) => "delta",
Image(_) => "image",
@@ -250,7 +251,7 @@ impl LayerName {
}
}
impl fmt::Display for LayerName {
impl fmt::Display for LayerFileName {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Self::Image(fname) => write!(f, "{fname}"),
@@ -259,25 +260,25 @@ impl fmt::Display for LayerName {
}
}
impl From<ImageLayerName> for LayerName {
fn from(fname: ImageLayerName) -> Self {
impl From<ImageFileName> for LayerFileName {
fn from(fname: ImageFileName) -> Self {
Self::Image(fname)
}
}
impl From<DeltaLayerName> for LayerName {
fn from(fname: DeltaLayerName) -> Self {
impl From<DeltaFileName> for LayerFileName {
fn from(fname: DeltaFileName) -> Self {
Self::Delta(fname)
}
}
impl FromStr for LayerName {
impl FromStr for LayerFileName {
type Err = String;
/// Conversion from either a physical layer filename, or the string-ization of
/// Self. When loading a physical layer filename, we drop any extra information
/// not needed to build Self.
fn from_str(value: &str) -> Result<Self, Self::Err> {
let gen_suffix_regex = Regex::new("^(?<base>.+)(?<gen>-v1-[0-9a-f]{8})$").unwrap();
let gen_suffix_regex = Regex::new("^(?<base>.+)-(?<gen>[0-9a-f]{8})$").unwrap();
let file_name: Cow<str> = match gen_suffix_regex.captures(value) {
Some(captures) => captures
.name("base")
@@ -287,8 +288,8 @@ impl FromStr for LayerName {
None => value.into(),
};
let delta = DeltaLayerName::parse_str(&file_name);
let image = ImageLayerName::parse_str(&file_name);
let delta = DeltaFileName::parse_str(&file_name);
let image = ImageFileName::parse_str(&file_name);
let ok = match (delta, image) {
(None, None) => {
return Err(format!(
@@ -303,7 +304,7 @@ impl FromStr for LayerName {
}
}
impl serde::Serialize for LayerName {
impl serde::Serialize for LayerFileName {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
@@ -315,19 +316,19 @@ impl serde::Serialize for LayerName {
}
}
impl<'de> serde::Deserialize<'de> for LayerName {
impl<'de> serde::Deserialize<'de> for LayerFileName {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
deserializer.deserialize_string(LayerNameVisitor)
deserializer.deserialize_string(LayerFileNameVisitor)
}
}
struct LayerNameVisitor;
struct LayerFileNameVisitor;
impl<'de> serde::de::Visitor<'de> for LayerNameVisitor {
type Value = LayerName;
impl<'de> serde::de::Visitor<'de> for LayerFileNameVisitor {
type Value = LayerFileName;
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
write!(
@@ -348,16 +349,16 @@ mod test {
use super::*;
#[test]
fn image_layer_parse() -> anyhow::Result<()> {
let expected = LayerName::Image(ImageLayerName {
let expected = LayerFileName::Image(ImageFileName {
key_range: Key::from_i128(0)
..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(),
lsn: Lsn::from_hex("00000000014FED58").unwrap(),
});
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-v1-00000001").map_err(|s| anyhow::anyhow!(s))?;
let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-00000001").map_err(|s| anyhow::anyhow!(s))?;
assert_eq!(parsed, expected,);
// Omitting generation suffix is valid
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").map_err(|s| anyhow::anyhow!(s))?;
let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").map_err(|s| anyhow::anyhow!(s))?;
assert_eq!(parsed, expected,);
Ok(())
@@ -365,17 +366,17 @@ mod test {
#[test]
fn delta_layer_parse() -> anyhow::Result<()> {
let expected = LayerName::Delta(DeltaLayerName {
let expected = LayerFileName::Delta(DeltaFileName {
key_range: Key::from_i128(0)
..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(),
lsn_range: Lsn::from_hex("00000000014FED58").unwrap()
..Lsn::from_hex("000000000154C481").unwrap(),
});
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-v1-00000001").map_err(|s| anyhow::anyhow!(s))?;
let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-00000001").map_err(|s| anyhow::anyhow!(s))?;
assert_eq!(parsed, expected);
// Omitting generation suffix is valid
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").map_err(|s| anyhow::anyhow!(s))?;
let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").map_err(|s| anyhow::anyhow!(s))?;
assert_eq!(parsed, expected);
Ok(())

View File

@@ -54,7 +54,6 @@ use std::fs::File;
use std::io::SeekFrom;
use std::ops::Range;
use std::os::unix::prelude::FileExt;
use std::str::FromStr;
use std::sync::Arc;
use tokio::sync::OnceCell;
use tokio_stream::StreamExt;
@@ -66,10 +65,8 @@ use utils::{
lsn::Lsn,
};
use super::layer_name::ImageLayerName;
use super::{
AsLayerDesc, Layer, LayerName, PersistentLayerDesc, ResidentLayer, ValuesReconstructState,
};
use super::filename::ImageFileName;
use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer, ValuesReconstructState};
///
/// Header stored in the beginning of the file
@@ -234,7 +231,7 @@ impl ImageLayer {
conf: &PageServerConf,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
fname: &ImageLayerName,
fname: &ImageFileName,
) -> Utf8PathBuf {
let rand_string: String = rand::thread_rng()
.sample_iter(&Alphanumeric)
@@ -270,13 +267,13 @@ impl ImageLayer {
.and_then(|res| res)?;
// not production code
let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap();
let expected_layer_name = self.layer_desc().layer_name();
let actual_filename = path.file_name().unwrap().to_owned();
let expected_filename = self.layer_desc().filename().file_name();
if actual_layer_name != expected_layer_name {
if actual_filename != expected_filename {
println!("warning: filename does not match what is expected from in-file summary");
println!("actual: {:?}", actual_layer_name.to_string());
println!("expected: {:?}", expected_layer_name.to_string());
println!("actual: {:?}", actual_filename);
println!("expected: {:?}", expected_filename);
}
Ok(loaded)
@@ -343,7 +340,6 @@ impl ImageLayer {
let mut file = VirtualFile::open_with_options(
path,
virtual_file::OpenOptions::new().read(true).write(true),
ctx,
)
.await
.with_context(|| format!("Failed to open file '{}'", path))?;
@@ -378,7 +374,7 @@ impl ImageLayerInner {
max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
ctx: &RequestContext,
) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
let file = match VirtualFile::open(path, ctx).await {
let file = match VirtualFile::open(path).await {
Ok(file) => file,
Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
};
@@ -475,7 +471,7 @@ impl ImageLayerInner {
.await
.map_err(GetVectoredError::Other)?;
self.do_reads_and_update_state(reads, reconstruct_state, ctx)
self.do_reads_and_update_state(reads, reconstruct_state)
.await;
Ok(())
@@ -538,7 +534,6 @@ impl ImageLayerInner {
&self,
reads: Vec<VectoredRead>,
reconstruct_state: &mut ValuesReconstructState,
ctx: &RequestContext,
) {
let max_vectored_read_bytes = self
.max_vectored_read_bytes
@@ -567,7 +562,7 @@ impl ImageLayerInner {
}
let buf = BytesMut::with_capacity(buf_size);
let res = vectored_blob_reader.read_blobs(&read, buf, ctx).await;
let res = vectored_blob_reader.read_blobs(&read, buf).await;
match res {
Ok(blobs_buf) => {
@@ -633,7 +628,6 @@ impl ImageLayerWriterInner {
tenant_shard_id: TenantShardId,
key_range: &Range<Key>,
lsn: Lsn,
ctx: &RequestContext,
) -> anyhow::Result<Self> {
// Create the file initially with a temporary filename.
// We'll atomically rename it to the final name when we're done.
@@ -641,7 +635,7 @@ impl ImageLayerWriterInner {
conf,
timeline_id,
tenant_shard_id,
&ImageLayerName {
&ImageFileName {
key_range: key_range.clone(),
lsn,
},
@@ -653,7 +647,6 @@ impl ImageLayerWriterInner {
virtual_file::OpenOptions::new()
.write(true)
.create_new(true),
ctx,
)
.await?
};
@@ -808,11 +801,10 @@ impl ImageLayerWriter {
tenant_shard_id: TenantShardId,
key_range: &Range<Key>,
lsn: Lsn,
ctx: &RequestContext,
) -> anyhow::Result<ImageLayerWriter> {
Ok(Self {
inner: Some(
ImageLayerWriterInner::new(conf, timeline_id, tenant_shard_id, key_range, lsn, ctx)
ImageLayerWriterInner::new(conf, timeline_id, tenant_shard_id, key_range, lsn)
.await?,
),
})

View File

@@ -473,11 +473,10 @@ impl InMemoryLayer {
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
start_lsn: Lsn,
ctx: &RequestContext,
) -> Result<InMemoryLayer> {
trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, ctx).await?;
let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?;
let key = InMemoryLayerFileId(file.page_cache_file_id());
Ok(InMemoryLayer {
@@ -643,7 +642,6 @@ impl InMemoryLayer {
self.tenant_shard_id,
Key::MIN,
self.start_lsn..end_lsn,
ctx,
)
.await?;

View File

@@ -25,7 +25,7 @@ use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};
use super::delta_layer::{self, DeltaEntry};
use super::image_layer;
use super::{
AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerName, PersistentLayerDesc,
AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerFileName, PersistentLayerDesc,
ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
};
@@ -128,17 +128,19 @@ pub(crate) fn local_layer_path(
conf: &PageServerConf,
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
layer_file_name: &LayerName,
generation: &Generation,
layer_file_name: &LayerFileName,
_generation: &Generation,
) -> Utf8PathBuf {
let timeline_path = conf.timeline_path(tenant_shard_id, timeline_id);
if generation.is_none() {
// Without a generation, we may only use legacy path style
timeline_path.join(layer_file_name.to_string())
} else {
timeline_path.join(format!("{}-v1{}", layer_file_name, generation.get_suffix()))
}
timeline_path.join(layer_file_name.file_name())
// TOOD: include generation in the name in now+1 releases.
// timeline_path.join(format!(
// "{}{}",
// layer_file_name.file_name(),
// generation.get_suffix()
// ))
}
impl Layer {
@@ -146,7 +148,7 @@ impl Layer {
pub(crate) fn for_evicted(
conf: &'static PageServerConf,
timeline: &Arc<Timeline>,
file_name: LayerName,
file_name: LayerFileName,
metadata: LayerFileMetadata,
) -> Self {
let local_path = local_layer_path(
@@ -187,7 +189,7 @@ impl Layer {
conf: &'static PageServerConf,
timeline: &Arc<Timeline>,
local_path: Utf8PathBuf,
file_name: LayerName,
file_name: LayerFileName,
metadata: LayerFileMetadata,
) -> ResidentLayer {
let desc = PersistentLayerDesc::from_filename(
@@ -259,7 +261,7 @@ impl Layer {
conf,
&timeline.tenant_shard_id,
&timeline.timeline_id,
&desc.layer_name(),
&desc.filename(),
&timeline.generation,
);
@@ -585,6 +587,9 @@ struct LayerInner {
/// [`Timeline::gate`] at the same time.
timeline: Weak<Timeline>,
/// Cached knowledge of [`Timeline::remote_client`] being `Some`.
have_remote_client: bool,
access_stats: LayerAccessStats,
/// This custom OnceCell is backed by std mutex, but only held for short time periods.
@@ -684,7 +689,7 @@ impl Drop for LayerInner {
let span = tracing::info_span!(parent: None, "layer_delete", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id);
let path = std::mem::take(&mut self.path);
let file_name = self.layer_desc().layer_name();
let file_name = self.layer_desc().filename();
let file_size = self.layer_desc().file_size;
let timeline = self.timeline.clone();
let meta = self.metadata();
@@ -729,23 +734,23 @@ impl Drop for LayerInner {
if removed {
timeline.metrics.resident_physical_size_sub(file_size);
}
let res = timeline
.remote_client
.schedule_deletion_of_unlinked(vec![(file_name, meta)]);
if let Some(remote_client) = timeline.remote_client.as_ref() {
let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]);
if let Err(e) = res {
// test_timeline_deletion_with_files_stuck_in_upload_queue is good at
// demonstrating this deadlock (without spawn_blocking): stop will drop
// queued items, which will have ResidentLayer's, and those drops would try
// to re-entrantly lock the RemoteTimelineClient inner state.
if !timeline.is_active() {
tracing::info!("scheduling deletion on drop failed: {e:#}");
if let Err(e) = res {
// test_timeline_deletion_with_files_stuck_in_upload_queue is good at
// demonstrating this deadlock (without spawn_blocking): stop will drop
// queued items, which will have ResidentLayer's, and those drops would try
// to re-entrantly lock the RemoteTimelineClient inner state.
if !timeline.is_active() {
tracing::info!("scheduling deletion on drop failed: {e:#}");
} else {
tracing::warn!("scheduling deletion on drop failed: {e:#}");
}
LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
} else {
tracing::warn!("scheduling deletion on drop failed: {e:#}");
LAYER_IMPL_METRICS.inc_completed_deletes();
}
LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
} else {
LAYER_IMPL_METRICS.inc_completed_deletes();
}
});
}
@@ -777,12 +782,11 @@ impl LayerInner {
LayerInner {
conf,
debug_str: {
format!("timelines/{}/{}", timeline.timeline_id, desc.layer_name()).into()
},
debug_str: { format!("timelines/{}/{}", timeline.timeline_id, desc.filename()).into() },
path: local_path,
desc,
timeline: Arc::downgrade(timeline),
have_remote_client: timeline.remote_client.is_some(),
access_stats,
wanted_deleted: AtomicBool::new(false),
inner,
@@ -811,6 +815,8 @@ impl LayerInner {
/// in a new attempt to evict OR join the previously started attempt.
#[tracing::instrument(level = tracing::Level::DEBUG, skip_all, ret, err(level = tracing::Level::DEBUG), fields(layer=%self))]
pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> {
assert!(self.have_remote_client);
let mut rx = self.status.as_ref().unwrap().subscribe();
{
@@ -967,6 +973,10 @@ impl LayerInner {
return Err(DownloadError::NotFile(ft));
}
if timeline.remote_client.as_ref().is_none() {
return Err(DownloadError::NoRemoteStorage);
}
if let Some(ctx) = ctx {
self.check_expected_download(ctx)?;
}
@@ -1103,12 +1113,15 @@ impl LayerInner {
permit: heavier_once_cell::InitPermit,
ctx: &RequestContext,
) -> anyhow::Result<Arc<DownloadedLayer>> {
let result = timeline
let client = timeline
.remote_client
.as_ref()
.expect("checked before download_init_and_wait");
let result = client
.download_layer_file(
&self.desc.layer_name(),
&self.desc.filename(),
&self.metadata(),
&self.path,
&timeline.cancel,
ctx,
)
@@ -1244,7 +1257,7 @@ impl LayerInner {
}
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
let layer_name = self.desc.layer_name().to_string();
let layer_file_name = self.desc.filename().file_name();
let resident = self
.inner
@@ -1258,7 +1271,7 @@ impl LayerInner {
let lsn_range = &self.desc.lsn_range;
HistoricLayerInfo::Delta {
layer_file_name: layer_name,
layer_file_name,
layer_file_size: self.desc.file_size,
lsn_start: lsn_range.start,
lsn_end: lsn_range.end,
@@ -1269,7 +1282,7 @@ impl LayerInner {
let lsn = self.desc.image_layer_lsn();
HistoricLayerInfo::Image {
layer_file_name: layer_name,
layer_file_name,
layer_file_size: self.desc.file_size,
lsn_start: lsn,
remote: !resident,
@@ -1280,10 +1293,20 @@ impl LayerInner {
/// `DownloadedLayer` is being dropped, so it calls this method.
fn on_downloaded_layer_drop(self: Arc<LayerInner>, only_version: usize) {
let can_evict = self.have_remote_client;
// we cannot know without inspecting LayerInner::inner if we should evict or not, even
// though here it is very likely
let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, version=%only_version);
if !can_evict {
// it would be nice to assert this case out, but we are in drop
span.in_scope(|| {
tracing::error!("bug in struct Layer: ResidentOrWantedEvicted has been downgraded while we have no remote storage");
});
return;
}
// NOTE: this scope *must* never call `self.inner.get` because evict_and_wait might
// drop while the `self.inner` is being locked, leading to a deadlock.
@@ -1555,6 +1578,8 @@ pub(crate) enum EvictionError {
pub(crate) enum DownloadError {
#[error("timeline has already shutdown")]
TimelineShutdown,
#[error("no remote storage configured")]
NoRemoteStorage,
#[error("context denies downloading")]
ContextAndConfigReallyDeniesDownloads,
#[error("downloading is really required but not allowed by this method")]

View File

@@ -145,7 +145,7 @@ async fn smoke_test() {
.await
.expect("the local layer file still exists");
let rtc = &timeline.remote_client;
let rtc = timeline.remote_client.as_ref().unwrap();
{
let layers = &[layer];
@@ -761,7 +761,13 @@ async fn eviction_cancellation_on_drop() {
timeline.freeze_and_flush().await.unwrap();
// wait for the upload to complete so our Arc::strong_count assertion holds
timeline.remote_client.wait_completion().await.unwrap();
timeline
.remote_client
.as_ref()
.unwrap()
.wait_completion()
.await
.unwrap();
let (evicted_layer, not_evicted) = {
let mut layers = {

View File

@@ -5,7 +5,7 @@ use utils::{id::TimelineId, lsn::Lsn};
use crate::repository::Key;
use super::{DeltaLayerName, ImageLayerName, LayerName};
use super::{DeltaFileName, ImageFileName, LayerFileName};
use serde::{Deserialize, Serialize};
@@ -51,7 +51,7 @@ impl PersistentLayerDesc {
}
pub fn short_id(&self) -> impl Display {
self.layer_name()
self.filename()
}
#[cfg(test)]
@@ -103,14 +103,14 @@ impl PersistentLayerDesc {
pub fn from_filename(
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
filename: LayerName,
filename: LayerFileName,
file_size: u64,
) -> Self {
match filename {
LayerName::Image(i) => {
LayerFileName::Image(i) => {
Self::new_img(tenant_shard_id, timeline_id, i.key_range, i.lsn, file_size)
}
LayerName::Delta(d) => Self::new_delta(
LayerFileName::Delta(d) => Self::new_delta(
tenant_shard_id,
timeline_id,
d.key_range,
@@ -132,34 +132,34 @@ impl PersistentLayerDesc {
lsn..(lsn + 1)
}
/// Get a delta layer name for this layer.
/// Get a delta file name for this layer.
///
/// Panic: if this is not a delta layer.
pub fn delta_layer_name(&self) -> DeltaLayerName {
pub fn delta_file_name(&self) -> DeltaFileName {
assert!(self.is_delta);
DeltaLayerName {
DeltaFileName {
key_range: self.key_range.clone(),
lsn_range: self.lsn_range.clone(),
}
}
/// Get a image layer name for this layer.
/// Get a delta file name for this layer.
///
/// Panic: if this is not an image layer, or the lsn range is invalid
pub fn image_layer_name(&self) -> ImageLayerName {
pub fn image_file_name(&self) -> ImageFileName {
assert!(!self.is_delta);
assert!(self.lsn_range.start + 1 == self.lsn_range.end);
ImageLayerName {
ImageFileName {
key_range: self.key_range.clone(),
lsn: self.lsn_range.start,
}
}
pub fn layer_name(&self) -> LayerName {
pub fn filename(&self) -> LayerFileName {
if self.is_delta {
self.delta_layer_name().into()
self.delta_file_name().into()
} else {
self.image_layer_name().into()
self.image_file_name().into()
}
}

View File

@@ -41,7 +41,7 @@ static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore
tokio::sync::Semaphore::new(permits)
});
#[derive(Debug, PartialEq, Eq, Clone, Copy, strum_macros::IntoStaticStr, enum_map::Enum)]
#[derive(Debug, PartialEq, Eq, Clone, Copy, strum_macros::IntoStaticStr)]
#[strum(serialize_all = "snake_case")]
pub(crate) enum BackgroundLoopKind {
Compaction,
@@ -57,25 +57,19 @@ pub(crate) enum BackgroundLoopKind {
impl BackgroundLoopKind {
fn as_static_str(&self) -> &'static str {
self.into()
let s: &'static str = self.into();
s
}
}
static PERMIT_GAUGES: once_cell::sync::Lazy<
enum_map::EnumMap<BackgroundLoopKind, metrics::IntCounterPair>,
> = once_cell::sync::Lazy::new(|| {
enum_map::EnumMap::from_array(std::array::from_fn(|i| {
let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE.with_label_values(&[kind.into()])
}))
});
/// Cancellation safe.
pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
loop_kind: BackgroundLoopKind,
_ctx: &RequestContext,
) -> tokio::sync::SemaphorePermit<'static> {
let _guard = PERMIT_GAUGES[loop_kind].guard();
let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE
.with_label_values(&[loop_kind.as_static_str()])
.guard();
pausable_failpoint!(
"initial-size-calculation-permit-pause",

View File

@@ -23,7 +23,7 @@ use pageserver_api::{
},
keyspace::{KeySpaceAccum, SparseKeyPartitioning},
models::{
AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, DownloadRemoteLayersTaskInfo,
AuxFilePolicy, CompactionAlgorithm, DownloadRemoteLayersTaskInfo,
DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo,
TimelineState,
},
@@ -60,13 +60,10 @@ use std::{
ops::ControlFlow,
};
use crate::tenant::timeline::init::LocalLayerFileMetadata;
use crate::{
aux_file::AuxFileSizeEstimator,
tenant::{
layer_map::{LayerMap, SearchResult},
metadata::TimelineMetadata,
},
use crate::tenant::storage_layer::layer::local_layer_path;
use crate::tenant::{
layer_map::{LayerMap, SearchResult},
metadata::TimelineMetadata,
};
use crate::{
context::{DownloadBehavior, RequestContext},
@@ -78,7 +75,7 @@ use crate::{
disk_usage_eviction_task::finite_f32,
tenant::storage_layer::{
AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer,
LayerAccessStatsReset, LayerName, ResidentLayer, ValueReconstructResult,
LayerAccessStatsReset, LayerFileName, ResidentLayer, ValueReconstructResult,
ValueReconstructState, ValuesReconstructState,
},
};
@@ -200,7 +197,7 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
/// The outward-facing resources required to build a Timeline
pub struct TimelineResources {
pub remote_client: RemoteTimelineClient,
pub remote_client: Option<RemoteTimelineClient>,
pub deletion_queue_client: DeletionQueueClient,
pub timeline_get_throttle: Arc<
crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
@@ -272,7 +269,7 @@ pub struct Timeline {
/// Remote storage client.
/// See [`remote_timeline_client`](super::remote_timeline_client) module comment for details.
pub remote_client: Arc<RemoteTimelineClient>,
pub remote_client: Option<Arc<RemoteTimelineClient>>,
// What page versions do we hold in the repository? If we get a
// request > last_record_lsn, we need to wait until we receive all
@@ -412,12 +409,6 @@ pub struct Timeline {
/// Keep aux directory cache to avoid it's reconstruction on each update
pub(crate) aux_files: tokio::sync::Mutex<AuxFilesState>,
/// Size estimator for aux file v2
pub(crate) aux_file_size_estimator: AuxFileSizeEstimator,
/// Indicate whether aux file v2 storage is enabled.
pub(crate) last_aux_file_policy: AtomicAuxFilePolicy,
}
pub struct WalReceiverInfo {
@@ -1229,17 +1220,11 @@ impl Timeline {
}
reconstruct_timer.stop_and_record();
// For aux file keys (v1 or v2) the vectored read path does not return an error
// when they're missing. Instead they are omitted from the resulting btree
// (this is a requirement, not a bug). Skip updating the metric in these cases
// to avoid infinite results.
if !results.is_empty() {
// Note that this is an approximation. Tracking the exact number of layers visited
// per key requires virtually unbounded memory usage and is inefficient
// (i.e. segment tree tracking each range queried from a layer)
crate::metrics::VEC_READ_NUM_LAYERS_VISITED
.observe(layers_visited as f64 / results.len() as f64);
}
// Note that this is an approximation. Tracking the exact number of layers visited
// per key requires virtually unbounded memory usage and is inefficient
// (i.e. segment tree tracking each range queried from a layer)
crate::metrics::VEC_READ_NUM_LAYERS_VISITED
.observe(layers_visited as f64 / results.len() as f64);
Ok(results)
}
@@ -1379,14 +1364,22 @@ impl Timeline {
/// not validated with control plane yet.
/// See [`Self::get_remote_consistent_lsn_visible`].
pub(crate) fn get_remote_consistent_lsn_projected(&self) -> Option<Lsn> {
self.remote_client.remote_consistent_lsn_projected()
if let Some(remote_client) = &self.remote_client {
remote_client.remote_consistent_lsn_projected()
} else {
None
}
}
/// remote_consistent_lsn which the tenant is guaranteed not to go backward from,
/// i.e. a value of remote_consistent_lsn_projected which has undergone
/// generation validation in the deletion queue.
pub(crate) fn get_remote_consistent_lsn_visible(&self) -> Option<Lsn> {
self.remote_client.remote_consistent_lsn_visible()
if let Some(remote_client) = &self.remote_client {
remote_client.remote_consistent_lsn_visible()
} else {
None
}
}
/// The sum of the file size of all historic layers in the layer map.
@@ -1756,14 +1749,16 @@ impl Timeline {
match self.freeze_and_flush().await {
Ok(_) => {
// drain the upload queue
// if we did not wait for completion here, it might be our shutdown process
// didn't wait for remote uploads to complete at all, as new tasks can forever
// be spawned.
//
// what is problematic is the shutting down of RemoteTimelineClient, because
// obviously it does not make sense to stop while we wait for it, but what
// about corner cases like s3 suddenly hanging up?
self.remote_client.shutdown().await;
if let Some(client) = self.remote_client.as_ref() {
// if we did not wait for completion here, it might be our shutdown process
// didn't wait for remote uploads to complete at all, as new tasks can forever
// be spawned.
//
// what is problematic is the shutting down of RemoteTimelineClient, because
// obviously it does not make sense to stop while we wait for it, but what
// about corner cases like s3 suddenly hanging up?
client.shutdown().await;
}
}
Err(e) => {
// Non-fatal. Shutdown is infallible. Failures to flush just mean that
@@ -1779,16 +1774,18 @@ impl Timeline {
// Transition the remote_client into a state where it's only useful for timeline deletion.
// (The deletion use case is why we can't just hook up remote_client to Self::cancel).)
self.remote_client.stop();
// As documented in remote_client.stop()'s doc comment, it's our responsibility
// to shut down the upload queue tasks.
// TODO: fix that, task management should be encapsulated inside remote_client.
task_mgr::shutdown_tasks(
Some(TaskKind::RemoteUploadTask),
Some(self.tenant_shard_id),
Some(self.timeline_id),
)
.await;
if let Some(remote_client) = self.remote_client.as_ref() {
remote_client.stop();
// As documented in remote_client.stop()'s doc comment, it's our responsibility
// to shut down the upload queue tasks.
// TODO: fix that, task management should be encapsulated inside remote_client.
task_mgr::shutdown_tasks(
Some(TaskKind::RemoteUploadTask),
Some(self.tenant_shard_id),
Some(self.timeline_id),
)
.await;
}
// TODO: work toward making this a no-op. See this funciton's doc comment for more context.
tracing::debug!("Waiting for tasks...");
@@ -1908,12 +1905,16 @@ impl Timeline {
#[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
pub(crate) async fn download_layer(
&self,
layer_file_name: &LayerName,
layer_file_name: &LayerFileName,
) -> anyhow::Result<Option<bool>> {
let Some(layer) = self.find_layer(layer_file_name).await else {
return Ok(None);
};
if self.remote_client.is_none() {
return Ok(Some(false));
}
layer.download().await?;
Ok(Some(true))
@@ -1924,7 +1925,7 @@ impl Timeline {
/// Returns `Ok(None)` in the case where the layer could not be found by its `layer_file_name`.
pub(crate) async fn evict_layer(
&self,
layer_file_name: &LayerName,
layer_file_name: &LayerFileName,
) -> anyhow::Result<Option<bool>> {
let _gate = self
.gate
@@ -2137,7 +2138,6 @@ impl Timeline {
resources: TimelineResources,
pg_version: u32,
state: TimelineState,
aux_file_policy: Option<AuxFilePolicy>,
cancel: CancellationToken,
) -> Arc<Self> {
let disk_consistent_lsn = metadata.disk_consistent_lsn();
@@ -2155,16 +2155,6 @@ impl Timeline {
};
Arc::new_cyclic(|myself| {
let metrics = TimelineMetrics::new(
&tenant_shard_id,
&timeline_id,
crate::metrics::EvictionsWithLowResidenceDurationBuilder::new(
"mtime",
evictions_low_residence_duration_metric_threshold,
),
);
let aux_file_metrics = metrics.aux_file_size_gauge.clone();
let mut result = Timeline {
conf,
tenant_conf,
@@ -2179,7 +2169,7 @@ impl Timeline {
walredo_mgr,
walreceiver: Mutex::new(None),
remote_client: Arc::new(resources.remote_client),
remote_client: resources.remote_client.map(Arc::new),
// initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'.
last_record_lsn: SeqWait::new(RecordLsn {
@@ -2196,7 +2186,14 @@ impl Timeline {
ancestor_timeline: ancestor,
ancestor_lsn: metadata.ancestor_lsn(),
metrics,
metrics: TimelineMetrics::new(
&tenant_shard_id,
&timeline_id,
crate::metrics::EvictionsWithLowResidenceDurationBuilder::new(
"mtime",
evictions_low_residence_duration_metric_threshold,
),
),
query_metrics: crate::metrics::SmgrQueryTimePerTimeline::new(
&tenant_shard_id,
@@ -2260,10 +2257,6 @@ impl Timeline {
dir: None,
n_deltas: 0,
}),
aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics),
last_aux_file_policy: AtomicAuxFilePolicy::new(aux_file_policy),
};
result.repartition_threshold =
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
@@ -2394,13 +2387,13 @@ impl Timeline {
index_part: Option<IndexPart>,
) -> anyhow::Result<()> {
use init::{Decision::*, Discovered, DismissedLayer};
use LayerName::*;
use LayerFileName::*;
let mut guard = self.layers.write().await;
let timer = self.metrics.load_layer_map_histo.start_timer();
// Scan timeline directory and create ImageLayerName and DeltaFilename
// Scan timeline directory and create ImageFileName and DeltaFilename
// structs representing all files on disk
let timeline_path = self
.conf
@@ -2428,6 +2421,10 @@ impl Timeline {
discovered_layers.push((layer_file_name, local_path, file_size));
continue;
}
Discovered::Metadata => {
warn!("found legacy metadata file, these should have been removed in load_tenant_config");
continue;
}
Discovered::IgnoredBackup => {
continue;
}
@@ -2466,37 +2463,33 @@ impl Timeline {
let mut needs_cleanup = Vec::new();
let mut total_physical_size = 0;
for (name, decision) in decided {
for (name, local_path, decision) in decided {
let decision = match decision {
Ok(UseRemote { local, remote }) => {
// Remote is authoritative, but we may still choose to retain
// the local file if the contents appear to match
if local.metadata.file_size() == remote.file_size() {
if local.file_size() == remote.file_size() {
// Use the local file, but take the remote metadata so that we pick up
// the correct generation.
UseLocal(LocalLayerFileMetadata {
metadata: remote,
local_path: local.local_path,
})
UseLocal(remote)
} else {
init::cleanup_local_file_for_remote(&local, &remote)?;
let local_path = local_path.as_ref().expect("Locally found layer must have path");
init::cleanup_local_file_for_remote(local_path, &local, &remote)?;
UseRemote { local, remote }
}
}
Ok(decision) => decision,
Err(DismissedLayer::Future { local }) => {
if let Some(local) = local {
init::cleanup_future_layer(
&local.local_path,
&name,
disk_consistent_lsn,
)?;
if local.is_some() {
let local_path = local_path.expect("Locally found layer must have path");
init::cleanup_future_layer(&local_path, &name, disk_consistent_lsn)?;
}
needs_cleanup.push(name);
continue;
}
Err(DismissedLayer::LocalOnly(local)) => {
init::cleanup_local_only_file(&name, &local)?;
let local_path = local_path.expect("Locally found layer must have path");
init::cleanup_local_only_file(&local_path, &name, &local)?;
// this file never existed remotely, we will have to do rework
continue;
}
@@ -2510,10 +2503,20 @@ impl Timeline {
tracing::debug!(layer=%name, ?decision, "applied");
let layer = match decision {
UseLocal(local) => {
total_physical_size += local.metadata.file_size();
Layer::for_resident(conf, &this, local.local_path, name, local.metadata)
.drop_eviction_guard()
UseLocal(m) => {
total_physical_size += m.file_size();
let local_path = local_path.unwrap_or_else(|| {
local_layer_path(
conf,
&this.tenant_shard_id,
&this.timeline_id,
&name,
&m.generation,
)
});
Layer::for_resident(conf, &this, local_path, name, m).drop_eviction_guard()
}
Evicted(remote) | UseRemote { remote, .. } => {
Layer::for_evicted(conf, &this, name, remote)
@@ -2533,36 +2536,36 @@ impl Timeline {
guard.initialize_local_layers(loaded_layers, disk_consistent_lsn + 1);
self.remote_client
.schedule_layer_file_deletion(&needs_cleanup)?;
self.remote_client
.schedule_index_upload_for_file_changes()?;
// This barrier orders above DELETEs before any later operations.
// This is critical because code executing after the barrier might
// create again objects with the same key that we just scheduled for deletion.
// For example, if we just scheduled deletion of an image layer "from the future",
// later compaction might run again and re-create the same image layer.
// "from the future" here means an image layer whose LSN is > IndexPart::disk_consistent_lsn.
// "same" here means same key range and LSN.
//
// Without a barrier between above DELETEs and the re-creation's PUTs,
// the upload queue may execute the PUT first, then the DELETE.
// In our example, we will end up with an IndexPart referencing a non-existent object.
//
// 1. a future image layer is created and uploaded
// 2. ps restart
// 3. the future layer from (1) is deleted during load layer map
// 4. image layer is re-created and uploaded
// 5. deletion queue would like to delete (1) but actually deletes (4)
// 6. delete by name works as expected, but it now deletes the wrong (later) version
//
// See https://github.com/neondatabase/neon/issues/5878
//
// NB: generation numbers naturally protect against this because they disambiguate
// (1) and (4)
self.remote_client.schedule_barrier()?;
// Tenant::create_timeline will wait for these uploads to happen before returning, or
// on retry.
if let Some(rtc) = self.remote_client.as_ref() {
rtc.schedule_layer_file_deletion(&needs_cleanup)?;
rtc.schedule_index_upload_for_file_changes()?;
// This barrier orders above DELETEs before any later operations.
// This is critical because code executing after the barrier might
// create again objects with the same key that we just scheduled for deletion.
// For example, if we just scheduled deletion of an image layer "from the future",
// later compaction might run again and re-create the same image layer.
// "from the future" here means an image layer whose LSN is > IndexPart::disk_consistent_lsn.
// "same" here means same key range and LSN.
//
// Without a barrier between above DELETEs and the re-creation's PUTs,
// the upload queue may execute the PUT first, then the DELETE.
// In our example, we will end up with an IndexPart referencing a non-existent object.
//
// 1. a future image layer is created and uploaded
// 2. ps restart
// 3. the future layer from (1) is deleted during load layer map
// 4. image layer is re-created and uploaded
// 5. deletion queue would like to delete (1) but actually deletes (4)
// 6. delete by name works as expected, but it now deletes the wrong (later) version
//
// See https://github.com/neondatabase/neon/issues/5878
//
// NB: generation numbers naturally protect against this because they disambiguate
// (1) and (4)
rtc.schedule_barrier()?;
// Tenant::create_timeline will wait for these uploads to happen before returning, or
// on retry.
}
info!(
"loaded layer map with {} layers at {}, total physical size: {}",
@@ -2621,7 +2624,6 @@ impl Timeline {
// Don't make noise.
} else {
warn!("unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work");
debug_assert!(false);
}
}
};
@@ -2995,10 +2997,10 @@ impl Timeline {
}
}
async fn find_layer(&self, layer_name: &LayerName) -> Option<Layer> {
async fn find_layer(&self, layer_name: &LayerFileName) -> Option<Layer> {
let guard = self.layers.read().await;
for historic_layer in guard.layer_map().iter_historic_layers() {
let historic_layer_name = historic_layer.layer_name();
let historic_layer_name = historic_layer.filename();
if layer_name == &historic_layer_name {
return Some(guard.get_from_desc(&historic_layer));
}
@@ -3015,6 +3017,9 @@ impl Timeline {
/// should treat this as a cue to simply skip doing any heatmap uploading
/// for this timeline.
pub(crate) async fn generate_heatmap(&self) -> Option<HeatMapTimeline> {
// no point in heatmaps without remote client
let _remote_client = self.remote_client.as_ref()?;
if !self.is_active() {
return None;
}
@@ -3025,7 +3030,7 @@ impl Timeline {
let last_activity_ts = layer.access_stats().latest_activity_or_now();
HeatMapLayer::new(
layer.layer_desc().layer_name(),
layer.layer_desc().filename(),
(&layer.metadata()).into(),
last_activity_ts,
)
@@ -3035,15 +3040,6 @@ impl Timeline {
Some(HeatMapTimeline::new(self.timeline_id, layers))
}
/// Returns true if the given lsn is or was an ancestor branchpoint.
pub(crate) fn is_ancestor_lsn(&self, lsn: Lsn) -> bool {
// upon timeline detach, we set the ancestor_lsn to Lsn::INVALID and the store the original
// branchpoint in the value in IndexPart::lineage
self.ancestor_lsn == lsn
|| (self.ancestor_lsn == Lsn::INVALID
&& self.remote_client.is_previous_ancestor_lsn(lsn))
}
}
type TraversalId = Arc<str>;
@@ -3181,7 +3177,7 @@ impl Timeline {
if let Some(open_layer) = &layers.open_layer {
let start_lsn = open_layer.get_lsn_range().start;
if cont_lsn > start_lsn {
//info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.layer_name().display());
//info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.filename().display());
// Get all the data needed to reconstruct the page version from this layer.
// But if we have an older cached page image, no need to go past that.
let lsn_floor = max(cached_lsn + 1, start_lsn);
@@ -3210,7 +3206,7 @@ impl Timeline {
for frozen_layer in layers.frozen_layers.iter().rev() {
let start_lsn = frozen_layer.get_lsn_range().start;
if cont_lsn > start_lsn {
//info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.layer_name().display());
//info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display());
let lsn_floor = max(cached_lsn + 1, start_lsn);
let frozen_layer = frozen_layer.clone();
@@ -3555,11 +3551,7 @@ impl Timeline {
///
/// Get a handle to the latest layer for appending.
///
async fn get_layer_for_write(
&self,
lsn: Lsn,
ctx: &RequestContext,
) -> anyhow::Result<Arc<InMemoryLayer>> {
async fn get_layer_for_write(&self, lsn: Lsn) -> anyhow::Result<Arc<InMemoryLayer>> {
let mut guard = self.layers.write().await;
let layer = guard
.get_layer_for_write(
@@ -3568,7 +3560,6 @@ impl Timeline {
self.conf,
self.timeline_id,
self.tenant_shard_id,
ctx,
)
.await?;
Ok(layer)
@@ -3833,8 +3824,8 @@ impl Timeline {
);
self.create_delta_layer(
&frozen_layer,
Some(metadata_keyspace.0.ranges[0].clone()),
ctx,
Some(metadata_keyspace.0.ranges[0].clone()),
)
.await?
} else {
@@ -3863,7 +3854,7 @@ impl Timeline {
// Normal case, write out a L0 delta layer file.
// `create_delta_layer` will not modify the layer map.
// We will remove frozen layer and add delta layer in one atomic operation later.
let Some(layer) = self.create_delta_layer(&frozen_layer, None, ctx).await? else {
let Some(layer) = self.create_delta_layer(&frozen_layer, ctx, None).await? else {
panic!("delta layer cannot be empty if no filter is applied");
};
(
@@ -3962,23 +3953,29 @@ impl Timeline {
x.unwrap()
));
for layer in layers_to_upload {
self.remote_client.schedule_layer_file_upload(layer)?;
if let Some(remote_client) = &self.remote_client {
for layer in layers_to_upload {
remote_client.schedule_layer_file_upload(layer)?;
}
remote_client.schedule_index_upload_for_metadata_update(&update)?;
}
self.remote_client
.schedule_index_upload_for_metadata_update(&update)?;
Ok(())
}
pub(crate) async fn preserve_initdb_archive(&self) -> anyhow::Result<()> {
self.remote_client
.preserve_initdb_archive(
&self.tenant_shard_id.tenant_id,
&self.timeline_id,
&self.cancel,
)
.await
if let Some(remote_client) = &self.remote_client {
remote_client
.preserve_initdb_archive(
&self.tenant_shard_id.tenant_id,
&self.timeline_id,
&self.cancel,
)
.await?;
} else {
bail!("No remote storage configured, but was asked to backup the initdb archive for {} / {}", self.tenant_shard_id.tenant_id, self.timeline_id);
}
Ok(())
}
// Write out the given frozen in-memory layer as a new L0 delta file. This L0 file will not be tracked
@@ -3986,8 +3983,8 @@ impl Timeline {
async fn create_delta_layer(
self: &Arc<Self>,
frozen_layer: &Arc<InMemoryLayer>,
key_range: Option<Range<Key>>,
ctx: &RequestContext,
key_range: Option<Range<Key>>,
) -> anyhow::Result<Option<ResidentLayer>> {
let self_clone = Arc::clone(self);
let frozen_layer = Arc::clone(frozen_layer);
@@ -4010,7 +4007,6 @@ impl Timeline {
&self_clone
.conf
.timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id),
&ctx,
)
.await
.fatal_err("VirtualFile::open for timeline dir fsync");
@@ -4204,7 +4200,6 @@ impl Timeline {
self.tenant_shard_id,
&img_range,
lsn,
ctx,
)
.await?;
@@ -4238,7 +4233,7 @@ impl Timeline {
// Maybe flush `key_rest_accum`
if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS
|| (last_key_in_range && key_request_accum.raw_size() > 0)
|| last_key_in_range
{
let results = self
.get_vectored(key_request_accum.consume_keyspace(), lsn, ctx)
@@ -4309,7 +4304,6 @@ impl Timeline {
&self
.conf
.timeline_path(&self.tenant_shard_id, &self.timeline_id),
ctx,
)
.await
.fatal_err("VirtualFile::open for timeline dir fsync");
@@ -4334,16 +4328,6 @@ impl Timeline {
/// this Timeline is shut down. Calling this function will cause the initial
/// logical size calculation to skip waiting for the background jobs barrier.
pub(crate) async fn await_initial_logical_size(self: Arc<Self>) {
if !self.shard_identity.is_shard_zero() {
// We don't populate logical size on shard >0: skip waiting for it.
return;
}
if self.remote_client.is_deleting() {
// The timeline was created in a deletion-resume state, we don't expect logical size to be populated
return;
}
if let Some(await_bg_cancel) = self
.current_logical_size
.cancel_wait_for_background_loop_concurrency_limit_semaphore
@@ -4355,10 +4339,9 @@ impl Timeline {
// the logical size cancellation to skip the concurrency limit semaphore.
// TODO: this is an unexpected case. We should restructure so that it
// can't happen.
tracing::warn!(
tracing::info!(
"await_initial_logical_size: can't get semaphore cancel token, skipping"
);
debug_assert!(false);
}
tokio::select!(
@@ -4374,6 +4357,7 @@ impl Timeline {
/// - has an ancestor to detach from
/// - the ancestor does not have an ancestor -- follows from the original RFC limitations, not
/// a technical requirement
/// - has prev_lsn in remote storage (temporary restriction)
///
/// After the operation has been started, it cannot be canceled. Upon restart it needs to be
/// polled again until completion.
@@ -4507,8 +4491,9 @@ impl Timeline {
// deletion will happen later, the layer file manager calls garbage_collect_on_drop
guard.finish_compact_l0(&remove_layers, &insert_layers, &self.metrics);
self.remote_client
.schedule_compaction_update(&remove_layers, new_deltas)?;
if let Some(remote_client) = self.remote_client.as_ref() {
remote_client.schedule_compaction_update(&remove_layers, new_deltas)?;
}
drop_wlock(guard);
@@ -4526,8 +4511,9 @@ impl Timeline {
let upload_layers: Vec<_> = replace_layers.into_iter().map(|r| r.1).collect();
self.remote_client
.schedule_compaction_update(&drop_layers, &upload_layers)?;
if let Some(remote_client) = self.remote_client.as_ref() {
remote_client.schedule_compaction_update(&drop_layers, &upload_layers)?;
}
Ok(())
}
@@ -4537,14 +4523,16 @@ impl Timeline {
self: &Arc<Self>,
new_images: impl IntoIterator<Item = ResidentLayer>,
) -> anyhow::Result<()> {
let Some(remote_client) = &self.remote_client else {
return Ok(());
};
for layer in new_images {
self.remote_client.schedule_layer_file_upload(layer)?;
remote_client.schedule_layer_file_upload(layer)?;
}
// should any new image layer been created, not uploading index_part will
// result in a mismatch between remote_physical_size and layermap calculated
// size, which will fail some tests, but should not be an issue otherwise.
self.remote_client
.schedule_index_upload_for_file_changes()?;
remote_client.schedule_index_upload_for_file_changes()?;
Ok(())
}
@@ -4642,9 +4630,11 @@ impl Timeline {
pub(super) async fn gc(&self) -> anyhow::Result<GcResult> {
// this is most likely the background tasks, but it might be the spawned task from
// immediate_gc
let cancel = crate::task_mgr::shutdown_token();
let _g = tokio::select! {
guard = self.gc_lock.lock() => guard,
_ = self.cancel.cancelled() => return Ok(GcResult::default()),
_ = cancel.cancelled() => return Ok(GcResult::default()),
};
let timer = self.metrics.garbage_collect_histo.start_timer();
@@ -4741,7 +4731,7 @@ impl Timeline {
if l.get_lsn_range().end > horizon_cutoff {
debug!(
"keeping {} because it's newer than horizon_cutoff {}",
l.layer_name(),
l.filename(),
horizon_cutoff,
);
result.layers_needed_by_cutoff += 1;
@@ -4752,7 +4742,7 @@ impl Timeline {
if l.get_lsn_range().end > pitr_cutoff {
debug!(
"keeping {} because it's newer than pitr_cutoff {}",
l.layer_name(),
l.filename(),
pitr_cutoff,
);
result.layers_needed_by_pitr += 1;
@@ -4771,7 +4761,7 @@ impl Timeline {
if &l.get_lsn_range().start <= retain_lsn {
debug!(
"keeping {} because it's still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}",
l.layer_name(),
l.filename(),
retain_lsn,
l.is_incremental(),
);
@@ -4802,7 +4792,7 @@ impl Timeline {
if !layers
.image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))
{
debug!("keeping {} because it is the latest layer", l.layer_name());
debug!("keeping {} because it is the latest layer", l.filename());
result.layers_not_updated += 1;
continue 'outer;
}
@@ -4810,7 +4800,7 @@ impl Timeline {
// We didn't find any reason to keep this file, so remove it.
debug!(
"garbage collecting {} is_dropped: xx is_incremental: {}",
l.layer_name(),
l.filename(),
l.is_incremental(),
);
layers_to_remove.push(l);
@@ -4830,7 +4820,9 @@ impl Timeline {
result.layers_removed = gc_layers.len() as u64;
self.remote_client.schedule_gc_update(&gc_layers)?;
if let Some(remote_client) = self.remote_client.as_ref() {
remote_client.schedule_gc_update(&gc_layers)?;
}
guard.finish_gc_timeline(&gc_layers);
@@ -5214,7 +5206,7 @@ impl<'a> TimelineWriter<'a> {
let buf_size: u64 = buf.len().try_into().expect("oversized value buf");
let action = self.get_open_layer_action(lsn, buf_size);
let layer = self.handle_open_layer_action(lsn, action, ctx).await?;
let layer = self.handle_open_layer_action(lsn, action).await?;
let res = layer.put_value(key, lsn, &buf, ctx).await;
if res.is_ok() {
@@ -5237,15 +5229,14 @@ impl<'a> TimelineWriter<'a> {
&mut self,
at: Lsn,
action: OpenLayerAction,
ctx: &RequestContext,
) -> anyhow::Result<&Arc<InMemoryLayer>> {
match action {
OpenLayerAction::Roll => {
let freeze_at = self.write_guard.as_ref().unwrap().max_lsn.unwrap();
self.roll_layer(freeze_at).await?;
self.open_layer(at, ctx).await?;
self.open_layer(at).await?;
}
OpenLayerAction::Open => self.open_layer(at, ctx).await?,
OpenLayerAction::Open => self.open_layer(at).await?,
OpenLayerAction::None => {
assert!(self.write_guard.is_some());
}
@@ -5254,8 +5245,8 @@ impl<'a> TimelineWriter<'a> {
Ok(&self.write_guard.as_ref().unwrap().open_layer)
}
async fn open_layer(&mut self, at: Lsn, ctx: &RequestContext) -> anyhow::Result<()> {
let layer = self.tl.get_layer_for_write(at, ctx).await?;
async fn open_layer(&mut self, at: Lsn) -> anyhow::Result<()> {
let layer = self.tl.get_layer_for_write(at).await?;
let initial_size = layer.size().await?;
let last_freeze_at = self.last_freeze_at.load();
@@ -5332,14 +5323,10 @@ impl<'a> TimelineWriter<'a> {
Ok(())
}
pub(crate) async fn delete_batch(
&mut self,
batch: &[(Range<Key>, Lsn)],
ctx: &RequestContext,
) -> anyhow::Result<()> {
pub(crate) async fn delete_batch(&mut self, batch: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
if let Some((_, lsn)) = batch.first() {
let action = self.get_open_layer_action(*lsn, 0);
let layer = self.handle_open_layer_action(*lsn, action, ctx).await?;
let layer = self.handle_open_layer_action(*lsn, action).await?;
layer.put_tombstones(batch).await?;
}

View File

@@ -295,11 +295,13 @@ impl Timeline {
// Update the LayerMap so that readers will use the new layers, and enqueue it for writing to remote storage
self.rewrite_layers(replace_layers, drop_layers).await?;
// We wait for all uploads to complete before finishing this compaction stage. This is not
// necessary for correctness, but it simplifies testing, and avoids proceeding with another
// Timeline's compaction while this timeline's uploads may be generating lots of disk I/O
// load.
self.remote_client.wait_completion().await?;
if let Some(remote_client) = self.remote_client.as_ref() {
// We wait for all uploads to complete before finishing this compaction stage. This is not
// necessary for correctness, but it simplifies testing, and avoids proceeding with another
// Timeline's compaction while this timeline's uploads may be generating lots of disk I/O
// load.
remote_client.wait_completion().await?;
}
Ok(())
}
@@ -698,7 +700,6 @@ impl Timeline {
debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
lsn_range.clone()
},
ctx,
)
.await?,
);
@@ -754,7 +755,6 @@ impl Timeline {
&self
.conf
.timeline_path(&self.tenant_shard_id, &self.timeline_id),
ctx,
)
.await
.fatal_err("VirtualFile::open for timeline dir fsync");
@@ -1093,7 +1093,6 @@ impl CompactionJobExecutor for TimelineAdaptor {
self.timeline.tenant_shard_id,
key_range.start,
lsn_range.clone(),
ctx,
)
.await?;
@@ -1168,7 +1167,6 @@ impl TimelineAdaptor {
self.timeline.tenant_shard_id,
key_range,
lsn,
ctx,
)
.await?;

View File

@@ -26,21 +26,19 @@ use super::{Timeline, TimelineResources};
/// during attach or pageserver restart.
/// See comment in persist_index_part_with_deleted_flag.
async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
match timeline
.remote_client
.persist_index_part_with_deleted_flag()
.await
{
// If we (now, or already) marked it successfully as deleted, we can proceed
Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
// Bail out otherwise
//
// AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents
// two tasks from performing the deletion at the same time. The first task
// that starts deletion should run it to completion.
Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
| Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
if let Some(remote_client) = timeline.remote_client.as_ref() {
match remote_client.persist_index_part_with_deleted_flag().await {
// If we (now, or already) marked it successfully as deleted, we can proceed
Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
// Bail out otherwise
//
// AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents
// two tasks from performing the deletion at the same time. The first task
// that starts deletion should run it to completion.
Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
| Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
}
}
}
Ok(())
@@ -119,11 +117,11 @@ pub(super) async fn delete_local_timeline_directory(
/// Removes remote layers and an index file after them.
async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<()> {
timeline
.remote_client
.delete_all()
.await
.context("delete_all")
if let Some(remote_client) = &timeline.remote_client {
remote_client.delete_all().await.context("delete_all")?
};
Ok(())
}
// This function removs remaining traces of a timeline on disk.
@@ -262,7 +260,7 @@ impl DeleteTimelineFlow {
tenant: Arc<Tenant>,
timeline_id: TimelineId,
local_metadata: &TimelineMetadata,
remote_client: RemoteTimelineClient,
remote_client: Option<RemoteTimelineClient>,
deletion_queue_client: DeletionQueueClient,
) -> anyhow::Result<()> {
// Note: here we even skip populating layer map. Timeline is essentially uninitialized.
@@ -280,8 +278,6 @@ impl DeleteTimelineFlow {
// Important. We dont pass ancestor above because it can be missing.
// Thus we need to skip the validation here.
CreateTimelineCause::Delete,
// Aux file policy is not needed for deletion, assuming deletion does not read aux keyspace
None,
)
.context("create_timeline_struct")?;

View File

@@ -22,6 +22,8 @@ pub(crate) enum Error {
TooManyAncestors,
#[error("shutting down, please retry later")]
ShuttingDown,
#[error("detached timeline must receive writes before the operation")]
DetachedTimelineNeedsWrites,
#[error("flushing failed")]
FlushAncestor(#[source] anyhow::Error),
#[error("layer download failed")]
@@ -56,7 +58,7 @@ impl Default for Options {
fn default() -> Self {
Self {
rewrite_concurrency: std::num::NonZeroUsize::new(2).unwrap(),
copy_concurrency: std::num::NonZeroUsize::new(100).unwrap(),
copy_concurrency: std::num::NonZeroUsize::new(10).unwrap(),
}
}
}
@@ -70,6 +72,10 @@ pub(super) async fn prepare(
) -> Result<(completion::Completion, PreparedTimelineDetach), Error> {
use Error::*;
if detached.remote_client.as_ref().is_none() {
unimplemented!("no new code for running without remote storage");
}
let Some((ancestor, ancestor_lsn)) = detached
.ancestor_timeline
.as_ref()
@@ -88,6 +94,14 @@ pub(super) async fn prepare(
return Err(TooManyAncestors);
}
if detached.get_prev_record_lsn() == Lsn::INVALID
|| detached.disk_consistent_lsn.load() == ancestor_lsn
{
// this is to avoid a problem that after detaching we would be unable to start up the
// compute because of "PREV_LSN: invalid".
return Err(DetachedTimelineNeedsWrites);
}
// before we acquire the gate, we must mark the ancestor as having a detach operation
// ongoing which will block other concurrent detach operations so we don't get to ackward
// situations where there would be two branches trying to reparent earlier branches.
@@ -211,7 +225,6 @@ pub(super) async fn prepare(
&detached
.conf
.timeline_path(&detached.tenant_shard_id, &detached.timeline_id),
ctx,
)
.await
.fatal_err("VirtualFile::open for timeline dir fsync");
@@ -311,6 +324,8 @@ async fn upload_rewritten_layer(
// FIXME: better shuttingdown error
target
.remote_client
.as_ref()
.unwrap()
.upload_layer_file(&copied, cancel)
.await
.map_err(UploadRewritten)?;
@@ -334,7 +349,6 @@ async fn copy_lsn_prefix(
target_timeline.tenant_shard_id,
layer.layer_desc().key_range.start,
layer.layer_desc().lsn_range.start..end_lsn,
ctx,
)
.await
.map_err(CopyDeltaPrefix)?;
@@ -393,13 +407,15 @@ async fn remote_copy(
let owned = crate::tenant::storage_layer::Layer::for_evicted(
adoptee.conf,
adoptee,
adopted.layer_desc().layer_name(),
adopted.layer_desc().filename(),
metadata,
);
// FIXME: better shuttingdown error
adoptee
.remote_client
.as_ref()
.unwrap()
.copy_timeline_layer(adopted, &owned, cancel)
.await
.map(move |()| owned)
@@ -413,6 +429,11 @@ pub(super) async fn complete(
prepared: PreparedTimelineDetach,
_ctx: &RequestContext,
) -> Result<Vec<TimelineId>, anyhow::Error> {
let rtc = detached
.remote_client
.as_ref()
.expect("has to have a remote timeline client for timeline ancestor detach");
let PreparedTimelineDetach { layers } = prepared;
let ancestor = detached
@@ -429,13 +450,11 @@ pub(super) async fn complete(
//
// this is not perfect, but it avoids us a retry happening after a compaction or gc on restart
// which could give us a completely wrong layer combination.
detached
.remote_client
.schedule_adding_existing_layers_to_index_detach_and_wait(
&layers,
(ancestor.timeline_id, ancestor_lsn),
)
.await?;
rtc.schedule_adding_existing_layers_to_index_detach_and_wait(
&layers,
(ancestor.timeline_id, ancestor_lsn),
)
.await?;
let mut tasks = tokio::task::JoinSet::new();
@@ -480,6 +499,8 @@ pub(super) async fn complete(
async move {
let res = timeline
.remote_client
.as_ref()
.expect("reparented has to have remote client because detached has one")
.schedule_reparenting_and_wait(&new_parent)
.await;

View File

@@ -23,7 +23,7 @@ use std::{
use pageserver_api::models::{EvictionPolicy, EvictionPolicyLayerAccessThreshold};
use tokio::time::Instant;
use tokio_util::sync::CancellationToken;
use tracing::{debug, info, info_span, instrument, warn, Instrument};
use tracing::{debug, error, info, info_span, instrument, warn, Instrument};
use crate::{
context::{DownloadBehavior, RequestContext},
@@ -211,6 +211,11 @@ impl Timeline {
// So, we just need to deal with this.
if self.remote_client.is_none() {
error!("no remote storage configured, cannot evict layers");
return ControlFlow::Continue(());
}
let mut js = tokio::task::JoinSet::new();
{
let guard = self.layers.read().await;

View File

@@ -6,9 +6,10 @@ use crate::{
self,
index::{IndexPart, LayerFileMetadata},
},
storage_layer::LayerName,
storage_layer::LayerFileName,
Generation,
},
METADATA_FILE_NAME,
};
use anyhow::Context;
use camino::{Utf8Path, Utf8PathBuf};
@@ -19,13 +20,15 @@ use utils::lsn::Lsn;
/// Identified files in the timeline directory.
pub(super) enum Discovered {
/// The only one we care about
Layer(LayerName, Utf8PathBuf, u64),
Layer(LayerFileName, Utf8PathBuf, u64),
/// Old ephmeral files from previous launches, should be removed
Ephemeral(String),
/// Old temporary timeline files, unsure what these really are, should be removed
Temporary(String),
/// Temporary on-demand download files, should be removed
TemporaryDownload(String),
/// "metadata" file we persist locally and include in `index_part.json`
Metadata,
/// Backup file from previously future layers
IgnoredBackup,
/// Unrecognized, warn about these
@@ -40,13 +43,15 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result<Vec<Discovere
let direntry = direntry?;
let file_name = direntry.file_name().to_string();
let discovered = match LayerName::from_str(&file_name) {
let discovered = match LayerFileName::from_str(&file_name) {
Ok(file_name) => {
let file_size = direntry.metadata()?.len();
Discovered::Layer(file_name, direntry.path().to_owned(), file_size)
}
Err(_) => {
if file_name.ends_with(".old") {
if file_name == METADATA_FILE_NAME {
Discovered::Metadata
} else if file_name.ends_with(".old") {
// ignore these
Discovered::IgnoredBackup
} else if remote_timeline_client::is_temp_download_file(direntry.path()) {
@@ -67,28 +72,6 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result<Vec<Discovere
Ok(ret)
}
/// Whereas `LayerFileMetadata` describes the metadata we would store in remote storage,
/// this structure extends it with metadata describing the layer's presence in local storage.
#[derive(Clone, Debug)]
pub(super) struct LocalLayerFileMetadata {
pub(super) metadata: LayerFileMetadata,
pub(super) local_path: Utf8PathBuf,
}
impl LocalLayerFileMetadata {
pub fn new(
local_path: Utf8PathBuf,
file_size: u64,
generation: Generation,
shard: ShardIndex,
) -> Self {
Self {
local_path,
metadata: LayerFileMetadata::new(file_size, generation, shard),
}
}
}
/// Decision on what to do with a layer file after considering its local and remote metadata.
#[derive(Clone, Debug)]
pub(super) enum Decision {
@@ -97,11 +80,11 @@ pub(super) enum Decision {
/// The layer is present locally, but local metadata does not match remote; we must
/// delete it and treat it as evicted.
UseRemote {
local: LocalLayerFileMetadata,
local: LayerFileMetadata,
remote: LayerFileMetadata,
},
/// The layer is present locally, and metadata matches.
UseLocal(LocalLayerFileMetadata),
UseLocal(LayerFileMetadata),
}
/// A layer needs to be left out of the layer map.
@@ -109,29 +92,39 @@ pub(super) enum Decision {
pub(super) enum DismissedLayer {
/// The related layer is is in future compared to disk_consistent_lsn, it must not be loaded.
Future {
/// `None` if the layer is only known through [`IndexPart`].
local: Option<LocalLayerFileMetadata>,
/// The local metadata. `None` if the layer is only known through [`IndexPart`].
local: Option<LayerFileMetadata>,
},
/// The layer only exists locally.
///
/// In order to make crash safe updates to layer map, we must dismiss layers which are only
/// found locally or not yet included in the remote `index_part.json`.
LocalOnly(LocalLayerFileMetadata),
LocalOnly(LayerFileMetadata),
}
/// Merges local discoveries and remote [`IndexPart`] to a collection of decisions.
pub(super) fn reconcile(
discovered: Vec<(LayerName, Utf8PathBuf, u64)>,
discovered: Vec<(LayerFileName, Utf8PathBuf, u64)>,
index_part: Option<&IndexPart>,
disk_consistent_lsn: Lsn,
generation: Generation,
shard: ShardIndex,
) -> Vec<(LayerName, Result<Decision, DismissedLayer>)> {
) -> Vec<(
LayerFileName,
Option<Utf8PathBuf>,
Result<Decision, DismissedLayer>,
)> {
use Decision::*;
// name => (local_metadata, remote_metadata)
type Collected =
HashMap<LayerName, (Option<LocalLayerFileMetadata>, Option<LayerFileMetadata>)>;
// name => (local_path, local_metadata, remote_metadata)
type Collected = HashMap<
LayerFileName,
(
Option<Utf8PathBuf>,
Option<LayerFileMetadata>,
Option<LayerFileMetadata>,
),
>;
let mut discovered = discovered
.into_iter()
@@ -142,9 +135,8 @@ pub(super) fn reconcile(
// it is not in IndexPart, in which case using our current generation makes sense
// because it will be uploaded in this generation.
(
Some(LocalLayerFileMetadata::new(
local_path, file_size, generation, shard,
)),
Some(local_path),
Some(LayerFileMetadata::new(file_size, generation, shard)),
None,
),
)
@@ -160,20 +152,20 @@ pub(super) fn reconcile(
.map(|(name, metadata)| (name, LayerFileMetadata::from(metadata)))
.for_each(|(name, metadata)| {
if let Some(existing) = discovered.get_mut(name) {
existing.1 = Some(metadata);
existing.2 = Some(metadata);
} else {
discovered.insert(name.to_owned(), (None, Some(metadata)));
discovered.insert(name.to_owned(), (None, None, Some(metadata)));
}
});
discovered
.into_iter()
.map(|(name, (local, remote))| {
.map(|(name, (local_path, local, remote))| {
let decision = if name.is_in_future(disk_consistent_lsn) {
Err(DismissedLayer::Future { local })
} else {
match (local, remote) {
(Some(local), Some(remote)) if local.metadata != remote => {
(Some(local), Some(remote)) if local != remote => {
Ok(UseRemote { local, remote })
}
(Some(x), Some(_)) => Ok(UseLocal(x)),
@@ -185,7 +177,7 @@ pub(super) fn reconcile(
}
};
(name, decision)
(name, local_path, decision)
})
.collect::<Vec<_>>()
}
@@ -197,12 +189,12 @@ pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> anyhow::Result<()> {
}
pub(super) fn cleanup_local_file_for_remote(
local: &LocalLayerFileMetadata,
path: &Utf8Path,
local: &LayerFileMetadata,
remote: &LayerFileMetadata,
) -> anyhow::Result<()> {
let local_size = local.metadata.file_size();
let local_size = local.file_size();
let remote_size = remote.file_size();
let path = &local.local_path;
let file_name = path.file_name().expect("must be file path");
tracing::warn!("removing local file {file_name:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
@@ -219,7 +211,7 @@ pub(super) fn cleanup_local_file_for_remote(
pub(super) fn cleanup_future_layer(
path: &Utf8Path,
name: &LayerName,
name: &LayerFileName,
disk_consistent_lsn: Lsn,
) -> anyhow::Result<()> {
// future image layers are allowed to be produced always for not yet flushed to disk
@@ -231,14 +223,12 @@ pub(super) fn cleanup_future_layer(
}
pub(super) fn cleanup_local_only_file(
name: &LayerName,
local: &LocalLayerFileMetadata,
path: &Utf8Path,
name: &LayerFileName,
local: &LayerFileMetadata,
) -> anyhow::Result<()> {
let kind = name.kind();
tracing::info!(
"found local-only {kind} layer {name}, metadata {:?}",
local.metadata
);
std::fs::remove_file(&local.local_path)?;
tracing::info!("found local-only {kind} layer {name}, metadata {local:?}");
std::fs::remove_file(path)?;
Ok(())
}

View File

@@ -9,7 +9,6 @@ use utils::{
use crate::{
config::PageServerConf,
context::RequestContext,
metrics::TimelineMetrics,
tenant::{
layer_map::{BatchedUpdates, LayerMap},
@@ -70,7 +69,6 @@ impl LayerManager {
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
ctx: &RequestContext,
) -> Result<Arc<InMemoryLayer>> {
ensure!(lsn.is_aligned());
@@ -107,7 +105,7 @@ impl LayerManager {
);
let new_layer =
InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn, ctx).await?;
InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn).await?;
let layer = Arc::new(new_layer);
self.layer_map.open_layer = Some(layer.clone());
@@ -296,7 +294,7 @@ impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
// A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
self.0
.get(&desc.key())
.with_context(|| format!("get layer from desc: {}", desc.layer_name()))
.with_context(|| format!("get layer from desc: {}", desc.filename()))
.expect("not found")
.clone()
}

View File

@@ -1,14 +1,12 @@
use super::storage_layer::LayerName;
use super::storage_layer::LayerFileName;
use super::storage_layer::ResidentLayer;
use crate::tenant::metadata::TimelineMetadata;
use crate::tenant::remote_timeline_client::index::IndexPart;
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
use crate::tenant::remote_timeline_client::index::Lineage;
use std::collections::{HashMap, VecDeque};
use std::fmt::Debug;
use chrono::NaiveDateTime;
use pageserver_api::models::AuxFilePolicy;
use std::sync::Arc;
use tracing::info;
use utils::lsn::AtomicLsn;
@@ -47,7 +45,7 @@ pub(crate) struct UploadQueueInitialized {
/// All layer files stored in the remote storage, taking into account all
/// in-progress and queued operations
pub(crate) latest_files: HashMap<LayerName, LayerFileMetadata>,
pub(crate) latest_files: HashMap<LayerFileName, LayerFileMetadata>,
/// How many file uploads or deletions been scheduled, since the
/// last (scheduling of) metadata index upload?
@@ -58,12 +56,6 @@ pub(crate) struct UploadQueueInitialized {
/// DANGER: do not return to outside world, e.g., safekeepers.
pub(crate) latest_metadata: TimelineMetadata,
/// Part of the flattened "next" `index_part.json`.
pub(crate) latest_lineage: Lineage,
/// The last aux file policy used on this timeline.
pub(crate) last_aux_file_policy: Option<AuxFilePolicy>,
/// `disk_consistent_lsn` from the last metadata file that was successfully
/// uploaded. `Lsn(0)` if nothing was uploaded yet.
/// Unlike `latest_files` or `latest_metadata`, this value is never ahead.
@@ -97,7 +89,7 @@ pub(crate) struct UploadQueueInitialized {
/// Putting this behind a testing feature to catch problems in tests, but assuming we could have a
/// bug causing leaks, then it's better to not leave this enabled for production builds.
#[cfg(feature = "testing")]
pub(crate) dangling_files: HashMap<LayerName, Generation>,
pub(crate) dangling_files: HashMap<LayerFileName, Generation>,
/// Set to true when we have inserted the `UploadOp::Shutdown` into the `inprogress_tasks`.
pub(crate) shutting_down: bool,
@@ -179,7 +171,6 @@ impl UploadQueue {
latest_files: HashMap::new(),
latest_files_changes_since_metadata_upload_scheduled: 0,
latest_metadata: metadata.clone(),
latest_lineage: Lineage::default(),
projected_remote_consistent_lsn: None,
visible_remote_consistent_lsn: Arc::new(AtomicLsn::new(0)),
// what follows are boring default initializations
@@ -193,7 +184,6 @@ impl UploadQueue {
dangling_files: HashMap::new(),
shutting_down: false,
shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
last_aux_file_policy: Default::default(),
};
*self = UploadQueue::Initialized(state);
@@ -228,7 +218,6 @@ impl UploadQueue {
latest_files: files,
latest_files_changes_since_metadata_upload_scheduled: 0,
latest_metadata: index_part.metadata.clone(),
latest_lineage: index_part.lineage.clone(),
projected_remote_consistent_lsn: Some(index_part.metadata.disk_consistent_lsn()),
visible_remote_consistent_lsn: Arc::new(
index_part.metadata.disk_consistent_lsn().into(),
@@ -244,7 +233,6 @@ impl UploadQueue {
dangling_files: HashMap::new(),
shutting_down: false,
shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
last_aux_file_policy: index_part.last_aux_file_policy(),
};
*self = UploadQueue::Initialized(state);
@@ -293,7 +281,7 @@ pub(crate) struct UploadTask {
/// for timeline deletion, which skips this queue and goes directly to DeletionQueue.
#[derive(Debug)]
pub(crate) struct Delete {
pub(crate) layers: Vec<(LayerName, LayerFileMetadata)>,
pub(crate) layers: Vec<(LayerFileName, LayerFileMetadata)>,
}
#[derive(Debug)]
@@ -302,7 +290,7 @@ pub(crate) enum UploadOp {
UploadLayer(ResidentLayer, LayerFileMetadata),
/// Upload the metadata file
UploadMetadata(Box<IndexPart>, Lsn),
UploadMetadata(IndexPart, Lsn),
/// Delete layer files
Delete(Delete),

View File

@@ -23,7 +23,6 @@ use pageserver_api::key::Key;
use utils::lsn::Lsn;
use utils::vec_map::VecMap;
use crate::context::RequestContext;
use crate::virtual_file::VirtualFile;
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
@@ -286,7 +285,6 @@ impl<'a> VectoredBlobReader<'a> {
&self,
read: &VectoredRead,
buf: BytesMut,
ctx: &RequestContext,
) -> Result<VectoredBlobsBuf, std::io::Error> {
assert!(read.size() > 0);
assert!(
@@ -297,7 +295,7 @@ impl<'a> VectoredBlobReader<'a> {
);
let buf = self
.file
.read_exact_at_n(buf, read.start, read.size(), ctx)
.read_exact_at_n(buf, read.start, read.size())
.await?;
let blobs_at = read.blobs_at.as_slice();

View File

@@ -344,23 +344,16 @@ macro_rules! with_file {
impl VirtualFile {
/// Open a file in read-only mode. Like File::open.
pub async fn open(
path: &Utf8Path,
ctx: &RequestContext,
) -> Result<VirtualFile, std::io::Error> {
Self::open_with_options(path, OpenOptions::new().read(true), ctx).await
pub async fn open(path: &Utf8Path) -> Result<VirtualFile, std::io::Error> {
Self::open_with_options(path, OpenOptions::new().read(true)).await
}
/// Create a new file for writing. If the file exists, it will be truncated.
/// Like File::create.
pub async fn create(
path: &Utf8Path,
ctx: &RequestContext,
) -> Result<VirtualFile, std::io::Error> {
pub async fn create(path: &Utf8Path) -> Result<VirtualFile, std::io::Error> {
Self::open_with_options(
path,
OpenOptions::new().write(true).create(true).truncate(true),
ctx,
)
.await
}
@@ -373,7 +366,6 @@ impl VirtualFile {
pub async fn open_with_options(
path: &Utf8Path,
open_options: &OpenOptions,
_ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */
) -> Result<VirtualFile, std::io::Error> {
let path_str = path.to_string();
let parts = path_str.split('/').collect::<Vec<&str>>();
@@ -584,34 +576,21 @@ impl VirtualFile {
Ok(self.pos)
}
pub async fn read_exact_at<B>(
&self,
buf: B,
offset: u64,
ctx: &RequestContext,
) -> Result<B, Error>
pub async fn read_exact_at<B>(&self, buf: B, offset: u64) -> Result<B, Error>
where
B: IoBufMut + Send,
{
let (buf, res) = read_exact_at_impl(buf, offset, None, |buf, offset| {
self.read_at(buf, offset, ctx)
})
.await;
let (buf, res) =
read_exact_at_impl(buf, offset, None, |buf, offset| self.read_at(buf, offset)).await;
res.map(|()| buf)
}
pub async fn read_exact_at_n<B>(
&self,
buf: B,
offset: u64,
count: usize,
ctx: &RequestContext,
) -> Result<B, Error>
pub async fn read_exact_at_n<B>(&self, buf: B, offset: u64, count: usize) -> Result<B, Error>
where
B: IoBufMut + Send,
{
let (buf, res) = read_exact_at_impl(buf, offset, Some(count), |buf, offset| {
self.read_at(buf, offset, ctx)
self.read_at(buf, offset)
})
.await;
res.map(|()| buf)
@@ -622,13 +601,12 @@ impl VirtualFile {
&self,
page: PageWriteGuard<'static>,
offset: u64,
ctx: &RequestContext,
) -> Result<PageWriteGuard<'static>, Error> {
let buf = PageWriteGuardBuf {
page,
init_up_to: 0,
};
let res = self.read_exact_at(buf, offset, ctx).await;
let res = self.read_exact_at(buf, offset).await;
res.map(|PageWriteGuardBuf { page, .. }| page)
.map_err(|e| Error::new(ErrorKind::Other, e))
}
@@ -721,12 +699,7 @@ impl VirtualFile {
(buf, Ok(n))
}
pub(crate) async fn read_at<B>(
&self,
buf: B,
offset: u64,
_ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
) -> (B, Result<usize, Error>)
pub(crate) async fn read_at<B>(&self, buf: B, offset: u64) -> (B, Result<usize, Error>)
where
B: tokio_epoll_uring::BoundedBufMut + Send,
{
@@ -1047,21 +1020,20 @@ impl VirtualFile {
pub(crate) async fn read_blk(
&self,
blknum: u32,
ctx: &RequestContext,
) -> Result<crate::tenant::block_io::BlockLease<'_>, std::io::Error> {
use crate::page_cache::PAGE_SZ;
let buf = vec![0; PAGE_SZ];
let buf = self
.read_exact_at(buf, blknum as u64 * (PAGE_SZ as u64), ctx)
.read_exact_at(buf, blknum as u64 * (PAGE_SZ as u64))
.await?;
Ok(crate::tenant::block_io::BlockLease::Vec(buf))
}
async fn read_to_end(&mut self, buf: &mut Vec<u8>, ctx: &RequestContext) -> Result<(), Error> {
async fn read_to_end(&mut self, buf: &mut Vec<u8>) -> Result<(), Error> {
let mut tmp = vec![0; 128];
loop {
let res;
(tmp, res) = self.read_at(tmp, self.pos, ctx).await;
(tmp, res) = self.read_at(tmp, self.pos).await;
match res {
Ok(0) => return Ok(()),
Ok(n) => {
@@ -1187,6 +1159,7 @@ mod tests {
use rand::seq::SliceRandom;
use rand::thread_rng;
use rand::Rng;
use std::future::Future;
use std::io::Write;
use std::os::unix::fs::FileExt;
use std::sync::Arc;
@@ -1203,14 +1176,9 @@ mod tests {
}
impl MaybeVirtualFile {
async fn read_exact_at(
&self,
mut buf: Vec<u8>,
offset: u64,
ctx: &RequestContext,
) -> Result<Vec<u8>, Error> {
async fn read_exact_at(&self, mut buf: Vec<u8>, offset: u64) -> Result<Vec<u8>, Error> {
match self {
MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(buf, offset, ctx).await,
MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(buf, offset).await,
MaybeVirtualFile::File(file) => file.read_exact_at(&mut buf, offset).map(|()| buf),
}
}
@@ -1262,13 +1230,13 @@ mod tests {
// Helper function to slurp contents of a file, starting at the current position,
// into a string
async fn read_string(&mut self, ctx: &RequestContext) -> Result<String, Error> {
async fn read_string(&mut self) -> Result<String, Error> {
use std::io::Read;
let mut buf = String::new();
match self {
MaybeVirtualFile::VirtualFile(file) => {
let mut buf = Vec::new();
file.read_to_end(&mut buf, ctx).await?;
file.read_to_end(&mut buf).await?;
return Ok(String::from_utf8(buf).unwrap());
}
MaybeVirtualFile::File(file) => {
@@ -1279,14 +1247,9 @@ mod tests {
}
// Helper function to slurp a portion of a file into a string
async fn read_string_at(
&mut self,
pos: u64,
len: usize,
ctx: &RequestContext,
) -> Result<String, Error> {
async fn read_string_at(&mut self, pos: u64, len: usize) -> Result<String, Error> {
let buf = vec![0; len];
let buf = self.read_exact_at(buf, pos, ctx).await?;
let buf = self.read_exact_at(buf, pos).await?;
Ok(String::from_utf8(buf).unwrap())
}
}
@@ -1300,101 +1263,73 @@ mod tests {
// results with VirtualFiles as with native Files. (Except that with
// native files, you will run out of file descriptors if the ulimit
// is low enough.)
struct A;
impl Adapter for A {
async fn open(
path: Utf8PathBuf,
opts: OpenOptions,
ctx: &RequestContext,
) -> Result<MaybeVirtualFile, anyhow::Error> {
let vf = VirtualFile::open_with_options(&path, &opts, ctx).await?;
Ok(MaybeVirtualFile::VirtualFile(vf))
}
}
test_files::<A>("virtual_files").await
test_files("virtual_files", |path, open_options| async move {
let vf = VirtualFile::open_with_options(&path, &open_options).await?;
Ok(MaybeVirtualFile::VirtualFile(vf))
})
.await
}
#[tokio::test]
async fn test_physical_files() -> anyhow::Result<()> {
struct B;
impl Adapter for B {
async fn open(
path: Utf8PathBuf,
opts: OpenOptions,
_ctx: &RequestContext,
) -> Result<MaybeVirtualFile, anyhow::Error> {
Ok(MaybeVirtualFile::File({
let owned_fd = opts.open(path.as_std_path()).await?;
File::from(owned_fd)
}))
}
}
test_files::<B>("physical_files").await
test_files("physical_files", |path, open_options| async move {
Ok(MaybeVirtualFile::File({
let owned_fd = open_options.open(path.as_std_path()).await?;
File::from(owned_fd)
}))
})
.await
}
/// This is essentially a closure which returns a MaybeVirtualFile, but because rust edition
/// 2024 is not yet out with new lifetime capture or outlives rules, this is a async function
/// in trait which benefits from the new lifetime capture rules already.
trait Adapter {
async fn open(
path: Utf8PathBuf,
opts: OpenOptions,
ctx: &RequestContext,
) -> Result<MaybeVirtualFile, anyhow::Error>;
}
async fn test_files<A>(testname: &str) -> anyhow::Result<()>
async fn test_files<OF, FT>(testname: &str, openfunc: OF) -> anyhow::Result<()>
where
A: Adapter,
OF: Fn(Utf8PathBuf, OpenOptions) -> FT,
FT: Future<Output = Result<MaybeVirtualFile, std::io::Error>>,
{
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
let testdir = crate::config::PageServerConf::test_repo_dir(testname);
std::fs::create_dir_all(&testdir)?;
let path_a = testdir.join("file_a");
let mut file_a = A::open(
let mut file_a = openfunc(
path_a.clone(),
OpenOptions::new()
.write(true)
.create(true)
.truncate(true)
.to_owned(),
&ctx,
)
.await?;
file_a.write_all(b"foobar".to_vec(), &ctx).await?;
// cannot read from a file opened in write-only mode
let _ = file_a.read_string(&ctx).await.unwrap_err();
let _ = file_a.read_string().await.unwrap_err();
// Close the file and re-open for reading
let mut file_a = A::open(path_a, OpenOptions::new().read(true).to_owned(), &ctx).await?;
let mut file_a = openfunc(path_a, OpenOptions::new().read(true).to_owned()).await?;
// cannot write to a file opened in read-only mode
let _ = file_a.write_all(b"bar".to_vec(), &ctx).await.unwrap_err();
// Try simple read
assert_eq!("foobar", file_a.read_string(&ctx).await?);
assert_eq!("foobar", file_a.read_string().await?);
// It's positioned at the EOF now.
assert_eq!("", file_a.read_string(&ctx).await?);
assert_eq!("", file_a.read_string().await?);
// Test seeks.
assert_eq!(file_a.seek(SeekFrom::Start(1)).await?, 1);
assert_eq!("oobar", file_a.read_string(&ctx).await?);
assert_eq!("oobar", file_a.read_string().await?);
assert_eq!(file_a.seek(SeekFrom::End(-2)).await?, 4);
assert_eq!("ar", file_a.read_string(&ctx).await?);
assert_eq!("ar", file_a.read_string().await?);
assert_eq!(file_a.seek(SeekFrom::Start(1)).await?, 1);
assert_eq!(file_a.seek(SeekFrom::Current(2)).await?, 3);
assert_eq!("bar", file_a.read_string(&ctx).await?);
assert_eq!("bar", file_a.read_string().await?);
assert_eq!(file_a.seek(SeekFrom::Current(-5)).await?, 1);
assert_eq!("oobar", file_a.read_string(&ctx).await?);
assert_eq!("oobar", file_a.read_string().await?);
// Test erroneous seeks to before byte 0
file_a.seek(SeekFrom::End(-7)).await.unwrap_err();
@@ -1402,11 +1337,11 @@ mod tests {
file_a.seek(SeekFrom::Current(-2)).await.unwrap_err();
// the erroneous seek should have left the position unchanged
assert_eq!("oobar", file_a.read_string(&ctx).await?);
assert_eq!("oobar", file_a.read_string().await?);
// Create another test file, and try FileExt functions on it.
let path_b = testdir.join("file_b");
let mut file_b = A::open(
let mut file_b = openfunc(
path_b.clone(),
OpenOptions::new()
.read(true)
@@ -1414,13 +1349,12 @@ mod tests {
.create(true)
.truncate(true)
.to_owned(),
&ctx,
)
.await?;
file_b.write_all_at(b"BAR".to_vec(), 3, &ctx).await?;
file_b.write_all_at(b"FOO".to_vec(), 0, &ctx).await?;
assert_eq!(file_b.read_string_at(2, 3, &ctx).await?, "OBA");
assert_eq!(file_b.read_string_at(2, 3).await?, "OBA");
// Open a lot of files, enough to cause some evictions. (Or to be precise,
// open the same file many times. The effect is the same.)
@@ -1430,13 +1364,9 @@ mod tests {
let mut vfiles = Vec::new();
for _ in 0..100 {
let mut vfile = A::open(
path_b.clone(),
OpenOptions::new().read(true).to_owned(),
&ctx,
)
.await?;
assert_eq!("FOOBAR", vfile.read_string(&ctx).await?);
let mut vfile =
openfunc(path_b.clone(), OpenOptions::new().read(true).to_owned()).await?;
assert_eq!("FOOBAR", vfile.read_string().await?);
vfiles.push(vfile);
}
@@ -1445,13 +1375,13 @@ mod tests {
// The underlying file descriptor for 'file_a' should be closed now. Try to read
// from it again. We left the file positioned at offset 1 above.
assert_eq!("oobar", file_a.read_string(&ctx).await?);
assert_eq!("oobar", file_a.read_string().await?);
// Check that all the other FDs still work too. Use them in random order for
// good measure.
vfiles.as_mut_slice().shuffle(&mut thread_rng());
for vfile in vfiles.iter_mut() {
assert_eq!("OOBAR", vfile.read_string_at(1, 5, &ctx).await?);
assert_eq!("OOBAR", vfile.read_string_at(1, 5).await?);
}
Ok(())
@@ -1467,7 +1397,6 @@ mod tests {
const THREADS: usize = 100;
const SAMPLE: [u8; SIZE] = [0xADu8; SIZE];
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
let testdir = crate::config::PageServerConf::test_repo_dir("vfile_concurrency");
std::fs::create_dir_all(&testdir)?;
@@ -1481,12 +1410,8 @@ mod tests {
// Open the file many times.
let mut files = Vec::new();
for _ in 0..VIRTUAL_FILES {
let f = VirtualFile::open_with_options(
&test_file_path,
OpenOptions::new().read(true),
&ctx,
)
.await?;
let f = VirtualFile::open_with_options(&test_file_path, OpenOptions::new().read(true))
.await?;
files.push(f);
}
let files = Arc::new(files);
@@ -1500,13 +1425,12 @@ mod tests {
let mut hdls = Vec::new();
for _threadno in 0..THREADS {
let files = files.clone();
let ctx = ctx.detached_child(TaskKind::UnitTest, DownloadBehavior::Error);
let hdl = rt.spawn(async move {
let mut buf = vec![0u8; SIZE];
let mut rng = rand::rngs::OsRng;
for _ in 1..1000 {
let f = &files[rng.gen_range(0..files.len())];
buf = f.read_exact_at(buf, 0, &ctx).await.unwrap();
buf = f.read_exact_at(buf, 0).await.unwrap();
assert!(buf == SAMPLE);
}
});
@@ -1522,7 +1446,6 @@ mod tests {
#[tokio::test]
async fn test_atomic_overwrite_basic() {
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_basic");
std::fs::create_dir_all(&testdir).unwrap();
@@ -1532,8 +1455,8 @@ mod tests {
VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
.await
.unwrap();
let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap());
let post = file.read_string(&ctx).await.unwrap();
let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
let post = file.read_string().await.unwrap();
assert_eq!(post, "foo");
assert!(!tmp_path.exists());
drop(file);
@@ -1541,8 +1464,8 @@ mod tests {
VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec())
.await
.unwrap();
let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap());
let post = file.read_string(&ctx).await.unwrap();
let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
let post = file.read_string().await.unwrap();
assert_eq!(post, "bar");
assert!(!tmp_path.exists());
drop(file);
@@ -1550,7 +1473,6 @@ mod tests {
#[tokio::test]
async fn test_atomic_overwrite_preexisting_tmp() {
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
let testdir =
crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_preexisting_tmp");
std::fs::create_dir_all(&testdir).unwrap();
@@ -1565,8 +1487,8 @@ mod tests {
.await
.unwrap();
let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap());
let post = file.read_string(&ctx).await.unwrap();
let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
let post = file.read_string().await.unwrap();
assert_eq!(post, "foo");
assert!(!tmp_path.exists());
drop(file);

View File

@@ -153,7 +153,10 @@ impl PostgresRedoManager {
process: self
.redo_process
.get()
.map(|p| WalRedoManagerProcessStatus { pid: p.id() }),
.map(|p| WalRedoManagerProcessStatus {
pid: p.id(),
kind: std::borrow::Cow::Borrowed(p.kind().into()),
}),
}
}
}

View File

@@ -1,10 +1,7 @@
/// Layer of indirection previously used to support multiple implementations.
/// Subject to removal: <https://github.com/neondatabase/neon/issues/7753>
use std::time::Duration;
use bytes::Bytes;
use pageserver_api::{reltag::RelTag, shard::TenantShardId};
use tracing::warn;
use utils::lsn::Lsn;
use crate::{config::PageServerConf, walrecord::NeonWalRecord};
@@ -15,6 +12,7 @@ mod protocol;
mod process_impl {
pub(super) mod process_async;
pub(super) mod process_std;
}
#[derive(
@@ -36,7 +34,10 @@ pub enum Kind {
Async,
}
pub(crate) struct Process(process_impl::process_async::WalRedoProcess);
pub(crate) enum Process {
Sync(process_impl::process_std::WalRedoProcess),
Async(process_impl::process_async::WalRedoProcess),
}
impl Process {
#[inline(always)]
@@ -45,17 +46,18 @@ impl Process {
tenant_shard_id: TenantShardId,
pg_version: u32,
) -> anyhow::Result<Self> {
if conf.walredo_process_kind != Kind::Async {
warn!(
configured = %conf.walredo_process_kind,
"the walredo_process_kind setting has been turned into a no-op, using async implementation"
);
}
Ok(Self(process_impl::process_async::WalRedoProcess::launch(
conf,
tenant_shard_id,
pg_version,
)?))
Ok(match conf.walredo_process_kind {
Kind::Sync => Self::Sync(process_impl::process_std::WalRedoProcess::launch(
conf,
tenant_shard_id,
pg_version,
)?),
Kind::Async => Self::Async(process_impl::process_async::WalRedoProcess::launch(
conf,
tenant_shard_id,
pg_version,
)?),
})
}
#[inline(always)]
@@ -67,12 +69,29 @@ impl Process {
records: &[(Lsn, NeonWalRecord)],
wal_redo_timeout: Duration,
) -> anyhow::Result<Bytes> {
self.0
.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
.await
match self {
Process::Sync(p) => {
p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
.await
}
Process::Async(p) => {
p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
.await
}
}
}
pub(crate) fn id(&self) -> u32 {
self.0.id()
match self {
Process::Sync(p) => p.id(),
Process::Async(p) => p.id(),
}
}
pub(crate) fn kind(&self) -> Kind {
match self {
Process::Sync(_) => Kind::Sync,
Process::Async(_) => Kind::Async,
}
}
}

View File

@@ -0,0 +1,405 @@
use self::no_leak_child::NoLeakChild;
use crate::{
config::PageServerConf,
metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
walrecord::NeonWalRecord,
walredo::process::{no_leak_child, protocol},
};
use anyhow::Context;
use bytes::Bytes;
use nix::poll::{PollFd, PollFlags};
use pageserver_api::{reltag::RelTag, shard::TenantShardId};
use postgres_ffi::BLCKSZ;
use std::os::fd::AsRawFd;
#[cfg(feature = "testing")]
use std::sync::atomic::AtomicUsize;
use std::{
collections::VecDeque,
io::{Read, Write},
process::{ChildStdin, ChildStdout, Command, Stdio},
sync::{Mutex, MutexGuard},
time::Duration,
};
use tracing::{debug, error, instrument, Instrument};
use utils::{lsn::Lsn, nonblock::set_nonblock};
pub struct WalRedoProcess {
#[allow(dead_code)]
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
// Some() on construction, only becomes None on Drop.
child: Option<NoLeakChild>,
stdout: Mutex<ProcessOutput>,
stdin: Mutex<ProcessInput>,
/// Counter to separate same sized walredo inputs failing at the same millisecond.
#[cfg(feature = "testing")]
dump_sequence: AtomicUsize,
}
struct ProcessInput {
stdin: ChildStdin,
n_requests: usize,
}
struct ProcessOutput {
stdout: ChildStdout,
pending_responses: VecDeque<Option<Bytes>>,
n_processed_responses: usize,
}
impl WalRedoProcess {
//
// Start postgres binary in special WAL redo mode.
//
#[instrument(skip_all,fields(pg_version=pg_version))]
pub(crate) fn launch(
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
pg_version: u32,
) -> anyhow::Result<Self> {
crate::span::debug_assert_current_span_has_tenant_id();
let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
use no_leak_child::NoLeakChildCommandExt;
// Start postgres itself
let child = Command::new(pg_bin_dir_path.join("postgres"))
// the first arg must be --wal-redo so the child process enters into walredo mode
.arg("--wal-redo")
// the child doesn't process this arg, but, having it in the argv helps indentify the
// walredo process for a particular tenant when debugging a pagserver
.args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
.stdin(Stdio::piped())
.stderr(Stdio::piped())
.stdout(Stdio::piped())
.env_clear()
.env("LD_LIBRARY_PATH", &pg_lib_dir_path)
.env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
// NB: The redo process is not trusted after we sent it the first
// walredo work. Before that, it is trusted. Specifically, we trust
// it to
// 1. close all file descriptors except stdin, stdout, stderr because
// pageserver might not be 100% diligent in setting FD_CLOEXEC on all
// the files it opens, and
// 2. to use seccomp to sandbox itself before processing the first
// walredo request.
.spawn_no_leak_child(tenant_shard_id)
.context("spawn process")?;
WAL_REDO_PROCESS_COUNTERS.started.inc();
let mut child = scopeguard::guard(child, |child| {
error!("killing wal-redo-postgres process due to a problem during launch");
child.kill_and_wait(WalRedoKillCause::Startup);
});
let stdin = child.stdin.take().unwrap();
let stdout = child.stdout.take().unwrap();
let stderr = child.stderr.take().unwrap();
let stderr = tokio::process::ChildStderr::from_std(stderr)
.context("convert to tokio::ChildStderr")?;
macro_rules! set_nonblock_or_log_err {
($file:ident) => {{
let res = set_nonblock($file.as_raw_fd());
if let Err(e) = &res {
error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed");
}
res
}};
}
set_nonblock_or_log_err!(stdin)?;
set_nonblock_or_log_err!(stdout)?;
// all fallible operations post-spawn are complete, so get rid of the guard
let child = scopeguard::ScopeGuard::into_inner(child);
tokio::spawn(
async move {
scopeguard::defer! {
debug!("wal-redo-postgres stderr_logger_task finished");
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
}
debug!("wal-redo-postgres stderr_logger_task started");
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
use tokio::io::AsyncBufReadExt;
let mut stderr_lines = tokio::io::BufReader::new(stderr);
let mut buf = Vec::new();
let res = loop {
buf.clear();
// TODO we don't trust the process to cap its stderr length.
// Currently it can do unbounded Vec allocation.
match stderr_lines.read_until(b'\n', &mut buf).await {
Ok(0) => break Ok(()), // eof
Ok(num_bytes) => {
let output = String::from_utf8_lossy(&buf[..num_bytes]);
error!(%output, "received output");
}
Err(e) => {
break Err(e);
}
}
};
match res {
Ok(()) => (),
Err(e) => {
error!(error=?e, "failed to read from walredo stderr");
}
}
}.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
);
Ok(Self {
conf,
tenant_shard_id,
child: Some(child),
stdin: Mutex::new(ProcessInput {
stdin,
n_requests: 0,
}),
stdout: Mutex::new(ProcessOutput {
stdout,
pending_responses: VecDeque::new(),
n_processed_responses: 0,
}),
#[cfg(feature = "testing")]
dump_sequence: AtomicUsize::default(),
})
}
pub(crate) fn id(&self) -> u32 {
self.child
.as_ref()
.expect("must not call this during Drop")
.id()
}
// Apply given WAL records ('records') over an old page image. Returns
// new page image.
//
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
pub(crate) async fn apply_wal_records(
&self,
rel: RelTag,
blknum: u32,
base_img: &Option<Bytes>,
records: &[(Lsn, NeonWalRecord)],
wal_redo_timeout: Duration,
) -> anyhow::Result<Bytes> {
let tag = protocol::BufferTag { rel, blknum };
let input = self.stdin.lock().unwrap();
// Serialize all the messages to send the WAL redo process first.
//
// This could be problematic if there are millions of records to replay,
// but in practice the number of records is usually so small that it doesn't
// matter, and it's better to keep this code simple.
//
// Most requests start with a before-image with BLCKSZ bytes, followed by
// by some other WAL records. Start with a buffer that can hold that
// comfortably.
let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
protocol::build_begin_redo_for_block_msg(tag, &mut writebuf);
if let Some(img) = base_img {
protocol::build_push_page_msg(tag, img, &mut writebuf);
}
for (lsn, rec) in records.iter() {
if let NeonWalRecord::Postgres {
will_init: _,
rec: postgres_rec,
} = rec
{
protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
} else {
anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
}
}
protocol::build_get_page_msg(tag, &mut writebuf);
WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout);
if res.is_err() {
// not all of these can be caused by this particular input, however these are so rare
// in tests so capture all.
self.record_and_log(&writebuf);
}
res
}
fn apply_wal_records0(
&self,
writebuf: &[u8],
input: MutexGuard<ProcessInput>,
wal_redo_timeout: Duration,
) -> anyhow::Result<Bytes> {
let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small.
let mut nwrite = 0usize;
while nwrite < writebuf.len() {
let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)];
let n = loop {
match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) {
Err(nix::errno::Errno::EINTR) => continue,
res => break res,
}
}?;
if n == 0 {
anyhow::bail!("WAL redo timed out");
}
// If 'stdin' is writeable, do write.
let in_revents = stdin_pollfds[0].revents().unwrap();
if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
nwrite += proc.stdin.write(&writebuf[nwrite..])?;
}
if in_revents.contains(PollFlags::POLLHUP) {
// We still have more data to write, but the process closed the pipe.
anyhow::bail!("WAL redo process closed its stdin unexpectedly");
}
}
let request_no = proc.n_requests;
proc.n_requests += 1;
drop(proc);
// To improve walredo performance we separate sending requests and receiving
// responses. Them are protected by different mutexes (output and input).
// If thread T1, T2, T3 send requests D1, D2, D3 to walredo process
// then there is not warranty that T1 will first granted output mutex lock.
// To address this issue we maintain number of sent requests, number of processed
// responses and ring buffer with pending responses. After sending response
// (under input mutex), threads remembers request number. Then it releases
// input mutex, locks output mutex and fetch in ring buffer all responses until
// its stored request number. The it takes correspondent element from
// pending responses ring buffer and truncate all empty elements from the front,
// advancing processed responses number.
let mut output = self.stdout.lock().unwrap();
let n_processed_responses = output.n_processed_responses;
while n_processed_responses + output.pending_responses.len() <= request_no {
// We expect the WAL redo process to respond with an 8k page image. We read it
// into this buffer.
let mut resultbuf = vec![0; BLCKSZ.into()];
let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
while nresult < BLCKSZ.into() {
let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)];
// We do two things simultaneously: reading response from stdout
// and forward any logging information that the child writes to its stderr to the page server's log.
let n = loop {
match nix::poll::poll(
&mut stdout_pollfds[..],
wal_redo_timeout.as_millis() as i32,
) {
Err(nix::errno::Errno::EINTR) => continue,
res => break res,
}
}?;
if n == 0 {
anyhow::bail!("WAL redo timed out");
}
// If we have some data in stdout, read it to the result buffer.
let out_revents = stdout_pollfds[0].revents().unwrap();
if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
nresult += output.stdout.read(&mut resultbuf[nresult..])?;
}
if out_revents.contains(PollFlags::POLLHUP) {
anyhow::bail!("WAL redo process closed its stdout unexpectedly");
}
}
output
.pending_responses
.push_back(Some(Bytes::from(resultbuf)));
}
// Replace our request's response with None in `pending_responses`.
// Then make space in the ring buffer by clearing out any seqence of contiguous
// `None`'s from the front of `pending_responses`.
// NB: We can't pop_front() because other requests' responses because another
// requester might have grabbed the output mutex before us:
// T1: grab input mutex
// T1: send request_no 23
// T1: release input mutex
// T2: grab input mutex
// T2: send request_no 24
// T2: release input mutex
// T2: grab output mutex
// T2: n_processed_responses + output.pending_responses.len() <= request_no
// 23 0 24
// T2: enters poll loop that reads stdout
// T2: put response for 23 into pending_responses
// T2: put response for 24 into pending_resposnes
// pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
// T2: takes its response_24
// pending_responses now looks like this: Front Some(response_23) None Back
// T2: does the while loop below
// pending_responses now looks like this: Front Some(response_23) None Back
// T2: releases output mutex
// T1: grabs output mutex
// T1: n_processed_responses + output.pending_responses.len() > request_no
// 23 2 23
// T1: skips poll loop that reads stdout
// T1: takes its response_23
// pending_responses now looks like this: Front None None Back
// T2: does the while loop below
// pending_responses now looks like this: Front Back
// n_processed_responses now has value 25
let res = output.pending_responses[request_no - n_processed_responses]
.take()
.expect("we own this request_no, nobody else is supposed to take it");
while let Some(front) = output.pending_responses.front() {
if front.is_none() {
output.pending_responses.pop_front();
output.n_processed_responses += 1;
} else {
break;
}
}
Ok(res)
}
#[cfg(feature = "testing")]
fn record_and_log(&self, writebuf: &[u8]) {
use std::sync::atomic::Ordering;
let millis = std::time::SystemTime::now()
.duration_since(std::time::SystemTime::UNIX_EPOCH)
.unwrap()
.as_millis();
let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
// these files will be collected to an allure report
let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
let res = std::fs::OpenOptions::new()
.write(true)
.create_new(true)
.read(true)
.open(path)
.and_then(|mut f| f.write_all(writebuf));
// trip up allowed_errors
if let Err(e) = res {
tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
} else {
tracing::error!(filename, "erroring walredo input saved");
}
}
#[cfg(not(feature = "testing"))]
fn record_and_log(&self, _: &[u8]) {}
}
impl Drop for WalRedoProcess {
fn drop(&mut self) {
self.child
.take()
.expect("we only do this once")
.kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
// no way to wait for stderr_logger_task from Drop because that is async only
}
}

View File

@@ -49,7 +49,7 @@ char *neon_auth_token;
int readahead_buffer_size = 128;
int flush_every_n_requests = 8;
int neon_protocol_version = 1;
int neon_protocol_version = 2;
static int n_reconnect_attempts = 0;
static int max_reconnect_attempts = 60;
@@ -860,7 +860,7 @@ pg_init_libpagestore(void)
"Version of compute<->page server protocol",
NULL,
&neon_protocol_version,
1, /* default to old protocol for now */
2, /* use protocol version 2 */
1, /* min */
2, /* max */
PGC_SU_BACKEND,

View File

@@ -237,50 +237,18 @@ extern void neon_zeroextend(SMgrRelation reln, ForkNumber forknum,
extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum);
/*
* LSN values associated with each request to the pageserver
*/
typedef struct
{
/*
* 'request_lsn' is the main value that determines which page version to
* fetch.
*/
XLogRecPtr request_lsn;
/*
* A hint to the pageserver that the requested page hasn't been modified
* between this LSN and 'request_lsn'. That allows the pageserver to
* return the page faster, without waiting for 'request_lsn' to arrive in
* the pageserver, as long as 'not_modified_since' has arrived.
*/
XLogRecPtr not_modified_since;
/*
* 'effective_request_lsn' is not included in the request that's sent to
* the pageserver, but is used to keep track of the latest LSN of when the
* request was made. In a standby server, this is always the same as the
* 'request_lsn', but in the primary we use UINT64_MAX as the
* 'request_lsn' to request the latest page version, so we need this
* separate field to remember that latest LSN was when the request was
* made. It's needed to manage prefetch request, to verify if the response
* to a prefetched request is still valid.
*/
XLogRecPtr effective_request_lsn;
} neon_request_lsns;
#if PG_MAJORVERSION_NUM < 16
extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
char *buffer);
extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
neon_request_lsns request_lsns, char *buffer);
XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer);
extern void neon_write(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
#else
extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
void *buffer);
extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
neon_request_lsns request_lsns, void *buffer);
XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer);
extern void neon_write(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, const void *buffer, bool skipFsync);
#endif

View File

@@ -168,7 +168,8 @@ typedef enum PrefetchStatus
typedef struct PrefetchRequest
{
BufferTag buftag; /* must be first entry in the struct */
neon_request_lsns request_lsns;
XLogRecPtr request_lsn;
XLogRecPtr not_modified_since;
NeonResponse *response; /* may be null */
PrefetchStatus status;
shardno_t shard_no;
@@ -270,15 +271,16 @@ static PrefetchState *MyPState;
static bool compact_prefetch_buffers(void);
static void consume_prefetch_responses(void);
static uint64 prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns);
static uint64 prefetch_register_buffer(BufferTag tag, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since);
static bool prefetch_read(PrefetchRequest *slot);
static void prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns);
static void prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since);
static bool prefetch_wait_for(uint64 ring_index);
static void prefetch_cleanup_trailing_unused(void);
static inline void prefetch_set_unused(uint64 ring_index);
static neon_request_lsns neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno);
static bool neon_prefetch_response_usable(neon_request_lsns request_lsns,
static void neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
XLogRecPtr *request_lsn, XLogRecPtr *not_modified_since);
static bool neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_since,
PrefetchRequest *slot);
static bool
@@ -336,7 +338,8 @@ compact_prefetch_buffers(void)
target_slot->shard_no = source_slot->shard_no;
target_slot->status = source_slot->status;
target_slot->response = source_slot->response;
target_slot->request_lsns = source_slot->request_lsns;
target_slot->request_lsn = source_slot->request_lsn;
target_slot->not_modified_since = source_slot->not_modified_since;
target_slot->my_ring_index = empty_ring_index;
prfh_delete(MyPState->prf_hash, source_slot);
@@ -355,9 +358,8 @@ compact_prefetch_buffers(void)
};
source_slot->response = NULL;
source_slot->my_ring_index = 0;
source_slot->request_lsns = (neon_request_lsns) {
InvalidXLogRecPtr, InvalidXLogRecPtr, InvalidXLogRecPtr
};
source_slot->request_lsn = InvalidXLogRecPtr;
source_slot->not_modified_since = InvalidXLogRecPtr;
/* update bookkeeping */
n_moved++;
@@ -687,7 +689,7 @@ prefetch_set_unused(uint64 ring_index)
* prefetch_wait_for().
*/
static void
prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns)
prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since)
{
bool found;
NeonGetPageRequest request = {
@@ -698,14 +700,23 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
.blkno = slot->buftag.blockNum,
};
if (force_request_lsns)
slot->request_lsns = *force_request_lsns;
Assert(((force_request_lsn != NULL) == (force_not_modified_since != NULL)));
if (force_request_lsn)
{
request.req.lsn = *force_request_lsn;
request.req.not_modified_since = *force_not_modified_since;
}
else
slot->request_lsns = neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag),
slot->buftag.forkNum,
slot->buftag.blockNum);
request.req.lsn = slot->request_lsns.request_lsn;
request.req.not_modified_since = slot->request_lsns.not_modified_since;
{
neon_get_request_lsn(BufTagGetNRelFileInfo(slot->buftag),
slot->buftag.forkNum,
slot->buftag.blockNum,
&request.req.lsn,
&request.req.not_modified_since);
}
slot->request_lsn = request.req.lsn;
slot->not_modified_since = request.req.not_modified_since;
Assert(slot->response == NULL);
Assert(slot->my_ring_index == MyPState->ring_unused);
@@ -731,22 +742,25 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
*
* Register that we may want the contents of BufferTag in the near future.
*
* If force_request_lsns is not NULL, those values are sent to the
* pageserver. If NULL, we utilize the lastWrittenLsn -infrastructure
* to calculate the LSNs to send.
* If force_request_lsn and force_not_modified_since are not NULL, those
* values are sent to the pageserver. If they are NULL, we utilize the
* lastWrittenLsn -infrastructure to fill them in.
*
* NOTE: this function may indirectly update MyPState->pfs_hash; which
* invalidates any active pointers into the hash table.
*/
static uint64
prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns)
prefetch_register_buffer(BufferTag tag, XLogRecPtr *force_request_lsn,
XLogRecPtr *force_not_modified_since)
{
uint64 ring_index;
PrefetchRequest req;
PrefetchRequest *slot;
PrfHashEntry *entry;
Assert(((force_request_lsn != NULL) == (force_not_modified_since != NULL)));
/* use an intermediate PrefetchRequest struct to ensure correct alignment */
req.buftag = tag;
Retry:
@@ -767,9 +781,10 @@ Retry:
* If the caller specified a request LSN to use, only accept prefetch
* responses that satisfy that request.
*/
if (force_request_lsns)
if (force_request_lsn)
{
if (!neon_prefetch_response_usable(*force_request_lsns, slot))
if (!neon_prefetch_response_usable(*force_request_lsn,
*force_not_modified_since, slot))
{
/* Wait for the old request to finish and discard it */
if (!prefetch_wait_for(ring_index))
@@ -871,7 +886,7 @@ Retry:
slot->shard_no = get_shard_number(&tag);
slot->my_ring_index = ring_index;
prefetch_do_request(slot, force_request_lsns);
prefetch_do_request(slot, force_request_lsn, force_not_modified_since);
Assert(slot->status == PRFS_REQUESTED);
Assert(MyPState->ring_last <= ring_index &&
ring_index < MyPState->ring_unused);
@@ -1514,11 +1529,11 @@ nm_adjust_lsn(XLogRecPtr lsn)
/*
* Return LSN for requesting pages and number of blocks from page server
*/
static neon_request_lsns
neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno)
static void
neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
XLogRecPtr *request_lsn, XLogRecPtr *not_modified_since)
{
XLogRecPtr last_written_lsn;
neon_request_lsns result;
last_written_lsn = GetLastWrittenLSN(rinfo, forknum, blkno);
last_written_lsn = nm_adjust_lsn(last_written_lsn);
@@ -1527,13 +1542,12 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno)
if (RecoveryInProgress())
{
/* Request the page at the last replayed LSN. */
result.request_lsn = GetXLogReplayRecPtr(NULL);
result.not_modified_since = last_written_lsn;
result.effective_request_lsn = result.request_lsn;
Assert(last_written_lsn <= result.request_lsn);
*request_lsn = GetXLogReplayRecPtr(NULL);
*not_modified_since = last_written_lsn;
Assert(last_written_lsn <= *request_lsn);
neon_log(DEBUG1, "neon_get_request_lsns request lsn %X/%X, not_modified_since %X/%X",
LSN_FORMAT_ARGS(result.request_lsn), LSN_FORMAT_ARGS(result.not_modified_since));
neon_log(DEBUG1, "neon_get_request_lsn request lsn %X/%X, not_modified_since %X/%X",
LSN_FORMAT_ARGS(*request_lsn), LSN_FORMAT_ARGS(*not_modified_since));
}
else
{
@@ -1545,7 +1559,7 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno)
* must still in the buffer cache, so our request cannot concern
* those.
*/
neon_log(DEBUG1, "neon_get_request_lsns GetLastWrittenLSN lsn %X/%X",
neon_log(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ",
LSN_FORMAT_ARGS(last_written_lsn));
/*
@@ -1571,33 +1585,16 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno)
}
/*
* Request the very latest version of the page. In principle we
* want to read the page at the current insert LSN, and we could
* use that value in the request. However, there's a corner case
* with pageserver's garbage collection. If the GC horizon is
* set to a very small value, it's possible that by the time
* that the pageserver processes our request, the GC horizon has
* already moved past the LSN we calculate here. Standby servers
* always have that problem as the can always lag behind the
* primary, but for the primary we can avoid it by always
* requesting the latest page, by setting request LSN to
* UINT64_MAX.
*
* Remember the current LSN, however, so that we can later
* correctly determine if the response to the request is still
* valid. The most up-to-date LSN we could use for that purpose
* would be the current insert LSN, but to avoid the overhead of
* looking it up, use 'flushlsn' instead. This relies on the
* assumption that if the page was modified since the last WAL
* flush, it should still be in the buffer cache, and we
* wouldn't be requesting it.
* Request the latest version of the page. The most up-to-date request
* LSN we could use would be the current insert LSN, but to avoid the
* overhead of looking it up, use 'flushlsn' instead. This relies on
* the assumption that if the page was modified since the last WAL
* flush, it should still be in the buffer cache, and we wouldn't be
* requesting it.
*/
result.request_lsn = UINT64_MAX;
result.not_modified_since = last_written_lsn;
result.effective_request_lsn = flushlsn;
*request_lsn = flushlsn;
*not_modified_since = last_written_lsn;
}
return result;
}
/*
@@ -1607,16 +1604,12 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno)
* satisfy a page read now.
*/
static bool
neon_prefetch_response_usable(neon_request_lsns request_lsns,
neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_since,
PrefetchRequest *slot)
{
/* sanity check the LSN's on the old and the new request */
Assert(request_lsns.request_lsn >= request_lsns.not_modified_since);
Assert(request_lsns.effective_request_lsn >= request_lsns.not_modified_since);
Assert(request_lsns.effective_request_lsn <= request_lsns.request_lsn);
Assert(slot->request_lsns.request_lsn >= slot->request_lsns.not_modified_since);
Assert(slot->request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since);
Assert(slot->request_lsns.effective_request_lsn <= slot->request_lsns.request_lsn);
Assert(request_lsn >= not_modified_since);
Assert(slot->request_lsn >= slot->not_modified_since);
Assert(slot->status != PRFS_UNUSED);
/*
@@ -1634,40 +1627,26 @@ neon_prefetch_response_usable(neon_request_lsns request_lsns,
* calculate LSNs "out of order" with each other, but the prefetch queue
* is backend-private at the moment.)
*/
if (request_lsns.effective_request_lsn < slot->request_lsns.effective_request_lsn ||
request_lsns.not_modified_since < slot->request_lsns.not_modified_since)
if (request_lsn < slot->request_lsn || not_modified_since < slot->not_modified_since)
{
ereport(LOG,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "request with unexpected LSN after prefetch"),
errdetail("Request %X/%X not_modified_since %X/%X, prefetch %X/%X not_modified_since %X/%X)",
LSN_FORMAT_ARGS(request_lsns.effective_request_lsn),
LSN_FORMAT_ARGS(request_lsns.not_modified_since),
LSN_FORMAT_ARGS(slot->request_lsns.effective_request_lsn),
LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since))));
LSN_FORMAT_ARGS(request_lsn), LSN_FORMAT_ARGS(not_modified_since),
LSN_FORMAT_ARGS(slot->request_lsn), LSN_FORMAT_ARGS(slot->not_modified_since))));
return false;
}
/*---
* Each request to the pageserver has three LSN values associated with it:
* `not_modified_since`, `request_lsn`, and 'effective_request_lsn'.
* `not_modified_since` and `request_lsn` are sent to the pageserver, but
* in the primary node, we always use UINT64_MAX as the `request_lsn`, so
* we remember `effective_request_lsn` separately. In a primary,
* `effective_request_lsn` is the last flush WAL position when the request
* was sent to the pageserver. That's logically the LSN that we are
* requesting the page at, but we send UINT64_MAX to the pageserver so
* that if the GC horizon advances past that position, we still get a
* valid response instead of an error.
*
* To determine whether a response to a GetPage request issued earlier is
* still valid to satisfy a new page read, we look at the
* (not_modified_since, effective_request_lsn] range of the request. It is
* effectively a claim that the page has not been modified between those
* LSNs. If the range of the old request in the queue overlaps with the
* new request, we know that the page hasn't been modified in the union of
* the ranges. We can use the response to old request to satisfy the new
* request in that case. For example:
* Each request to the pageserver carries two LSN values:
* `not_modified_since` and `request_lsn`. The (not_modified_since,
* request_lsn] range of each request is effectively a claim that the page
* has not been modified between those LSNs. If the range of the old
* request in the queue overlaps with the new request, we know that the
* page hasn't been modified in the union of the ranges. We can use the
* response to old request to satisfy the new request in that case. For
* example:
*
* 100 500
* Old request: +--------+
@@ -1696,9 +1675,9 @@ neon_prefetch_response_usable(neon_request_lsns request_lsns,
*/
/* this follows from the checks above */
Assert(request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since);
Assert(request_lsn >= slot->not_modified_since);
return request_lsns.not_modified_since <= slot->request_lsns.effective_request_lsn;
return not_modified_since <= slot->request_lsn;
}
/*
@@ -1710,7 +1689,8 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
bool exists;
NeonResponse *resp;
BlockNumber n_blocks;
neon_request_lsns request_lsns;
XLogRecPtr request_lsn;
XLogRecPtr not_modified_since;
switch (reln->smgr_relpersistence)
{
@@ -1765,15 +1745,15 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
return false;
}
request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO);
neon_get_request_lsn(InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO,
&request_lsn, &not_modified_since);
{
NeonExistsRequest request = {
.req.tag = T_NeonExistsRequest,
.req.lsn = request_lsns.request_lsn,
.req.not_modified_since = request_lsns.not_modified_since,
.req.lsn = request_lsn,
.req.not_modified_since = not_modified_since,
.rinfo = InfoFromSMgrRel(reln),
.forknum = forkNum
};
.forknum = forkNum};
resp = page_server_request(&request);
}
@@ -1790,7 +1770,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
errmsg(NEON_TAG "could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forkNum,
LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
(uint32) (request_lsn >> 32), (uint32) request_lsn),
errdetail("page server returned error: %s",
((NeonErrorResponse *) resp)->message)));
break;
@@ -2155,7 +2135,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln));
ring_index = prefetch_register_buffer(tag, NULL);
ring_index = prefetch_register_buffer(tag, NULL, NULL);
Assert(ring_index < MyPState->ring_unused &&
MyPState->ring_last <= ring_index);
@@ -2208,10 +2188,10 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
void
#if PG_MAJORVERSION_NUM < 16
neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
neon_request_lsns request_lsns, char *buffer)
XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer)
#else
neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
neon_request_lsns request_lsns, void *buffer)
XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer)
#endif
{
NeonResponse *resp;
@@ -2243,7 +2223,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
* value of the LwLsn cache when the entry is not found.
*/
if (RecoveryInProgress() && !(MyBackendType == B_STARTUP))
XLogWaitForReplayOf(request_lsns.request_lsn);
XLogWaitForReplayOf(request_lsn);
/*
* Try to find prefetched page in the list of received pages.
@@ -2254,7 +2234,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
if (entry != NULL)
{
slot = entry->slot;
if (neon_prefetch_response_usable(request_lsns, slot))
if (neon_prefetch_response_usable(request_lsn, not_modified_since, slot))
{
ring_index = slot->my_ring_index;
pgBufferUsage.prefetch.hits += 1;
@@ -2288,7 +2268,8 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
{
pgBufferUsage.prefetch.misses += 1;
ring_index = prefetch_register_buffer(buftag, &request_lsns);
ring_index = prefetch_register_buffer(buftag, &request_lsn,
&not_modified_since);
slot = GetPrfSlot(ring_index);
}
else
@@ -2329,7 +2310,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
slot->shard_no, blkno,
RelFileInfoFmt(rinfo),
forkNum,
LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
(uint32) (request_lsn >> 32), (uint32) request_lsn),
errdetail("page server returned error: %s",
((NeonErrorResponse *) resp)->message)));
break;
@@ -2352,7 +2333,8 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer
neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer)
#endif
{
neon_request_lsns request_lsns;
XLogRecPtr request_lsn;
XLogRecPtr not_modified_since;
switch (reln->smgr_relpersistence)
{
@@ -2377,8 +2359,9 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
return;
}
request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno);
neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer);
neon_get_request_lsn(InfoFromSMgrRel(reln), forkNum, blkno,
&request_lsn, &not_modified_since);
neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsn, not_modified_since, buffer);
#ifdef DEBUG_COMPARE_LOCAL
if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
@@ -2547,7 +2530,8 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
{
NeonResponse *resp;
BlockNumber n_blocks;
neon_request_lsns request_lsns;
XLogRecPtr request_lsn;
XLogRecPtr not_modified_since;
switch (reln->smgr_relpersistence)
{
@@ -2574,12 +2558,13 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
return n_blocks;
}
request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO);
neon_get_request_lsn(InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO,
&request_lsn, &not_modified_since);
{
NeonNblocksRequest request = {
.req.tag = T_NeonNblocksRequest,
.req.lsn = request_lsns.request_lsn,
.req.not_modified_since = request_lsns.not_modified_since,
.req.lsn = request_lsn,
.req.not_modified_since = not_modified_since,
.rinfo = InfoFromSMgrRel(reln),
.forknum = forknum,
};
@@ -2599,7 +2584,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
errmsg(NEON_TAG "could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum,
LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
(uint32) (request_lsn >> 32), (uint32) request_lsn),
errdetail("page server returned error: %s",
((NeonErrorResponse *) resp)->message)));
break;
@@ -2610,10 +2595,10 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum,
LSN_FORMAT_ARGS(request_lsns.effective_request_lsn),
n_blocks);
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum,
(uint32) (request_lsn >> 32), (uint32) request_lsn,
n_blocks);
pfree(resp);
return n_blocks;
@@ -2627,15 +2612,17 @@ neon_dbsize(Oid dbNode)
{
NeonResponse *resp;
int64 db_size;
neon_request_lsns request_lsns;
XLogRecPtr request_lsn,
not_modified_since;
NRelFileInfo dummy_node = {0};
request_lsns = neon_get_request_lsns(dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO);
neon_get_request_lsn(dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO,
&request_lsn, &not_modified_since);
{
NeonDbSizeRequest request = {
.req.tag = T_NeonDbSizeRequest,
.req.lsn = request_lsns.request_lsn,
.req.not_modified_since = request_lsns.not_modified_since,
.req.lsn = request_lsn,
.req.not_modified_since = not_modified_since,
.dbNode = dbNode,
};
@@ -2652,7 +2639,8 @@ neon_dbsize(Oid dbNode)
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "could not read db size of db %u from page server at lsn %X/%08X",
dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
dbNode,
(uint32) (request_lsn >> 32), (uint32) request_lsn),
errdetail("page server returned error: %s",
((NeonErrorResponse *) resp)->message)));
break;
@@ -2662,7 +2650,9 @@ neon_dbsize(Oid dbNode)
}
neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size);
dbNode,
(uint32) (request_lsn >> 32), (uint32) request_lsn,
db_size);
pfree(resp);
return db_size;
@@ -2907,10 +2897,6 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
XLogRecPtr request_lsn,
not_modified_since;
/*
* Compute a request LSN to use, similar to neon_get_request_lsns() but the
* logic is a bit simpler.
*/
if (RecoveryInProgress())
{
request_lsn = GetXLogReplayRecPtr(NULL);
@@ -2922,10 +2908,10 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
*/
request_lsn = GetRedoStartLsn();
}
request_lsn = nm_adjust_lsn(request_lsn);
}
else
request_lsn = UINT64_MAX;
request_lsn = GetXLogInsertRecPtr();
request_lsn = nm_adjust_lsn(request_lsn);
/*
* GetRedoStartLsn() returns LSN of basebackup. We know that the SLRU

Some files were not shown because too many files have changed in this diff Show More