Compare commits

..

14 Commits

Author SHA1 Message Date
Arpad Müller
e97e86eb43 poetry lock 2024-05-06 12:58:34 +02:00
Arpad Müller
c9370d48de Merge remote-tracking branch 'origin/main' into arpad/less_async_trait 2024-05-06 12:54:51 +02:00
Arpad Müller
6f714c308b Adjust timeouts 2024-04-08 17:02:40 +02:00
Arpad Müller
2e6afaa642 Merge remote-tracking branch 'origin/main' into arpad/less_async_trait 2024-04-08 16:58:47 +02:00
Alexander Bayandin
8f0a0440ba CI: reduce session timeout to 30 minutes 2024-04-08 12:29:24 +01:00
Alexander Bayandin
987dc01ed7 CI: set fix timeout value in seconds for regression tests 2024-04-05 15:05:07 +01:00
Alexander Bayandin
719e4ad580 Bump pytest-timeout from 2.1.0 to 2.3.1 2024-04-05 14:58:58 +01:00
Alexander Bayandin
e61b2a08b3 CI: set pytest timeout for regression test suite 2024-04-05 12:54:47 +01:00
Arpad Müller
cc89b46ae5 Merge branch 'main' into arpad/less_async_trait 2024-04-04 16:30:09 +02:00
Arpad Müller
d5cbdd2e90 Remove it here as well 2024-04-04 12:36:28 +02:00
Arpad Müller
6ad9c3560e Merge branch 'main' into arpad/less_async_trait 2024-04-04 12:27:38 +02:00
Arpad Müller
9dc3b09e57 Remove async-trait from Cargo.toml of crates it became unused in 2024-04-03 23:25:06 +02:00
Arpad Müller
fe762e35d8 Remove async_trait from Handler trait as well 2024-04-03 23:21:16 +02:00
Arpad Müller
0c4988a92c Remove async_trait from CompactionDeltaLayer 2024-04-03 23:21:16 +02:00
148 changed files with 3421 additions and 6769 deletions

View File

@@ -2,7 +2,6 @@
# This is only present for local builds, as it will be overridden
# by the RUSTDOCFLAGS env var in CI.
rustdocflags = ["-Arustdoc::private_intra_doc_links"]
rustflags = ["--cfg=tokio_unstable"]
[alias]
build_testing = ["build", "--features", "testing"]

View File

@@ -1,2 +1,2 @@
[profile.default]
slow-timeout = { period = "60s", terminate-after = 3 }
slow-timeout = { period = "20s", terminate-after = 3 }

View File

@@ -1,9 +1,11 @@
self-hosted-runner:
labels:
- arm64
- dev
- gen3
- large
- large-arm64
# Remove `macos-14` from the list after https://github.com/rhysd/actionlint/pull/392 is merged.
- macos-14
- small
- us-east-2
config-variables:

View File

@@ -48,6 +48,10 @@ inputs:
description: 'benchmark durations JSON'
required: false
default: '{}'
session_timeout:
description: 'Session timeout for the test suite'
required: false
default: ''
runs:
using: "composite"
@@ -107,6 +111,7 @@ runs:
ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
RERUN_FLAKY: ${{ inputs.rerun_flaky }}
PG_VERSION: ${{ inputs.pg_version }}
SESSION_TIMEOUT: ${{ inputs.session_timeout }}
shell: bash -euxo pipefail {0}
run: |
# PLATFORM will be embedded in the perf test report
@@ -168,6 +173,10 @@ runs:
EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS"
fi
if [ -n "${SESSION_TIMEOUT}" ]; then
EXTRA_PARAMS="--session-timeout ${SESSION_TIMEOUT} ${EXTRA_PARAMS}"
fi
if [[ "${{ inputs.build_type }}" == "debug" ]]; then
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
elif [[ "${{ inputs.build_type }}" == "release" ]]; then

View File

@@ -39,7 +39,7 @@ jobs:
matrix:
arch: [ x64, arm64 ]
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
runs-on: ${{ fromJson(format('["self-hosted", "dev", "{0}"]', matrix.arch)) }}
env:
IMAGE_TAG: ${{ inputs.image-tag }}

View File

@@ -236,6 +236,27 @@ jobs:
submodules: true
fetch-depth: 1
- name: Check Postgres submodules revision
shell: bash -euo pipefail {0}
run: |
# This is a temporary solution to ensure that the Postgres submodules revision is correct (i.e. the updated intentionally).
# Eventually it will be replaced by a regression test https://github.com/neondatabase/neon/pull/4603
FAILED=false
for postgres in postgres-v14 postgres-v15 postgres-v16; do
expected=$(cat vendor/revisions.json | jq --raw-output '."'"${postgres}"'"')
actual=$(git rev-parse "HEAD:vendor/${postgres}")
if [ "${expected}" != "${actual}" ]; then
echo >&2 "Expected ${postgres} rev to be at '${expected}', but it is at '${actual}'"
FAILED=true
fi
done
if [ "${FAILED}" = "true" ]; then
echo >&2 "Please update vendor/revisions.json if these changes are intentional"
exit 1
fi
- name: Set pg 14 revision for caching
id: pg_v14_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
@@ -341,9 +362,6 @@ jobs:
env:
NEXTEST_RETRIES: 3
run: |
#nextest does not yet support running doctests
cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
for io_engine in std-fs tokio-epoll-uring ; do
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
done
@@ -443,7 +461,8 @@ jobs:
- name: Pytest regression tests
uses: ./.github/actions/run-python-test-set
timeout-minutes: 60
# Hard timeout to prevent hanging tests, we also have set softer pytest timeout (set via `session_timeout`) which is shorter
timeout-minutes: 110
with:
build_type: ${{ matrix.build_type }}
test_selection: regress
@@ -453,6 +472,8 @@ jobs:
real_s3_region: eu-central-1
rerun_flaky: true
pg_version: ${{ matrix.pg_version }}
# Set pytest session timeout to 25 minutes
session_timeout: '1500'
env:
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty

View File

@@ -136,7 +136,7 @@ jobs:
check-linux-arm-build:
needs: [ check-permissions, build-build-tools-image ]
timeout-minutes: 90
runs-on: [ self-hosted, large-arm64 ]
runs-on: [ self-hosted, dev, arm64 ]
env:
# Use release build only, to have less debug info around
@@ -232,20 +232,20 @@ jobs:
- name: Run cargo build
run: |
mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests -j$(nproc)
mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests
- name: Run cargo test
env:
NEXTEST_RETRIES: 3
run: |
cargo nextest run $CARGO_FEATURES -j$(nproc)
cargo nextest run $CARGO_FEATURES
# Run separate tests for real S3
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
export REMOTE_STORAGE_S3_REGION=eu-central-1
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
cargo nextest run --package remote_storage --test test_real_s3 -j$(nproc)
cargo nextest run --package remote_storage --test test_real_s3
# Run separate tests for real Azure Blob Storage
# XXX: replace region with `eu-central-1`-like region
@@ -255,12 +255,12 @@ jobs:
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
cargo nextest run --package remote_storage --test test_real_azure -j$(nproc)
cargo nextest run --package remote_storage --test test_real_azure
check-codestyle-rust-arm:
needs: [ check-permissions, build-build-tools-image ]
timeout-minutes: 90
runs-on: [ self-hosted, large-arm64 ]
runs-on: [ self-hosted, dev, arm64 ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
@@ -269,11 +269,6 @@ jobs:
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
strategy:
fail-fast: false
matrix:
build_type: [ debug, release ]
steps:
- name: Fix git ownership
run: |
@@ -310,35 +305,31 @@ jobs:
exit 1
fi
echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
- name: Run cargo clippy (debug)
if: matrix.build_type == 'debug'
run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
- name: Run cargo clippy (release)
if: matrix.build_type == 'release'
run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
- name: Check documentation generation
if: matrix.build_type == 'release'
run: cargo doc --workspace --no-deps --document-private-items -j$(nproc)
run: cargo doc --workspace --no-deps --document-private-items
env:
RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
# Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
- name: Check formatting
if: ${{ !cancelled() && matrix.build_type == 'release' }}
if: ${{ !cancelled() }}
run: cargo fmt --all -- --check
# https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
- name: Check rust dependencies
if: ${{ !cancelled() && matrix.build_type == 'release' }}
if: ${{ !cancelled() }}
run: |
cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date
cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack
# https://github.com/EmbarkStudios/cargo-deny
- name: Check rust licenses/bans/advisories/sources
if: ${{ !cancelled() && matrix.build_type == 'release' }}
if: ${{ !cancelled() }}
run: cargo deny check
gather-rust-build-stats:
@@ -347,7 +338,7 @@ jobs:
contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
github.ref_name == 'main'
runs-on: [ self-hosted, large ]
runs-on: [ self-hosted, gen3, large ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
@@ -378,7 +369,7 @@ jobs:
run: make walproposer-lib -j$(nproc)
- name: Produce the build stats
run: cargo build --all --release --timings -j$(nproc)
run: cargo build --all --release --timings
- name: Upload the build stats
id: upload-stats

104
Cargo.lock generated
View File

@@ -25,9 +25,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
[[package]]
name = "ahash"
version = "0.8.11"
version = "0.8.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011"
checksum = "d713b3834d76b85304d4d525563c1276e2e30dc97cc67bfb4585a4a29fc2c89f"
dependencies = [
"cfg-if",
"const-random",
@@ -284,9 +284,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
[[package]]
name = "aws-config"
version = "1.3.0"
version = "1.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baaa0be6ee7d90b775ae6ccb6d2ba182b91219ec2001f92338773a094246af1d"
checksum = "8b30c39ebe61f75d1b3785362b1586b41991873c9ab3e317a9181c246fb71d82"
dependencies = [
"aws-credential-types",
"aws-runtime",
@@ -309,15 +309,14 @@ dependencies = [
"time",
"tokio",
"tracing",
"url",
"zeroize",
]
[[package]]
name = "aws-credential-types"
version = "1.2.0"
version = "1.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e16838e6c9e12125face1c1eff1343c75e3ff540de98ff7ebd61874a89bcfeb9"
checksum = "fa8587ae17c8e967e4b05a62d495be2fb7701bec52a97f7acfe8a29f938384c8"
dependencies = [
"aws-smithy-async",
"aws-smithy-runtime-api",
@@ -327,9 +326,9 @@ dependencies = [
[[package]]
name = "aws-runtime"
version = "1.2.1"
version = "1.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "785da4a15e7b166b505fd577e4560c7a7cd8fbdf842eb1336cbcbf8944ce56f1"
checksum = "b13dc54b4b49f8288532334bba8f87386a40571c47c37b1304979b556dc613c8"
dependencies = [
"aws-credential-types",
"aws-sigv4",
@@ -374,11 +373,10 @@ dependencies = [
[[package]]
name = "aws-sdk-s3"
version = "1.26.0"
version = "1.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7bc5ce518d4b8d16e0408de7bdf1b3097cec61a7daa979750a208f8d9934386d"
checksum = "951f7730f51a2155c711c85c79f337fbc02a577fa99d2a0a8059acfce5392113"
dependencies = [
"ahash",
"aws-credential-types",
"aws-runtime",
"aws-sigv4",
@@ -393,25 +391,20 @@ dependencies = [
"aws-smithy-xml",
"aws-types",
"bytes",
"fastrand 2.0.0",
"hex",
"hmac",
"http 0.2.9",
"http-body 0.4.5",
"lru",
"once_cell",
"percent-encoding",
"regex-lite",
"sha2",
"tracing",
"url",
]
[[package]]
name = "aws-sdk-sso"
version = "1.22.0"
version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca3d6c4cba4e009391b72b0fcf12aff04ea3c9c3aa2ecaafa330326a8bd7e601"
checksum = "f486420a66caad72635bc2ce0ff6581646e0d32df02aa39dc983bfe794955a5b"
dependencies = [
"aws-credential-types",
"aws-runtime",
@@ -431,9 +424,9 @@ dependencies = [
[[package]]
name = "aws-sdk-ssooidc"
version = "1.22.0"
version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73400dc239d14f63d932f4ca7b55af5e9ef1f857f7d70655249ccc287adb2570"
checksum = "39ddccf01d82fce9b4a15c8ae8608211ee7db8ed13a70b514bbfe41df3d24841"
dependencies = [
"aws-credential-types",
"aws-runtime",
@@ -453,9 +446,9 @@ dependencies = [
[[package]]
name = "aws-sdk-sts"
version = "1.22.0"
version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "10f8858308af76fba3e5ffcf1bb56af5471574d2bdfaf0159470c25bc2f760e5"
checksum = "1a591f8c7e6a621a501b2b5d2e88e1697fcb6274264523a6ad4d5959889a41ce"
dependencies = [
"aws-credential-types",
"aws-runtime",
@@ -476,9 +469,9 @@ dependencies = [
[[package]]
name = "aws-sigv4"
version = "1.2.1"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "58b56f1cbe6fd4d0c2573df72868f20ab1c125ca9c9dbce17927a463433a2e57"
checksum = "11d6f29688a4be9895c0ba8bef861ad0c0dac5c15e9618b9b7a6c233990fc263"
dependencies = [
"aws-credential-types",
"aws-smithy-eventstream",
@@ -505,9 +498,9 @@ dependencies = [
[[package]]
name = "aws-smithy-async"
version = "1.2.1"
version = "1.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62220bc6e97f946ddd51b5f1361f78996e704677afc518a4ff66b7a72ea1378c"
checksum = "d26ea8fa03025b2face2b3038a63525a10891e3d8829901d502e5384a0d8cd46"
dependencies = [
"futures-util",
"pin-project-lite",
@@ -516,9 +509,9 @@ dependencies = [
[[package]]
name = "aws-smithy-checksums"
version = "0.60.7"
version = "0.60.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83fa43bc04a6b2441968faeab56e68da3812f978a670a5db32accbdcafddd12f"
checksum = "be2acd1b9c6ae5859999250ed5a62423aedc5cf69045b844432de15fa2f31f2b"
dependencies = [
"aws-smithy-http",
"aws-smithy-types",
@@ -548,9 +541,9 @@ dependencies = [
[[package]]
name = "aws-smithy-http"
version = "0.60.8"
version = "0.60.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a7de001a1b9a25601016d8057ea16e31a45fdca3751304c8edf4ad72e706c08"
checksum = "3f10fa66956f01540051b0aa7ad54574640f748f9839e843442d99b970d3aff9"
dependencies = [
"aws-smithy-eventstream",
"aws-smithy-runtime-api",
@@ -588,9 +581,9 @@ dependencies = [
[[package]]
name = "aws-smithy-runtime"
version = "1.5.0"
version = "1.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c9ac79e9f3a4d576f3cd4a470a0275b138d9e7b11b1cd514a6858ae0a79dd5bb"
checksum = "ec81002d883e5a7fd2bb063d6fb51c4999eb55d404f4fff3dd878bf4733b9f01"
dependencies = [
"aws-smithy-async",
"aws-smithy-http",
@@ -601,7 +594,6 @@ dependencies = [
"h2 0.3.26",
"http 0.2.9",
"http-body 0.4.5",
"http-body 1.0.0",
"hyper 0.14.26",
"hyper-rustls 0.24.0",
"once_cell",
@@ -614,9 +606,9 @@ dependencies = [
[[package]]
name = "aws-smithy-runtime-api"
version = "1.6.0"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04ec42c2f5c0e7796a2848dde4d9f3bf8ce12ccbb3d5aa40c52fa0cdd61a1c47"
checksum = "9acb931e0adaf5132de878f1398d83f8677f90ba70f01f65ff87f6d7244be1c5"
dependencies = [
"aws-smithy-async",
"aws-smithy-types",
@@ -631,19 +623,16 @@ dependencies = [
[[package]]
name = "aws-smithy-types"
version = "1.1.9"
version = "1.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf98d97bba6ddaba180f1b1147e202d8fe04940403a95a3f826c790f931bbd1"
checksum = "abe14dceea1e70101d38fbf2a99e6a34159477c0fb95e68e05c66bd7ae4c3729"
dependencies = [
"base64-simd",
"bytes",
"bytes-utils",
"futures-core",
"http 0.2.9",
"http 1.1.0",
"http-body 0.4.5",
"http-body 1.0.0",
"http-body-util",
"itoa",
"num-integer",
"pin-project-lite",
@@ -657,18 +646,18 @@ dependencies = [
[[package]]
name = "aws-smithy-xml"
version = "0.60.8"
version = "0.60.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d123fbc2a4adc3c301652ba8e149bf4bc1d1725affb9784eb20c953ace06bf55"
checksum = "872c68cf019c0e4afc5de7753c4f7288ce4b71663212771bf5e4542eb9346ca9"
dependencies = [
"xmlparser",
]
[[package]]
name = "aws-types"
version = "1.2.0"
version = "1.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a43b56df2c529fe44cb4d92bd64d0479883fb9608ff62daede4df5405381814"
checksum = "0dbf2f3da841a8930f159163175cf6a3d16ddde517c1b0fba7aa776822800f40"
dependencies = [
"aws-credential-types",
"aws-smithy-async",
@@ -1359,7 +1348,6 @@ dependencies = [
"tokio-postgres",
"tokio-util",
"toml",
"toml_edit",
"tracing",
"url",
"utils",
@@ -2946,15 +2934,6 @@ version = "0.4.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
[[package]]
name = "lru"
version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3262e75e648fce39813cb56ac41f3c3e3f65217ebf3844d818d1f9398cfb0dc"
dependencies = [
"hashbrown 0.14.0",
]
[[package]]
name = "match_cfg"
version = "0.1.0"
@@ -3038,17 +3017,6 @@ dependencies = [
"procfs 0.16.0",
]
[[package]]
name = "measured-tokio"
version = "0.0.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b4ed0773ddbda3a85e39d0e094934549c410ca686a5095bcd72fb62100252a0"
dependencies = [
"itoa",
"measured",
"tokio",
]
[[package]]
name = "memchr"
version = "2.6.4"
@@ -3090,7 +3058,6 @@ dependencies = [
"libc",
"measured",
"measured-process",
"measured-tokio",
"once_cell",
"procfs 0.14.2",
"prometheus",
@@ -3671,7 +3638,6 @@ dependencies = [
"arc-swap",
"async-compression",
"async-stream",
"async-trait",
"byteorder",
"bytes",
"camino",
@@ -4140,7 +4106,6 @@ name = "postgres_backend"
version = "0.1.0"
dependencies = [
"anyhow",
"async-trait",
"bytes",
"futures",
"once_cell",
@@ -4404,7 +4369,6 @@ dependencies = [
"hyper 1.2.0",
"hyper-tungstenite",
"hyper-util",
"indexmap 2.0.1",
"ipnet",
"itertools",
"lasso",

View File

@@ -52,14 +52,14 @@ azure_storage_blobs = "0.19"
flate2 = "1.0.26"
async-stream = "0.3"
async-trait = "0.1"
aws-config = { version = "1.3", default-features = false, features=["rustls"] }
aws-sdk-s3 = "1.26"
aws-config = { version = "1.1.4", default-features = false, features=["rustls"] }
aws-sdk-s3 = "1.14"
aws-sdk-iam = "1.15.0"
aws-smithy-async = { version = "1.2.1", default-features = false, features=["rt-tokio"] }
aws-smithy-types = "1.1.9"
aws-credential-types = "1.2.0"
aws-sigv4 = { version = "1.2.1", features = ["sign-http"] }
aws-types = "1.2.0"
aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] }
aws-smithy-types = "1.1.4"
aws-credential-types = "1.1.4"
aws-sigv4 = { version = "1.2.0", features = ["sign-http"] }
aws-types = "1.1.7"
axum = { version = "0.6.20", features = ["ws"] }
base64 = "0.13.0"
bincode = "1.3"
@@ -99,7 +99,6 @@ humantime = "2.1"
humantime-serde = "1.1.1"
hyper = "0.14"
hyper-tungstenite = "0.13.0"
indexmap = "2"
inotify = "0.10.2"
ipnet = "2.9.0"
itertools = "0.10"
@@ -109,7 +108,6 @@ leaky-bucket = "1.0.1"
libc = "0.2"
md5 = "0.7.0"
measured = { version = "0.0.21", features=["lasso"] }
measured-tokio = { version = "0.0.21" }
measured-process = { version = "0.0.21" }
memoffset = "0.8"
native-tls = "0.2"

View File

@@ -47,7 +47,7 @@ COPY --chown=nonroot . .
# Show build caching stats to check if it was used in the end.
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
RUN set -e \
&& RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment --cfg=tokio_unstable" cargo build \
&& RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \
--bin pg_sni_router \
--bin pageserver \
--bin pagectl \

View File

@@ -81,14 +81,11 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status:
echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \
exit 1; }
mkdir -p $(POSTGRES_INSTALL_DIR)/build/$*
VERSION=$*; \
EXTRA_VERSION=$$(cd $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION && git rev-parse HEAD); \
(cd $(POSTGRES_INSTALL_DIR)/build/$$VERSION && \
env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION/configure \
(cd $(POSTGRES_INSTALL_DIR)/build/$* && \
env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \
CFLAGS='$(PG_CFLAGS)' \
$(PG_CONFIGURE_OPTS) --with-extra-version=" ($$EXTRA_VERSION)" \
--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$$VERSION > configure.log)
$(PG_CONFIGURE_OPTS) \
--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$* > configure.log)
# nicer alias to run 'configure'
# Note: I've been unable to use templates for this part of our configuration.

View File

@@ -51,7 +51,6 @@ use tracing::{error, info, warn};
use url::Url;
use compute_api::responses::ComputeStatus;
use compute_api::spec::ComputeSpec;
use compute_tools::compute::{
forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID,
@@ -70,34 +69,6 @@ use compute_tools::swap::resize_swap;
const BUILD_TAG_DEFAULT: &str = "latest";
fn main() -> Result<()> {
let (build_tag, clap_args) = init()?;
let (pg_handle, start_pg_result) = {
// Enter startup tracing context
let _startup_context_guard = startup_context_from_env();
let cli_args = process_cli(&clap_args)?;
let cli_spec = try_spec_from_cli(&clap_args, &cli_args)?;
let wait_spec_result = wait_spec(build_tag, cli_args, cli_spec)?;
start_postgres(&clap_args, wait_spec_result)?
// Startup is finished, exit the startup tracing span
};
// PostgreSQL is now running, if startup was successful. Wait until it exits.
let wait_pg_result = wait_postgres(pg_handle)?;
let delay_exit = cleanup_after_postgres_exit(start_pg_result)?;
maybe_delay_exit(delay_exit);
deinit_and_exit(wait_pg_result);
}
fn init() -> Result<(String, clap::ArgMatches)> {
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
@@ -112,15 +83,9 @@ fn init() -> Result<(String, clap::ArgMatches)> {
.to_string();
info!("build_tag: {build_tag}");
Ok((build_tag, cli().get_matches()))
}
fn process_cli(matches: &clap::ArgMatches) -> Result<ProcessCliResult> {
let pgbin_default = "postgres";
let pgbin = matches
.get_one::<String>("pgbin")
.map(|s| s.as_str())
.unwrap_or(pgbin_default);
let matches = cli().get_matches();
let pgbin_default = String::from("postgres");
let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);
let ext_remote_storage = matches
.get_one::<String>("remote-ext-config")
@@ -148,30 +113,6 @@ fn process_cli(matches: &clap::ArgMatches) -> Result<ProcessCliResult> {
let spec_path = matches.get_one::<String>("spec-path");
let resize_swap_on_bind = matches.get_flag("resize-swap-on-bind");
Ok(ProcessCliResult {
connstr,
pgdata,
pgbin,
ext_remote_storage,
http_port,
spec_json,
spec_path,
resize_swap_on_bind,
})
}
struct ProcessCliResult<'clap> {
connstr: &'clap str,
pgdata: &'clap str,
pgbin: &'clap str,
ext_remote_storage: Option<&'clap str>,
http_port: u16,
spec_json: Option<&'clap String>,
spec_path: Option<&'clap String>,
resize_swap_on_bind: bool,
}
fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
// Extract OpenTelemetry context for the startup actions from the
// TRACEPARENT and TRACESTATE env variables, and attach it to the current
// tracing context.
@@ -208,7 +149,7 @@ fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
if let Ok(val) = std::env::var("TRACESTATE") {
startup_tracing_carrier.insert("tracestate".to_string(), val);
}
if !startup_tracing_carrier.is_empty() {
let startup_context_guard = if !startup_tracing_carrier.is_empty() {
use opentelemetry::propagation::TextMapPropagator;
use opentelemetry::sdk::propagation::TraceContextPropagator;
let guard = TraceContextPropagator::new()
@@ -218,17 +159,8 @@ fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
Some(guard)
} else {
None
}
}
};
fn try_spec_from_cli(
matches: &clap::ArgMatches,
ProcessCliResult {
spec_json,
spec_path,
..
}: &ProcessCliResult,
) -> Result<CliSpecParams> {
let compute_id = matches.get_one::<String>("compute-id");
let control_plane_uri = matches.get_one::<String>("control-plane-uri");
@@ -269,34 +201,6 @@ fn try_spec_from_cli(
}
};
Ok(CliSpecParams {
spec,
live_config_allowed,
})
}
struct CliSpecParams {
/// If a spec was provided via CLI or file, the [`ComputeSpec`]
spec: Option<ComputeSpec>,
live_config_allowed: bool,
}
fn wait_spec(
build_tag: String,
ProcessCliResult {
connstr,
pgdata,
pgbin,
ext_remote_storage,
resize_swap_on_bind,
http_port,
..
}: ProcessCliResult,
CliSpecParams {
spec,
live_config_allowed,
}: CliSpecParams,
) -> Result<WaitSpecResult> {
let mut new_state = ComputeState::new();
let spec_set;
@@ -335,6 +239,8 @@ fn wait_spec(
let _http_handle =
launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");
let extension_server_port: u16 = http_port;
if !spec_set {
// No spec provided, hang waiting for it.
info!("no compute spec provided, waiting");
@@ -363,29 +269,6 @@ fn wait_spec(
state.start_time = now;
}
Ok(WaitSpecResult {
compute,
http_port,
resize_swap_on_bind,
})
}
struct WaitSpecResult {
compute: Arc<ComputeNode>,
// passed through from ProcessCliResult
http_port: u16,
resize_swap_on_bind: bool,
}
fn start_postgres(
// need to allow unused because `matches` is only used if target_os = "linux"
#[allow(unused_variables)] matches: &clap::ArgMatches,
WaitSpecResult {
compute,
http_port,
resize_swap_on_bind,
}: WaitSpecResult,
) -> Result<(Option<PostgresHandle>, StartPostgresResult)> {
// We got all we need, update the state.
let mut state = compute.state.lock().unwrap();
state.status = ComputeStatus::Init;
@@ -435,10 +318,10 @@ fn start_postgres(
}
}
let extension_server_port: u16 = http_port;
// Start Postgres
let mut pg = None;
let mut exit_code = None;
if !prestartup_failed {
pg = match compute.start_compute(extension_server_port) {
Ok(pg) => Some(pg),
@@ -493,7 +376,7 @@ fn start_postgres(
// This token is used internally by the monitor to clean up all threads
let token = CancellationToken::new();
let vm_monitor = rt.as_ref().map(|rt| {
let vm_monitor = &rt.as_ref().map(|rt| {
rt.spawn(vm_monitor::start(
Box::leak(Box::new(vm_monitor::Args {
cgroup: cgroup.cloned(),
@@ -506,41 +389,12 @@ fn start_postgres(
}
}
Ok((
pg,
StartPostgresResult {
delay_exit,
compute,
#[cfg(target_os = "linux")]
rt,
#[cfg(target_os = "linux")]
token,
#[cfg(target_os = "linux")]
vm_monitor,
},
))
}
type PostgresHandle = (std::process::Child, std::thread::JoinHandle<()>);
struct StartPostgresResult {
delay_exit: bool,
// passed through from WaitSpecResult
compute: Arc<ComputeNode>,
#[cfg(target_os = "linux")]
rt: Option<tokio::runtime::Runtime>,
#[cfg(target_os = "linux")]
token: tokio_util::sync::CancellationToken,
#[cfg(target_os = "linux")]
vm_monitor: Option<tokio::task::JoinHandle<Result<()>>>,
}
fn wait_postgres(pg: Option<PostgresHandle>) -> Result<WaitPostgresResult> {
// Wait for the child Postgres process forever. In this state Ctrl+C will
// propagate to Postgres and it will be shut down as well.
let mut exit_code = None;
if let Some((mut pg, logs_handle)) = pg {
// Startup is finished, exit the startup tracing span
drop(startup_context_guard);
let ecode = pg
.wait()
.expect("failed to start waiting on Postgres process");
@@ -555,25 +409,6 @@ fn wait_postgres(pg: Option<PostgresHandle>) -> Result<WaitPostgresResult> {
exit_code = ecode.code()
}
Ok(WaitPostgresResult { exit_code })
}
struct WaitPostgresResult {
exit_code: Option<i32>,
}
fn cleanup_after_postgres_exit(
StartPostgresResult {
mut delay_exit,
compute,
#[cfg(target_os = "linux")]
vm_monitor,
#[cfg(target_os = "linux")]
token,
#[cfg(target_os = "linux")]
rt,
}: StartPostgresResult,
) -> Result<bool> {
// Terminate the vm_monitor so it releases the file watcher on
// /sys/fs/cgroup/neon-postgres.
// Note: the vm-monitor only runs on linux because it requires cgroups.
@@ -615,19 +450,13 @@ fn cleanup_after_postgres_exit(
error!("error while checking for core dumps: {err:?}");
}
Ok(delay_exit)
}
fn maybe_delay_exit(delay_exit: bool) {
// If launch failed, keep serving HTTP requests for a while, so the cloud
// control plane can get the actual error.
if delay_exit {
info!("giving control plane 30s to collect the error before shutdown");
thread::sleep(Duration::from_secs(30));
}
}
fn deinit_and_exit(WaitPostgresResult { exit_code }: WaitPostgresResult) -> ! {
// Shutdown trace pipeline gracefully, so that it has a chance to send any
// pending traces before we exit. Shutting down OTEL tracing provider may
// hang for quite some time, see, for example:

View File

@@ -28,7 +28,6 @@ serde_with.workspace = true
tar.workspace = true
thiserror.workspace = true
toml.workspace = true
toml_edit.workspace = true
tokio.workspace = true
tokio-postgres.workspace = true
tokio-util.workspace = true

View File

@@ -9,11 +9,8 @@ use anyhow::{anyhow, bail, Context, Result};
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum};
use compute_api::spec::ComputeMode;
use control_plane::endpoint::ComputeControlPlane;
use control_plane::local_env::{
InitForceMode, LocalEnv, NeonBroker, NeonLocalInitConf, NeonLocalInitPageserverConf,
SafekeeperConf,
};
use control_plane::pageserver::PageServerNode;
use control_plane::local_env::{InitForceMode, LocalEnv};
use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
use control_plane::safekeeper::SafekeeperNode;
use control_plane::storage_controller::StorageController;
use control_plane::{broker, local_env};
@@ -55,6 +52,44 @@ const DEFAULT_PG_VERSION: &str = "15";
const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";
fn default_conf(num_pageservers: u16) -> String {
let mut template = format!(
r#"
# Default built-in configuration, defined in main.rs
control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}'
[broker]
listen_addr = '{DEFAULT_BROKER_ADDR}'
[[safekeepers]]
id = {DEFAULT_SAFEKEEPER_ID}
pg_port = {DEFAULT_SAFEKEEPER_PG_PORT}
http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT}
"#,
);
for i in 0..num_pageservers {
let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64);
let pg_port = DEFAULT_PAGESERVER_PG_PORT + i;
let http_port = DEFAULT_PAGESERVER_HTTP_PORT + i;
template += &format!(
r#"
[[pageservers]]
id = {pageserver_id}
listen_pg_addr = '127.0.0.1:{pg_port}'
listen_http_addr = '127.0.0.1:{http_port}'
pg_auth_type = '{trust_auth}'
http_auth_type = '{trust_auth}'
"#,
trust_auth = AuthType::Trust,
)
}
template
}
///
/// Timelines tree element used as a value in the HashMap.
///
@@ -98,7 +133,7 @@ fn main() -> Result<()> {
let subcommand_result = match sub_name {
"tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
"timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
"start" => rt.block_on(handle_start_all(&env)),
"start" => rt.block_on(handle_start_all(sub_args, &env)),
"stop" => rt.block_on(handle_stop_all(sub_args, &env)),
"pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
"storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)),
@@ -117,7 +152,7 @@ fn main() -> Result<()> {
};
match subcommand_result {
Ok(Some(updated_env)) => updated_env.persist_config()?,
Ok(Some(updated_env)) => updated_env.persist_config(&updated_env.base_data_dir)?,
Ok(None) => (),
Err(e) => {
eprintln!("command failed: {e:?}");
@@ -306,65 +341,48 @@ fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TimelineId
}
fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
let num_pageservers = init_match.get_one::<u16>("num-pageservers");
let force = init_match.get_one("force").expect("we set a default value");
// Create the in-memory `LocalEnv` that we'd normally load from disk in `load_config`.
let init_conf: NeonLocalInitConf = if let Some(config_path) =
init_match.get_one::<PathBuf>("config")
{
// User (likely the Python test suite) provided a description of the environment.
if num_pageservers.is_some() {
bail!("Cannot specify both --num-pageservers and --config, use key `pageservers` in the --config file instead");
}
let num_pageservers = init_match
.get_one::<u16>("num-pageservers")
.expect("num-pageservers arg has a default");
// Create config file
let toml_file: String = if let Some(config_path) = init_match.get_one::<PathBuf>("config") {
// load and parse the file
let contents = std::fs::read_to_string(config_path).with_context(|| {
std::fs::read_to_string(config_path).with_context(|| {
format!(
"Could not read configuration file '{}'",
config_path.display()
)
})?;
toml_edit::de::from_str(&contents)?
})?
} else {
// User (likely interactive) did not provide a description of the environment, give them the default
NeonLocalInitConf {
control_plane_api: Some(Some(DEFAULT_PAGESERVER_CONTROL_PLANE_API.parse().unwrap())),
broker: NeonBroker {
listen_addr: DEFAULT_BROKER_ADDR.parse().unwrap(),
},
safekeepers: vec![SafekeeperConf {
id: DEFAULT_SAFEKEEPER_ID,
pg_port: DEFAULT_SAFEKEEPER_PG_PORT,
http_port: DEFAULT_SAFEKEEPER_HTTP_PORT,
..Default::default()
}],
pageservers: (0..num_pageservers.copied().unwrap_or(1))
.map(|i| {
let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64);
let pg_port = DEFAULT_PAGESERVER_PG_PORT + i;
let http_port = DEFAULT_PAGESERVER_HTTP_PORT + i;
NeonLocalInitPageserverConf {
id: pageserver_id,
listen_pg_addr: format!("127.0.0.1:{pg_port}"),
listen_http_addr: format!("127.0.0.1:{http_port}"),
pg_auth_type: AuthType::Trust,
http_auth_type: AuthType::Trust,
other: Default::default(),
}
})
.collect(),
pg_distrib_dir: None,
neon_distrib_dir: None,
default_tenant_id: TenantId::from_array(std::array::from_fn(|_| 0)),
storage_controller: None,
control_plane_compute_hook_api: None,
}
// Built-in default config
default_conf(*num_pageservers)
};
LocalEnv::init(init_conf, force)
.context("materialize initial neon_local environment on disk")?;
Ok(LocalEnv::load_config().expect("freshly written config should be loadable"))
let pg_version = init_match
.get_one::<u32>("pg-version")
.copied()
.context("Failed to parse postgres version from the argument string")?;
let mut env =
LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?;
let force = init_match.get_one("force").expect("we set a default value");
env.init(pg_version, force)
.context("Failed to initialize neon repository")?;
// Create remote storage location for default LocalFs remote storage
std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?;
// Initialize pageserver, create initial tenant and timeline.
for ps_conf in &env.pageservers {
PageServerNode::from_env(&env, ps_conf)
.initialize(&pageserver_config_overrides(init_match))
.unwrap_or_else(|e| {
eprintln!("pageserver init failed: {e:?}");
exit(1);
});
}
Ok(env)
}
/// The default pageserver is the one where CLI tenant/timeline operations are sent by default.
@@ -379,6 +397,15 @@ fn get_default_pageserver(env: &local_env::LocalEnv) -> PageServerNode {
PageServerNode::from_env(env, ps_conf)
}
fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
init_match
.get_many::<String>("pageserver-config-override")
.into_iter()
.flatten()
.map(String::as_str)
.collect()
}
async fn handle_tenant(
tenant_match: &ArgMatches,
env: &mut local_env::LocalEnv,
@@ -1049,7 +1076,10 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
match sub_match.subcommand() {
Some(("start", subcommand_args)) => {
if let Err(e) = get_pageserver(env, subcommand_args)?.start().await {
if let Err(e) = get_pageserver(env, subcommand_args)?
.start(&pageserver_config_overrides(subcommand_args))
.await
{
eprintln!("pageserver start failed: {e}");
exit(1);
}
@@ -1075,7 +1105,10 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
exit(1);
}
if let Err(e) = pageserver.start().await {
if let Err(e) = pageserver
.start(&pageserver_config_overrides(subcommand_args))
.await
{
eprintln!("pageserver start failed: {e}");
exit(1);
}
@@ -1202,7 +1235,7 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
Ok(())
}
async fn handle_start_all(env: &local_env::LocalEnv) -> anyhow::Result<()> {
async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> {
// Endpoints are not started automatically
broker::start_broker_process(env).await?;
@@ -1219,7 +1252,10 @@ async fn handle_start_all(env: &local_env::LocalEnv) -> anyhow::Result<()> {
for ps_conf in &env.pageservers {
let pageserver = PageServerNode::from_env(env, ps_conf);
if let Err(e) = pageserver.start().await {
if let Err(e) = pageserver
.start(&pageserver_config_overrides(sub_match))
.await
{
eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
try_stop_all(env, true).await;
exit(1);
@@ -1360,6 +1396,13 @@ fn cli() -> Command {
.required(false)
.value_name("stop-mode");
let pageserver_config_args = Arg::new("pageserver-config-override")
.long("pageserver-config-override")
.num_args(1)
.action(ArgAction::Append)
.help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more")
.required(false);
let remote_ext_config_args = Arg::new("remote-ext-config")
.long("remote-ext-config")
.num_args(1)
@@ -1393,7 +1436,9 @@ fn cli() -> Command {
let num_pageservers_arg = Arg::new("num-pageservers")
.value_parser(value_parser!(u16))
.long("num-pageservers")
.help("How many pageservers to create (default 1)");
.help("How many pageservers to create (default 1)")
.required(false)
.default_value("1");
let update_catalog = Arg::new("update-catalog")
.value_parser(value_parser!(bool))
@@ -1419,13 +1464,14 @@ fn cli() -> Command {
.subcommand(
Command::new("init")
.about("Initialize a new Neon repository, preparing configs for services to start with")
.arg(pageserver_config_args.clone())
.arg(num_pageservers_arg.clone())
.arg(
Arg::new("config")
.long("config")
.required(false)
.value_parser(value_parser!(PathBuf))
.value_name("config")
.value_name("config"),
)
.arg(pg_version_arg.clone())
.arg(force_arg)
@@ -1507,6 +1553,7 @@ fn cli() -> Command {
.subcommand(Command::new("status"))
.subcommand(Command::new("start")
.about("Start local pageserver")
.arg(pageserver_config_args.clone())
)
.subcommand(Command::new("stop")
.about("Stop local pageserver")
@@ -1514,6 +1561,7 @@ fn cli() -> Command {
)
.subcommand(Command::new("restart")
.about("Restart local pageserver")
.arg(pageserver_config_args.clone())
)
)
.subcommand(
@@ -1628,6 +1676,7 @@ fn cli() -> Command {
.subcommand(
Command::new("start")
.about("Start page server and safekeepers")
.arg(pageserver_config_args)
)
.subcommand(
Command::new("stop")

View File

@@ -3,7 +3,7 @@
//! Now it also provides init method which acts like a stub for proper installation
//! script which will use local paths.
use anyhow::{bail, Context};
use anyhow::{bail, ensure, Context};
use clap::ValueEnum;
use postgres_backend::AuthType;
@@ -23,8 +23,6 @@ use utils::{
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
};
use crate::pageserver::PageServerNode;
use crate::pageserver::PAGESERVER_REMOTE_STORAGE_DIR;
use crate::safekeeper::SafekeeperNode;
pub const DEFAULT_PG_VERSION: u32 = 15;
@@ -36,7 +34,7 @@ pub const DEFAULT_PG_VERSION: u32 = 15;
// to 'neon_local init --config=<path>' option. See control_plane/simple.conf for
// an example.
//
#[derive(PartialEq, Eq, Clone, Debug)]
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
pub struct LocalEnv {
// Base directory for all the nodes (the pageserver, safekeepers and
// compute endpoints).
@@ -44,99 +42,59 @@ pub struct LocalEnv {
// This is not stored in the config file. Rather, this is the path where the
// config file itself is. It is read from the NEON_REPO_DIR env variable or
// '.neon' if not given.
#[serde(skip)]
pub base_data_dir: PathBuf,
// Path to postgres distribution. It's expected that "bin", "include",
// "lib", "share" from postgres distribution are there. If at some point
// in time we will be able to run against vanilla postgres we may split that
// to four separate paths and match OS-specific installation layout.
#[serde(default)]
pub pg_distrib_dir: PathBuf,
// Path to pageserver binary.
#[serde(default)]
pub neon_distrib_dir: PathBuf,
// Default tenant ID to use with the 'neon_local' command line utility, when
// --tenant_id is not explicitly specified.
#[serde(default)]
pub default_tenant_id: Option<TenantId>,
// used to issue tokens during e.g pg start
#[serde(default)]
pub private_key_path: PathBuf,
pub broker: NeonBroker,
// Configuration for the storage controller (1 per neon_local environment)
#[serde(default)]
pub storage_controller: NeonStorageControllerConf,
/// This Vec must always contain at least one pageserver
/// Populdated by [`Self::load_config`] from the individual `pageserver.toml`s.
/// NB: not used anymore except for informing users that they need to change their `.neon/config`.
pub pageservers: Vec<PageServerConf>,
#[serde(default)]
pub safekeepers: Vec<SafekeeperConf>,
// Control plane upcall API for pageserver: if None, we will not run storage_controller If set, this will
// be propagated into each pageserver's configuration.
#[serde(default)]
pub control_plane_api: Option<Url>,
// Control plane upcall API for storage controller. If set, this will be propagated into the
// storage controller's configuration.
#[serde(default)]
pub control_plane_compute_hook_api: Option<Url>,
/// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user.
#[serde(default)]
// A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
// but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
// https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
pub branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
}
/// On-disk state stored in `.neon/config`.
#[derive(PartialEq, Eq, Clone, Debug, Default, Serialize, Deserialize)]
#[serde(default, deny_unknown_fields)]
pub struct OnDiskConfig {
pub pg_distrib_dir: PathBuf,
pub neon_distrib_dir: PathBuf,
pub default_tenant_id: Option<TenantId>,
pub private_key_path: PathBuf,
pub broker: NeonBroker,
pub storage_controller: NeonStorageControllerConf,
#[serde(
skip_serializing,
deserialize_with = "fail_if_pageservers_field_specified"
)]
pub pageservers: Vec<PageServerConf>,
pub safekeepers: Vec<SafekeeperConf>,
pub control_plane_api: Option<Url>,
pub control_plane_compute_hook_api: Option<Url>,
branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
}
fn fail_if_pageservers_field_specified<'de, D>(_: D) -> Result<Vec<PageServerConf>, D::Error>
where
D: serde::Deserializer<'de>,
{
Err(serde::de::Error::custom(
"The 'pageservers' field is no longer used; pageserver.toml is now authoritative; \
Please remove the `pageservers` from your .neon/config.",
))
}
/// The description of the neon_local env to be initialized by `neon_local init --config`.
#[derive(Clone, Debug, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct NeonLocalInitConf {
// TODO: do we need this? Seems unused
pub pg_distrib_dir: Option<PathBuf>,
// TODO: do we need this? Seems unused
pub neon_distrib_dir: Option<PathBuf>,
pub default_tenant_id: TenantId,
pub broker: NeonBroker,
pub storage_controller: Option<NeonStorageControllerConf>,
pub pageservers: Vec<NeonLocalInitPageserverConf>,
pub safekeepers: Vec<SafekeeperConf>,
pub control_plane_api: Option<Option<Url>>,
pub control_plane_compute_hook_api: Option<Option<Url>>,
}
/// Broker config for cluster internal communication.
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
#[serde(default)]
@@ -183,18 +141,24 @@ impl NeonBroker {
}
}
// neon_local needs to know this subset of pageserver configuration.
// For legacy reasons, this information is duplicated from `pageserver.toml` into `.neon/config`.
// It can get stale if `pageserver.toml` is changed.
// TODO(christian): don't store this at all in `.neon/config`, always load it from `pageserver.toml`
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
#[serde(default, deny_unknown_fields)]
pub struct PageServerConf {
// node id
pub id: NodeId,
// Pageserver connection settings
pub listen_pg_addr: String,
pub listen_http_addr: String,
// auth type used for the PG and HTTP ports
pub pg_auth_type: AuthType,
pub http_auth_type: AuthType,
pub(crate) virtual_file_io_engine: Option<String>,
pub(crate) get_vectored_impl: Option<String>,
pub(crate) get_impl: Option<String>,
pub(crate) validate_vectored_get: Option<bool>,
}
impl Default for PageServerConf {
@@ -205,40 +169,10 @@ impl Default for PageServerConf {
listen_http_addr: String::new(),
pg_auth_type: AuthType::Trust,
http_auth_type: AuthType::Trust,
}
}
}
/// The toml that can be passed to `neon_local init --config`.
/// This is a subset of the `pageserver.toml` configuration.
// TODO(christian): use pageserver_api::config::ConfigToml (PR #7656)
#[derive(Clone, Debug, serde::Deserialize, serde::Serialize)]
pub struct NeonLocalInitPageserverConf {
pub id: NodeId,
pub listen_pg_addr: String,
pub listen_http_addr: String,
pub pg_auth_type: AuthType,
pub http_auth_type: AuthType,
#[serde(flatten)]
pub other: HashMap<String, toml::Value>,
}
impl From<&NeonLocalInitPageserverConf> for PageServerConf {
fn from(conf: &NeonLocalInitPageserverConf) -> Self {
let NeonLocalInitPageserverConf {
id,
listen_pg_addr,
listen_http_addr,
pg_auth_type,
http_auth_type,
other: _,
} = conf;
Self {
id: *id,
listen_pg_addr: listen_pg_addr.clone(),
listen_http_addr: listen_http_addr.clone(),
pg_auth_type: *pg_auth_type,
http_auth_type: *http_auth_type,
virtual_file_io_engine: None,
get_vectored_impl: None,
get_impl: None,
validate_vectored_get: None,
}
}
}
@@ -426,7 +360,44 @@ impl LocalEnv {
.collect()
}
/// Construct `Self` from on-disk state.
/// Create a LocalEnv from a config file.
///
/// Unlike 'load_config', this function fills in any defaults that are missing
/// from the config file.
pub fn parse_config(toml: &str) -> anyhow::Result<Self> {
let mut env: LocalEnv = toml::from_str(toml)?;
// Find postgres binaries.
// Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install".
// Note that later in the code we assume, that distrib dirs follow the same pattern
// for all postgres versions.
if env.pg_distrib_dir == Path::new("") {
if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
env.pg_distrib_dir = postgres_bin.into();
} else {
let cwd = env::current_dir()?;
env.pg_distrib_dir = cwd.join("pg_install")
}
}
// Find neon binaries.
if env.neon_distrib_dir == Path::new("") {
env::current_exe()?
.parent()
.unwrap()
.clone_into(&mut env.neon_distrib_dir);
}
if env.pageservers.is_empty() {
anyhow::bail!("Configuration must contain at least one pageserver");
}
env.base_data_dir = base_path();
Ok(env)
}
/// Locate and load config
pub fn load_config() -> anyhow::Result<Self> {
let repopath = base_path();
@@ -440,129 +411,38 @@ impl LocalEnv {
// TODO: check that it looks like a neon repository
// load and parse file
let config_file_contents = fs::read_to_string(repopath.join("config"))?;
let on_disk_config: OnDiskConfig = toml::from_str(config_file_contents.as_str())?;
let mut env = {
let OnDiskConfig {
pg_distrib_dir,
neon_distrib_dir,
default_tenant_id,
private_key_path,
broker,
storage_controller,
pageservers,
safekeepers,
control_plane_api,
control_plane_compute_hook_api,
branch_name_mappings,
} = on_disk_config;
LocalEnv {
base_data_dir: repopath.clone(),
pg_distrib_dir,
neon_distrib_dir,
default_tenant_id,
private_key_path,
broker,
storage_controller,
pageservers,
safekeepers,
control_plane_api,
control_plane_compute_hook_api,
branch_name_mappings,
}
};
let config = fs::read_to_string(repopath.join("config"))?;
let mut env: LocalEnv = toml::from_str(config.as_str())?;
// The source of truth for pageserver configuration is the pageserver.toml.
assert!(
env.pageservers.is_empty(),
"we ensure this during deserialization"
);
env.pageservers = {
let iter = std::fs::read_dir(&repopath).context("open dir")?;
let mut pageservers = Vec::new();
for res in iter {
let dentry = res?;
const PREFIX: &str = "pageserver_";
let dentry_name = dentry
.file_name()
.into_string()
.ok()
.with_context(|| format!("non-utf8 dentry: {:?}", dentry.path()))
.unwrap();
if !dentry_name.starts_with(PREFIX) {
continue;
}
if !dentry.file_type().context("determine file type")?.is_dir() {
anyhow::bail!("expected a directory, got {:?}", dentry.path());
}
let id = dentry_name[PREFIX.len()..]
.parse::<NodeId>()
.with_context(|| format!("parse id from {:?}", dentry.path()))?;
// TODO(christian): use pageserver_api::config::ConfigToml (PR #7656)
#[derive(serde::Serialize, serde::Deserialize)]
// (allow unknown fields, unlike PageServerConf)
struct PageserverConfigTomlSubset {
id: NodeId,
listen_pg_addr: String,
listen_http_addr: String,
pg_auth_type: AuthType,
http_auth_type: AuthType,
}
let config_toml_path = dentry.path().join("pageserver.toml");
let config_toml: PageserverConfigTomlSubset = toml_edit::de::from_str(
&std::fs::read_to_string(&config_toml_path)
.with_context(|| format!("read {:?}", config_toml_path))?,
)
.context("parse pageserver.toml")?;
let PageserverConfigTomlSubset {
id: config_toml_id,
listen_pg_addr,
listen_http_addr,
pg_auth_type,
http_auth_type,
} = config_toml;
let conf = PageServerConf {
id: {
anyhow::ensure!(
config_toml_id == id,
"id mismatch: config_toml.id={config_toml_id} id={id}",
);
id
},
listen_pg_addr,
listen_http_addr,
pg_auth_type,
http_auth_type,
};
pageservers.push(conf);
}
pageservers
};
env.base_data_dir = repopath;
Ok(env)
}
pub fn persist_config(&self) -> anyhow::Result<()> {
Self::persist_config_impl(
&self.base_data_dir,
&OnDiskConfig {
pg_distrib_dir: self.pg_distrib_dir.clone(),
neon_distrib_dir: self.neon_distrib_dir.clone(),
default_tenant_id: self.default_tenant_id,
private_key_path: self.private_key_path.clone(),
broker: self.broker.clone(),
storage_controller: self.storage_controller.clone(),
pageservers: vec![], // it's skip_serializing anyway
safekeepers: self.safekeepers.clone(),
control_plane_api: self.control_plane_api.clone(),
control_plane_compute_hook_api: self.control_plane_compute_hook_api.clone(),
branch_name_mappings: self.branch_name_mappings.clone(),
},
)
}
pub fn persist_config(&self, base_path: &Path) -> anyhow::Result<()> {
// Currently, the user first passes a config file with 'neon_local init --config=<path>'
// We read that in, in `create_config`, and fill any missing defaults. Then it's saved
// to .neon/config. TODO: We lose any formatting and comments along the way, which is
// a bit sad.
let mut conf_content = r#"# This file describes a local deployment of the page server
# and safekeeeper node. It is read by the 'neon_local' command-line
# utility.
"#
.to_string();
// Convert the LocalEnv to a toml file.
//
// This could be as simple as this:
//
// conf_content += &toml::to_string_pretty(env)?;
//
// But it results in a "values must be emitted before tables". I'm not sure
// why, AFAICS the table, i.e. 'safekeepers: Vec<SafekeeperConf>' is last.
// Maybe rust reorders the fields to squeeze avoid padding or something?
// In any case, converting to toml::Value first, and serializing that, works.
// See https://github.com/alexcrichton/toml-rs/issues/142
conf_content += &toml::to_string_pretty(&toml::Value::try_from(self)?)?;
pub fn persist_config_impl(base_path: &Path, config: &OnDiskConfig) -> anyhow::Result<()> {
let conf_content = &toml::to_string_pretty(config)?;
let target_config_path = base_path.join("config");
fs::write(&target_config_path, conf_content).with_context(|| {
format!(
@@ -587,13 +467,17 @@ impl LocalEnv {
}
}
/// Materialize the [`NeonLocalInitConf`] to disk. Called during [`neon_local init`].
pub fn init(conf: NeonLocalInitConf, force: &InitForceMode) -> anyhow::Result<()> {
let base_path = base_path();
assert_ne!(base_path, Path::new(""));
let base_path = &base_path;
//
// Initialize a new Neon repository
//
pub fn init(&mut self, pg_version: u32, force: &InitForceMode) -> anyhow::Result<()> {
// check if config already exists
let base_path = &self.base_data_dir;
ensure!(
base_path != Path::new(""),
"repository base path is missing"
);
// create base_path dir
if base_path.exists() {
match force {
InitForceMode::MustNotExist => {
@@ -625,96 +509,70 @@ impl LocalEnv {
}
}
}
if !self.pg_bin_dir(pg_version)?.join("postgres").exists() {
bail!(
"Can't find postgres binary at {}",
self.pg_bin_dir(pg_version)?.display()
);
}
for binary in ["pageserver", "safekeeper"] {
if !self.neon_distrib_dir.join(binary).exists() {
bail!(
"Can't find binary '{binary}' in neon distrib dir '{}'",
self.neon_distrib_dir.display()
);
}
}
if !base_path.exists() {
fs::create_dir(base_path)?;
}
let NeonLocalInitConf {
pg_distrib_dir,
neon_distrib_dir,
default_tenant_id,
broker,
storage_controller,
pageservers,
safekeepers,
control_plane_api,
control_plane_compute_hook_api,
} = conf;
// Find postgres binaries.
// Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install".
// Note that later in the code we assume, that distrib dirs follow the same pattern
// for all postgres versions.
let pg_distrib_dir = pg_distrib_dir.unwrap_or_else(|| {
if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
postgres_bin.into()
} else {
let cwd = env::current_dir().unwrap();
cwd.join("pg_install")
}
});
// Find neon binaries.
let neon_distrib_dir = neon_distrib_dir
.unwrap_or_else(|| env::current_exe().unwrap().parent().unwrap().to_owned());
// Generate keypair for JWT.
//
// The keypair is only needed if authentication is enabled in any of the
// components. For convenience, we generate the keypair even if authentication
// is not enabled, so that you can easily enable it after the initialization
// step.
generate_auth_keys(
base_path.join("auth_private_key.pem").as_path(),
base_path.join("auth_public_key.pem").as_path(),
)
.context("generate auth keys")?;
let private_key_path = PathBuf::from("auth_private_key.pem");
// create the runtime type because the remaining initialization code below needs
// a LocalEnv instance op operation
// TODO: refactor to avoid this, LocalEnv should only be constructed from on-disk state
let env = LocalEnv {
base_data_dir: base_path.clone(),
pg_distrib_dir,
neon_distrib_dir,
default_tenant_id: Some(default_tenant_id),
private_key_path,
broker,
storage_controller: storage_controller.unwrap_or_default(),
pageservers: pageservers.iter().map(Into::into).collect(),
safekeepers,
control_plane_api: control_plane_api.unwrap_or_default(),
control_plane_compute_hook_api: control_plane_compute_hook_api.unwrap_or_default(),
branch_name_mappings: Default::default(),
};
// create endpoints dir
fs::create_dir_all(env.endpoints_path())?;
// create safekeeper dirs
for safekeeper in &env.safekeepers {
fs::create_dir_all(SafekeeperNode::datadir_path_by_id(&env, safekeeper.id))?;
// step. However, if the key generation fails, we treat it as non-fatal if
// authentication was not enabled.
if self.private_key_path == PathBuf::new() {
match generate_auth_keys(
base_path.join("auth_private_key.pem").as_path(),
base_path.join("auth_public_key.pem").as_path(),
) {
Ok(()) => {
self.private_key_path = PathBuf::from("auth_private_key.pem");
}
Err(e) => {
if !self.auth_keys_needed() {
eprintln!("Could not generate keypair for JWT authentication: {e}");
eprintln!("Continuing anyway because authentication was not enabled");
self.private_key_path = PathBuf::from("auth_private_key.pem");
} else {
return Err(e);
}
}
}
}
// initialize pageserver state
for (i, ps) in pageservers.into_iter().enumerate() {
let runtime_ps = &env.pageservers[i];
assert_eq!(&PageServerConf::from(&ps), runtime_ps);
fs::create_dir(env.pageserver_data_dir(ps.id))?;
PageServerNode::from_env(&env, runtime_ps)
.initialize(ps)
.context("pageserver init failed")?;
fs::create_dir_all(self.endpoints_path())?;
for safekeeper in &self.safekeepers {
fs::create_dir_all(SafekeeperNode::datadir_path_by_id(self, safekeeper.id))?;
}
// setup remote remote location for default LocalFs remote storage
std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?;
self.persist_config(base_path)
}
env.persist_config()
fn auth_keys_needed(&self) -> bool {
self.pageservers.iter().any(|ps| {
ps.pg_auth_type == AuthType::NeonJWT || ps.http_auth_type == AuthType::NeonJWT
}) || self.safekeepers.iter().any(|sk| sk.auth_enabled)
}
}
pub fn base_path() -> PathBuf {
fn base_path() -> PathBuf {
match std::env::var_os("NEON_REPO_DIR") {
Some(val) => PathBuf::from(val),
None => PathBuf::from(".neon"),
@@ -757,3 +615,31 @@ fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow
}
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn simple_conf_parsing() {
let simple_conf_toml = include_str!("../simple.conf");
let simple_conf_parse_result = LocalEnv::parse_config(simple_conf_toml);
assert!(
simple_conf_parse_result.is_ok(),
"failed to parse simple config {simple_conf_toml}, reason: {simple_conf_parse_result:?}"
);
let string_to_replace = "listen_addr = '127.0.0.1:50051'";
let spoiled_url_str = "listen_addr = '!@$XOXO%^&'";
let spoiled_url_toml = simple_conf_toml.replace(string_to_replace, spoiled_url_str);
assert!(
spoiled_url_toml.contains(spoiled_url_str),
"Failed to replace string {string_to_replace} in the toml file {simple_conf_toml}"
);
let spoiled_url_parse_result = LocalEnv::parse_config(&spoiled_url_toml);
assert!(
spoiled_url_parse_result.is_err(),
"expected toml with invalid Url {spoiled_url_toml} to fail the parsing, but got {spoiled_url_parse_result:?}"
);
}
}

View File

@@ -4,21 +4,21 @@
//!
//! .neon/
//!
use std::borrow::Cow;
use std::collections::HashMap;
use std::io;
use std::io::Write;
use std::num::NonZeroU64;
use std::path::PathBuf;
use std::str::FromStr;
use std::process::Command;
use std::time::Duration;
use anyhow::{bail, Context};
use camino::Utf8PathBuf;
use futures::SinkExt;
use pageserver_api::models::{
self, AuxFilePolicy, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo,
TimelineInfo,
self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo,
};
use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api;
@@ -30,7 +30,7 @@ use utils::{
lsn::Lsn,
};
use crate::local_env::{NeonLocalInitPageserverConf, PageServerConf};
use crate::local_env::PageServerConf;
use crate::{background_process, local_env::LocalEnv};
/// Directory within .neon which will be used by default for LocalFs remote storage.
@@ -74,23 +74,71 @@ impl PageServerNode {
}
}
fn pageserver_init_make_toml(
&self,
conf: NeonLocalInitPageserverConf,
) -> anyhow::Result<toml_edit::Document> {
assert_eq!(&PageServerConf::from(&conf), &self.conf, "during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully");
// TODO(christian): instead of what we do here, create a pageserver_api::config::ConfigToml (PR #7656)
/// Merge overrides provided by the user on the command line with our default overides derived from neon_local configuration.
///
/// These all end up on the command line of the `pageserver` binary.
fn neon_local_overrides(&self, cli_overrides: &[&str]) -> Vec<String> {
// FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc.
let pg_distrib_dir_param = format!(
"pg_distrib_dir='{}'",
self.env.pg_distrib_dir_raw().display()
);
let PageServerConf {
id,
listen_pg_addr,
listen_http_addr,
pg_auth_type,
http_auth_type,
virtual_file_io_engine,
get_vectored_impl,
get_impl,
validate_vectored_get,
} = &self.conf;
let id = format!("id={}", id);
let http_auth_type_param = format!("http_auth_type='{}'", http_auth_type);
let listen_http_addr_param = format!("listen_http_addr='{}'", listen_http_addr);
let pg_auth_type_param = format!("pg_auth_type='{}'", pg_auth_type);
let listen_pg_addr_param = format!("listen_pg_addr='{}'", listen_pg_addr);
let virtual_file_io_engine = if let Some(virtual_file_io_engine) = virtual_file_io_engine {
format!("virtual_file_io_engine='{virtual_file_io_engine}'")
} else {
String::new()
};
let get_vectored_impl = if let Some(get_vectored_impl) = get_vectored_impl {
format!("get_vectored_impl='{get_vectored_impl}'")
} else {
String::new()
};
let get_impl = if let Some(get_impl) = get_impl {
format!("get_impl='{get_impl}'")
} else {
String::new()
};
let validate_vectored_get = if let Some(validate_vectored_get) = validate_vectored_get {
format!("validate_vectored_get={validate_vectored_get}")
} else {
String::new()
};
let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());
let mut overrides = vec![pg_distrib_dir_param, broker_endpoint_param];
let mut overrides = vec![
id,
pg_distrib_dir_param,
http_auth_type_param,
pg_auth_type_param,
listen_http_addr_param,
listen_pg_addr_param,
broker_endpoint_param,
virtual_file_io_engine,
get_vectored_impl,
get_impl,
validate_vectored_get,
];
if let Some(control_plane_api) = &self.env.control_plane_api {
overrides.push(format!(
@@ -100,7 +148,7 @@ impl PageServerNode {
// Storage controller uses the same auth as pageserver: if JWT is enabled
// for us, we will also need it to talk to them.
if matches!(conf.http_auth_type, AuthType::NeonJWT) {
if matches!(http_auth_type, AuthType::NeonJWT) {
let jwt_token = self
.env
.generate_auth_token(&Claims::new(None, Scope::GenerationsApi))
@@ -109,40 +157,31 @@ impl PageServerNode {
}
}
if !conf.other.contains_key("remote_storage") {
if !cli_overrides
.iter()
.any(|c| c.starts_with("remote_storage"))
{
overrides.push(format!(
"remote_storage={{local_path='../{PAGESERVER_REMOTE_STORAGE_DIR}'}}"
));
}
if conf.http_auth_type != AuthType::Trust || conf.pg_auth_type != AuthType::Trust {
if *http_auth_type != AuthType::Trust || *pg_auth_type != AuthType::Trust {
// Keys are generated in the toplevel repo dir, pageservers' workdirs
// are one level below that, so refer to keys with ../
overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
}
// Apply the user-provided overrides
overrides.push(
toml_edit::ser::to_string_pretty(&conf)
.expect("we deserialized this from toml earlier"),
);
overrides.extend(cli_overrides.iter().map(|&c| c.to_owned()));
// Turn `overrides` into a toml document.
// TODO: above code is legacy code, it should be refactored to use toml_edit directly.
let mut config_toml = toml_edit::Document::new();
for fragment_str in overrides {
let fragment = toml_edit::Document::from_str(&fragment_str)
.expect("all fragments in `overrides` are valid toml documents, this function controls that");
for (key, item) in fragment.iter() {
config_toml.insert(key, item.clone());
}
}
Ok(config_toml)
overrides
}
/// Initializes a pageserver node by creating its config with the overrides provided.
pub fn initialize(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> {
self.pageserver_init(conf)
pub fn initialize(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
// First, run `pageserver --init` and wait for it to write a config into FS and exit.
self.pageserver_init(config_overrides)
.with_context(|| format!("Failed to run init for pageserver node {}", self.conf.id))
}
@@ -158,11 +197,11 @@ impl PageServerNode {
.expect("non-Unicode path")
}
pub async fn start(&self) -> anyhow::Result<()> {
self.start_node().await
pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
self.start_node(config_overrides, false).await
}
fn pageserver_init(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> {
fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
let datadir = self.repo_path();
let node_id = self.conf.id;
println!(
@@ -173,20 +212,29 @@ impl PageServerNode {
);
io::stdout().flush()?;
let config = self
.pageserver_init_make_toml(conf)
.context("make pageserver toml")?;
let config_file_path = datadir.join("pageserver.toml");
let mut config_file = std::fs::OpenOptions::new()
.create_new(true)
.write(true)
.open(&config_file_path)
.with_context(|| format!("open pageserver toml for write: {config_file_path:?}"))?;
config_file
.write_all(config.to_string().as_bytes())
.context("write pageserver toml")?;
drop(config_file);
// TODO: invoke a TBD config-check command to validate that pageserver will start with the written config
if !datadir.exists() {
std::fs::create_dir(&datadir)?;
}
let datadir_path_str = datadir.to_str().with_context(|| {
format!("Cannot start pageserver node {node_id} in path that has no string representation: {datadir:?}")
})?;
let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str);
args.push(Cow::Borrowed("--init"));
let init_output = Command::new(self.env.pageserver_bin())
.args(args.iter().map(Cow::as_ref))
.envs(self.pageserver_env_variables()?)
.output()
.with_context(|| format!("Failed to run pageserver init for node {node_id}"))?;
anyhow::ensure!(
init_output.status.success(),
"Pageserver init for node {} did not finish successfully, stdout: {}, stderr: {}",
node_id,
String::from_utf8_lossy(&init_output.stdout),
String::from_utf8_lossy(&init_output.stderr),
);
// Write metadata file, used by pageserver on startup to register itself with
// the storage controller
@@ -214,7 +262,11 @@ impl PageServerNode {
Ok(())
}
async fn start_node(&self) -> anyhow::Result<()> {
async fn start_node(
&self,
config_overrides: &[&str],
update_config: bool,
) -> anyhow::Result<()> {
// TODO: using a thread here because start_process() is not async but we need to call check_status()
let datadir = self.repo_path();
print!(
@@ -231,12 +283,15 @@ impl PageServerNode {
self.conf.id, datadir,
)
})?;
let args = vec!["-D", datadir_path_str];
let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str);
if update_config {
args.push(Cow::Borrowed("--update-config"));
}
background_process::start_process(
"pageserver",
&datadir,
&self.env.pageserver_bin(),
args,
args.iter().map(Cow::as_ref),
self.pageserver_env_variables()?,
background_process::InitialPidFile::Expect(self.pid_file()),
|| async {
@@ -253,6 +308,22 @@ impl PageServerNode {
Ok(())
}
fn pageserver_basic_args<'a>(
&self,
config_overrides: &'a [&'a str],
datadir_path_str: &'a str,
) -> Vec<Cow<'a, str>> {
let mut args = vec![Cow::Borrowed("-D"), Cow::Borrowed(datadir_path_str)];
let overrides = self.neon_local_overrides(config_overrides);
for config_override in overrides {
args.push(Cow::Borrowed("-c"));
args.push(Cow::Owned(config_override));
}
args
}
fn pageserver_env_variables(&self) -> anyhow::Result<Vec<(String, String)>> {
// FIXME: why is this tied to pageserver's auth type? Whether or not the safekeeper
// needs a token, and how to generate that token, seems independent to whether
@@ -378,11 +449,11 @@ impl PageServerNode {
.map(serde_json::from_str)
.transpose()
.context("parse `timeline_get_throttle` from json")?,
switch_aux_file_policy: settings
.remove("switch_aux_file_policy")
.map(|x| x.parse::<AuxFilePolicy>())
switch_to_aux_file_v2: settings
.remove("switch_to_aux_file_v2")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'switch_aux_file_policy'")?,
.context("Failed to parse 'switch_to_aux_file_v2' as bool")?,
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")
@@ -501,11 +572,11 @@ impl PageServerNode {
.map(serde_json::from_str)
.transpose()
.context("parse `timeline_get_throttle` from json")?,
switch_aux_file_policy: settings
.remove("switch_aux_file_policy")
.map(|x| x.parse::<AuxFilePolicy>())
switch_to_aux_file_v2: settings
.remove("switch_to_aux_file_v2")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'switch_aux_file_policy'")?,
.context("Failed to parse 'switch_to_aux_file_v2' as bool")?,
}
};

View File

@@ -18,9 +18,6 @@ workspace_hack.workspace = true
procfs.workspace = true
measured-process.workspace = true
[target.'cfg(tokio_unstable)'.dependencies]
measured-tokio.workspace = true
[dev-dependencies]
rand = "0.8"
rand_distr = "0.4.3"

View File

@@ -148,11 +148,6 @@ pub struct NeonMetrics {
#[metric(init = measured_process::ProcessCollector::for_self())]
process: measured_process::ProcessCollector,
#[cfg(tokio_unstable)]
#[metric(namespace = "tokio")]
#[metric(init = measured_tokio::NamedRuntimesCollector::new())]
pub tokio: measured_tokio::NamedRuntimesCollector,
#[metric(namespace = "libmetrics")]
#[metric(init = LibMetrics::new(build_info))]
libmetrics: LibMetrics,
@@ -485,15 +480,6 @@ impl<A: CounterPairAssoc> CounterPairVec<A> {
let id = self.vec.with_labels(labels);
self.vec.remove_metric(id)
}
pub fn sample(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>) -> u64 {
let id = self.vec.with_labels(labels);
let metric = self.vec.get_metric(id);
let inc = metric.inc.count.load(std::sync::atomic::Ordering::Relaxed);
let dec = metric.dec.count.load(std::sync::atomic::Ordering::Relaxed);
inc.saturating_sub(dec)
}
}
impl<T, A> ::measured::metric::group::MetricGroup<T> for CounterPairVec<A>

View File

@@ -240,7 +240,7 @@ impl<'a> ShardedRange<'a> {
/// pages that would not actually be stored on this node.
///
/// Don't use this function in code that works with physical entities like layer files.
pub fn raw_size(range: &Range<Key>) -> u32 {
fn raw_size(range: &Range<Key>) -> u32 {
if is_contiguous_range(range) {
contiguous_range_len(range)
} else {

View File

@@ -1,4 +1,3 @@
pub mod detach_ancestor;
pub mod partitioning;
pub mod utilization;
@@ -9,7 +8,6 @@ use std::{
collections::HashMap,
io::{BufRead, Read},
num::{NonZeroU64, NonZeroUsize},
str::FromStr,
time::{Duration, SystemTime},
};
@@ -305,31 +303,7 @@ pub struct TenantConfig {
pub lazy_slru_download: Option<bool>,
pub timeline_get_throttle: Option<ThrottleConfig>,
pub image_layer_creation_check_threshold: Option<u8>,
pub switch_aux_file_policy: Option<AuxFilePolicy>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum AuxFilePolicy {
V1,
V2,
CrossValidation,
}
impl FromStr for AuxFilePolicy {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let s = s.to_lowercase();
if s == "v1" {
Ok(Self::V1)
} else if s == "v2" {
Ok(Self::V2)
} else if s == "crossvalidation" || s == "cross_validation" {
Ok(Self::CrossValidation)
} else {
anyhow::bail!("cannot parse {} to aux file policy", s)
}
}
pub switch_to_aux_file_v2: Option<bool>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]

View File

@@ -1,6 +0,0 @@
use utils::id::TimelineId;
#[derive(Default, serde::Serialize)]
pub struct AncestorDetached {
pub reparented_timelines: Vec<TimelineId>,
}

View File

@@ -5,7 +5,6 @@ edition.workspace = true
license.workspace = true
[dependencies]
async-trait.workspace = true
anyhow.workspace = true
bytes.workspace = true
futures.workspace = true

View File

@@ -78,17 +78,16 @@ pub fn is_expected_io_error(e: &io::Error) -> bool {
)
}
#[async_trait::async_trait]
pub trait Handler<IO> {
/// Handle single query.
/// postgres_backend will issue ReadyForQuery after calling this (this
/// might be not what we want after CopyData streaming, but currently we don't
/// care). It will also flush out the output buffer.
async fn process_query(
fn process_query(
&mut self,
pgb: &mut PostgresBackend<IO>,
query_string: &str,
) -> Result<(), QueryError>;
) -> impl Future<Output = Result<(), QueryError>> + Send;
/// Called on startup packet receival, allows to process params.
///

View File

@@ -22,7 +22,6 @@ async fn make_tcp_pair() -> (TcpStream, TcpStream) {
struct TestHandler {}
#[async_trait::async_trait]
impl<IO: AsyncRead + AsyncWrite + Unpin + Send> Handler<IO> for TestHandler {
// return single col 'hey' for any query
async fn process_query(

View File

@@ -27,7 +27,7 @@ use aws_config::{
};
use aws_credential_types::provider::SharedCredentialsProvider;
use aws_sdk_s3::{
config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep},
config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep},
error::SdkError,
operation::get_object::GetObjectError,
types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass},
@@ -75,13 +75,13 @@ struct GetObjectRequest {
}
impl S3Bucket {
/// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
pub fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
pub fn new(aws_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
tracing::debug!(
"Creating s3 remote storage for S3 bucket {}",
remote_storage_config.bucket_name
aws_config.bucket_name
);
let region = Some(Region::new(remote_storage_config.bucket_region.clone()));
let region = Some(Region::new(aws_config.bucket_region.clone()));
let provider_conf = ProviderConfig::without_region().with_region(region.clone());
@@ -113,38 +113,6 @@ impl S3Bucket {
// AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off
let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
let sdk_config_loader: aws_config::ConfigLoader = aws_config::defaults(
#[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */
BehaviorVersion::v2023_11_09(),
)
.region(region)
.identity_cache(IdentityCache::lazy().build())
.credentials_provider(SharedCredentialsProvider::new(credentials_provider))
.sleep_impl(SharedAsyncSleep::from(sleep_impl));
let sdk_config: aws_config::SdkConfig = std::thread::scope(|s| {
s.spawn(|| {
// TODO: make this function async.
tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.unwrap()
.block_on(sdk_config_loader.load())
})
.join()
.unwrap()
});
let mut s3_config_builder = aws_sdk_s3::config::Builder::from(&sdk_config);
// Technically, the `remote_storage_config.endpoint` field only applies to S3 interactions.
// (In case we ever re-use the `sdk_config` for more than just the S3 client in the future)
if let Some(custom_endpoint) = remote_storage_config.endpoint.clone() {
s3_config_builder = s3_config_builder
.endpoint_url(custom_endpoint)
.force_path_style(true);
}
// We do our own retries (see [`backoff::retry`]). However, for the AWS SDK to enable rate limiting in response to throttling
// responses (e.g. 429 on too many ListObjectsv2 requests), we must provide a retry config. We set it to use at most one
// attempt, and enable 'Adaptive' mode, which causes rate limiting to be enabled.
@@ -152,36 +120,42 @@ impl S3Bucket {
retry_config
.set_max_attempts(Some(1))
.set_mode(Some(RetryMode::Adaptive));
s3_config_builder = s3_config_builder.retry_config(retry_config.build());
let s3_config = s3_config_builder.build();
let client = aws_sdk_s3::Client::from_conf(s3_config);
let mut config_builder = Builder::default()
.behavior_version(BehaviorVersion::v2023_11_09())
.region(region)
.identity_cache(IdentityCache::lazy().build())
.credentials_provider(SharedCredentialsProvider::new(credentials_provider))
.retry_config(retry_config.build())
.sleep_impl(SharedAsyncSleep::from(sleep_impl));
let prefix_in_bucket = remote_storage_config
.prefix_in_bucket
.as_deref()
.map(|prefix| {
let mut prefix = prefix;
while prefix.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
prefix = &prefix[1..]
}
if let Some(custom_endpoint) = aws_config.endpoint.clone() {
config_builder = config_builder
.endpoint_url(custom_endpoint)
.force_path_style(true);
}
let mut prefix = prefix.to_string();
while prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
prefix.pop();
}
prefix
});
let client = Client::from_conf(config_builder.build());
let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| {
let mut prefix = prefix;
while prefix.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
prefix = &prefix[1..]
}
let mut prefix = prefix.to_string();
while prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
prefix.pop();
}
prefix
});
Ok(Self {
client,
bucket_name: remote_storage_config.bucket_name.clone(),
max_keys_per_list_response: remote_storage_config.max_keys_per_list_response,
bucket_name: aws_config.bucket_name.clone(),
max_keys_per_list_response: aws_config.max_keys_per_list_response,
prefix_in_bucket,
concurrency_limiter: ConcurrencyLimiter::new(
remote_storage_config.concurrency_limit.get(),
),
upload_storage_class: remote_storage_config.upload_storage_class.clone(),
concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()),
upload_storage_class: aws_config.upload_storage_class.clone(),
timeout,
})
}

View File

@@ -3,7 +3,7 @@
//! # Example
//!
//! ```
//! # tokio::runtime::Builder::new_current_thread().enable_all().build().unwrap().block_on(async {
//! # tokio_test::block_on(async {
//! use utils::poison::Poison;
//! use std::time::Duration;
//!

View File

@@ -50,14 +50,6 @@ extern "C" fn get_flush_rec_ptr(wp: *mut WalProposer) -> XLogRecPtr {
}
}
extern "C" fn update_donor(wp: *mut WalProposer, donor: *mut Safekeeper, donor_lsn: XLogRecPtr) {
unsafe {
let callback_data = (*(*wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).update_donor(&mut (*donor), donor_lsn)
}
}
extern "C" fn get_current_timestamp(wp: *mut WalProposer) -> TimestampTz {
unsafe {
let callback_data = (*(*wp).config).callback_data;
@@ -399,7 +391,6 @@ pub(crate) fn create_api() -> walproposer_api {
get_shmem_state: Some(get_shmem_state),
start_streaming: Some(start_streaming),
get_flush_rec_ptr: Some(get_flush_rec_ptr),
update_donor: Some(update_donor),
get_current_timestamp: Some(get_current_timestamp),
conn_error_message: Some(conn_error_message),
conn_status: Some(conn_status),
@@ -430,32 +421,6 @@ pub(crate) fn create_api() -> walproposer_api {
}
}
pub fn empty_shmem() -> crate::bindings::WalproposerShmemState {
let empty_feedback = crate::bindings::PageserverFeedback {
present: false,
currentClusterSize: 0,
last_received_lsn: 0,
disk_consistent_lsn: 0,
remote_consistent_lsn: 0,
replytime: 0,
shard_number: 0,
};
crate::bindings::WalproposerShmemState {
propEpochStartLsn: crate::bindings::pg_atomic_uint64 { value: 0 },
donor_name: [0; 64],
donor_conninfo: [0; 1024],
donor_lsn: 0,
mutex: 0,
mineLastElectedTerm: crate::bindings::pg_atomic_uint64 { value: 0 },
backpressureThrottlingTime: crate::bindings::pg_atomic_uint64 { value: 0 },
currentClusterSize: crate::bindings::pg_atomic_uint64 { value: 0 },
shard_ps_feedback: [empty_feedback; 128],
num_shards: 0,
min_ps_feedback: empty_feedback,
}
}
impl std::fmt::Display for Level {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(f, "{:?}", self)

View File

@@ -1,5 +1,8 @@
use std::ffi::CString;
use postgres_ffi::WAL_SEGMENT_SIZE;
use utils::{id::TenantTimelineId, lsn::Lsn};
use crate::{
api_bindings::{create_api, take_vec_u8, Level},
bindings::{
@@ -7,8 +10,6 @@ use crate::{
WalProposerCreate, WalProposerFree, WalProposerPoll, WalProposerStart,
},
};
use postgres_ffi::WAL_SEGMENT_SIZE;
use utils::{id::TenantTimelineId, lsn::Lsn};
/// Rust high-level wrapper for C walproposer API. Many methods are not required
/// for simple cases, hence todo!() in default implementations.
@@ -27,10 +28,6 @@ pub trait ApiImpl {
todo!()
}
fn update_donor(&self, _donor: &mut Safekeeper, _donor_lsn: u64) {
todo!()
}
fn get_current_timestamp(&self) -> i64 {
todo!()
}
@@ -277,7 +274,6 @@ mod tests {
sync::{atomic::AtomicUsize, mpsc::sync_channel},
};
use std::cell::UnsafeCell;
use utils::id::TenantTimelineId;
use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper};
@@ -301,8 +297,6 @@ mod tests {
replies_ptr: AtomicUsize,
// channel to send LSN to the main thread
sync_channel: std::sync::mpsc::SyncSender<u64>,
// Shmem state, used for storing donor info
shmem: UnsafeCell<crate::bindings::WalproposerShmemState>,
}
impl MockImpl {
@@ -333,22 +327,11 @@ mod tests {
}
impl ApiImpl for MockImpl {
fn get_shmem_state(&self) -> *mut crate::bindings::WalproposerShmemState {
self.shmem.get()
}
fn get_current_timestamp(&self) -> i64 {
println!("get_current_timestamp");
0
}
fn update_donor(&self, donor: &mut crate::bindings::Safekeeper, donor_lsn: u64) {
let mut shmem = unsafe { *self.get_shmem_state() };
shmem.propEpochStartLsn.value = donor_lsn;
shmem.donor_conninfo = donor.conninfo;
shmem.donor_lsn = donor_lsn;
}
fn conn_status(
&self,
_: &mut crate::bindings::Safekeeper,
@@ -524,7 +507,6 @@ mod tests {
],
replies_ptr: AtomicUsize::new(0),
sync_channel: sender,
shmem: UnsafeCell::new(crate::api_bindings::empty_shmem()),
});
let config = crate::walproposer::Config {
ttid,

View File

@@ -15,7 +15,6 @@ anyhow.workspace = true
arc-swap.workspace = true
async-compression.workspace = true
async-stream.workspace = true
async-trait.workspace = true
byteorder.workspace = true
bytes.workspace = true
camino.workspace = true

View File

@@ -1,7 +1,7 @@
use pageserver::keyspace::{KeyPartitioning, KeySpace};
use pageserver::repository::Key;
use pageserver::tenant::layer_map::LayerMap;
use pageserver::tenant::storage_layer::LayerName;
use pageserver::tenant::storage_layer::LayerFileName;
use pageserver::tenant::storage_layer::PersistentLayerDesc;
use pageserver_api::shard::TenantShardId;
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
@@ -28,7 +28,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
let mut updates = layer_map.batch_update();
for fname in filenames {
let fname = fname.unwrap();
let fname = LayerName::from_str(&fname).unwrap();
let fname = LayerFileName::from_str(&fname).unwrap();
let layer = PersistentLayerDesc::from(fname);
let lsn_range = layer.get_lsn_range();

View File

@@ -24,9 +24,7 @@ use tracing::{debug, info};
use std::collections::{HashSet, VecDeque};
use std::ops::Range;
use crate::helpers::{
accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with,
};
use crate::helpers::{accum_key_values, keyspace_total_size, merge_delta_keys, overlaps_with};
use crate::interface::*;
use utils::lsn::Lsn;
@@ -106,13 +104,7 @@ pub async fn compact_tiered<E: CompactionJobExecutor>(
ctx,
)
.await?;
if current_level_target_height == u64::MAX {
// our target height includes all possible lsns
info!(
level = current_level_no,
depth = depth,
"compaction loop reached max current_level_target_height"
);
if target_file_size == u64::MAX {
break;
}
current_level_no += 1;
@@ -543,10 +535,7 @@ where
}
}
// Open stream
let key_value_stream =
std::pin::pin!(merge_delta_keys_buffered::<E>(deltas.as_slice(), ctx)
.await?
.map(Result::<_, anyhow::Error>::Ok));
let key_value_stream = std::pin::pin!(merge_delta_keys::<E>(deltas.as_slice(), ctx));
let mut new_jobs = Vec::new();
// Slide a window through the keyspace

View File

@@ -9,12 +9,10 @@ use pageserver_api::shard::ShardIdentity;
use pin_project_lite::pin_project;
use std::collections::BinaryHeap;
use std::collections::VecDeque;
use std::fmt::Display;
use std::future::Future;
use std::ops::{DerefMut, Range};
use std::pin::Pin;
use std::task::{ready, Poll};
use utils::lsn::Lsn;
pub fn keyspace_total_size<K>(
keyspace: &CompactionKeySpace<K>,
@@ -110,40 +108,17 @@ pub fn merge_delta_keys<'a, E: CompactionJobExecutor>(
}
}
pub async fn merge_delta_keys_buffered<'a, E: CompactionJobExecutor + 'a>(
layers: &'a [E::DeltaLayer],
ctx: &'a E::RequestContext,
) -> anyhow::Result<impl Stream<Item = <E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>>
{
let mut keys = Vec::new();
for l in layers {
// Boxing and casting to LoadFuture is required to obtain the right Sync bound.
// If we do l.load_keys(ctx).await? directly, there is a compilation error.
let load_future: LoadFuture<'a, _> = Box::pin(l.load_keys(ctx));
keys.extend(load_future.await?.into_iter());
}
keys.sort_by_key(|k| (k.key(), k.lsn()));
let stream = futures::stream::iter(keys.into_iter());
Ok(stream)
}
enum LazyLoadLayer<'a, E: CompactionJobExecutor> {
Loaded(VecDeque<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>),
Unloaded(&'a E::DeltaLayer),
}
impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> {
fn min_key(&self) -> E::Key {
fn key(&self) -> E::Key {
match self {
Self::Loaded(entries) => entries.front().unwrap().key(),
Self::Unloaded(dl) => dl.key_range().start,
}
}
fn min_lsn(&self) -> Lsn {
match self {
Self::Loaded(entries) => entries.front().unwrap().lsn(),
Self::Unloaded(dl) => dl.lsn_range().start,
}
}
}
impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
@@ -153,12 +128,12 @@ impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> {
impl<'a, E: CompactionJobExecutor> Ord for LazyLoadLayer<'a, E> {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
// reverse order so that we get a min-heap
(other.min_key(), other.min_lsn()).cmp(&(self.min_key(), self.min_lsn()))
other.key().cmp(&self.key())
}
}
impl<'a, E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'a, E> {
fn eq(&self, other: &Self) -> bool {
self.cmp(other) == std::cmp::Ordering::Equal
self.key().eq(&other.key())
}
}
impl<'a, E: CompactionJobExecutor> Eq for LazyLoadLayer<'a, E> {}
@@ -239,7 +214,7 @@ pub struct KeySize<K> {
pub fn accum_key_values<'a, I, K, D, E>(input: I) -> impl Stream<Item = Result<KeySize<K>, E>>
where
K: Eq + PartialOrd + Display + Copy,
K: Eq,
I: Stream<Item = Result<D, E>>,
D: CompactionDeltaEntry<'a, K>,
{
@@ -254,15 +229,12 @@ where
num_values: 1,
size: first.size(),
};
let mut last_key = accum.key;
while let Some(this) = input.next().await {
let this = this?;
if this.key() == accum.key {
accum.size += this.size();
accum.num_values += 1;
} else {
assert!(last_key <= accum.key, "last_key={last_key} <= accum.key={}", accum.key);
last_key = accum.key;
yield accum;
accum = KeySize {
key: this.key(),
@@ -271,7 +243,6 @@ where
};
}
}
assert!(last_key <= accum.key, "last_key={last_key} <= accum.key={}", accum.key);
yield accum;
}
}

View File

@@ -1,20 +1,5 @@
use once_cell::sync::OnceCell;
use pageserver_compaction::interface::CompactionLayer;
use pageserver_compaction::simulator::MockTimeline;
use utils::logging;
static LOG_HANDLE: OnceCell<()> = OnceCell::new();
pub(crate) fn setup_logging() {
LOG_HANDLE.get_or_init(|| {
logging::init(
logging::LogFormat::Test,
logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
logging::Output::Stdout,
)
.expect("Failed to init test logging")
});
}
/// Test the extreme case that there are so many updates for a single key that
/// even if we produce an extremely narrow delta layer, spanning just that one
@@ -26,14 +11,13 @@ pub(crate) fn setup_logging() {
#[ignore]
#[tokio::test]
async fn test_many_updates_for_single_key() {
setup_logging();
let mut executor = MockTimeline::new();
executor.target_file_size = 1_000_000; // 1 MB
executor.target_file_size = 10_000_000; // 10 MB
// Ingest 10 MB of updates to a single key.
// Ingest 100 MB of updates to a single key.
for _ in 1..1000 {
executor.ingest_uniform(100, 10, &(0..100_000)).unwrap();
executor.ingest_uniform(1000, 10, &(0..1)).unwrap();
executor.ingest_uniform(10_000, 10, &(0..1)).unwrap();
executor.compact().await.unwrap();
}
@@ -49,26 +33,3 @@ async fn test_many_updates_for_single_key() {
}
}
}
#[tokio::test]
async fn test_simple_updates() {
setup_logging();
let mut executor = MockTimeline::new();
executor.target_file_size = 500_000; // 500 KB
// Ingest some traffic.
for _ in 1..400 {
executor.ingest_uniform(100, 500, &(0..100_000)).unwrap();
}
for l in executor.live_layers.iter() {
println!("layer {}: {}", l.short_id(), l.file_size());
}
println!("Running compaction...");
executor.compact().await.unwrap();
for l in executor.live_layers.iter() {
println!("layer {}: {}", l.short_id(), l.file_size());
}
}

View File

@@ -28,8 +28,6 @@
//! # From an `index_part.json` in S3
//! (jq -r '.layer_metadata | keys[]' | cargo run -p pagectl draw-timeline ) < index_part.json-00000016 > out.svg
//!
//! # enrich with lines for gc_cutoff and a child branch point
//! cat <(jq -r '.historic_layers[] | .layer_file_name' < layers.json) <(echo -e 'gc_cutoff:0000001CE3FE32C9\nbranch:0000001DE3FE32C9') | cargo run --bin pagectl draw-timeline >| out.svg
//! ```
//!
//! ## Viewing
@@ -50,7 +48,7 @@
//! ```
//!
use anyhow::{Context, Result};
use anyhow::Result;
use pageserver::repository::Key;
use pageserver::METADATA_FILE_NAME;
use std::cmp::Ordering;
@@ -92,33 +90,6 @@ fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
(keys, lsns)
}
#[derive(Clone, Copy)]
enum LineKind {
GcCutoff,
Branch,
}
impl From<LineKind> for Fill {
fn from(value: LineKind) -> Self {
match value {
LineKind::GcCutoff => Fill::Color(rgb(255, 0, 0)),
LineKind::Branch => Fill::Color(rgb(0, 255, 0)),
}
}
}
impl FromStr for LineKind {
type Err = anyhow::Error;
fn from_str(s: &str) -> std::prelude::v1::Result<Self, Self::Err> {
Ok(match s {
"gc_cutoff" => LineKind::GcCutoff,
"branch" => LineKind::Branch,
_ => anyhow::bail!("unsupported linekind: {s}"),
})
}
}
pub fn main() -> Result<()> {
// Parse layer filenames from stdin
struct Layer {
@@ -128,29 +99,8 @@ pub fn main() -> Result<()> {
}
let mut files: Vec<Layer> = vec![];
let stdin = io::stdin();
let mut lines: Vec<(Lsn, LineKind)> = vec![];
for (lineno, line) in stdin.lock().lines().enumerate() {
let lineno = lineno + 1;
for line in stdin.lock().lines() {
let line = line.unwrap();
if let Some((kind, lsn)) = line.split_once(':') {
let (kind, lsn) = LineKind::from_str(kind)
.context("parse kind")
.and_then(|kind| {
if lsn.contains('/') {
Lsn::from_str(lsn)
} else {
Lsn::from_hex(lsn)
}
.map(|lsn| (kind, lsn))
.context("parse lsn")
})
.with_context(|| format!("parse {line:?} on {lineno}"))?;
lines.push((lsn, kind));
continue;
}
let line = PathBuf::from_str(&line).unwrap();
let filename = line.file_name().unwrap();
let filename = filename.to_str().unwrap();
@@ -167,9 +117,8 @@ pub fn main() -> Result<()> {
}
// Collect all coordinates
let mut keys: Vec<Key> = Vec::with_capacity(files.len());
let mut lsns: Vec<Lsn> = Vec::with_capacity(files.len() + lines.len());
let mut keys: Vec<Key> = vec![];
let mut lsns: Vec<Lsn> = vec![];
for Layer {
key_range: keyr,
lsn_range: lsnr,
@@ -182,8 +131,6 @@ pub fn main() -> Result<()> {
lsns.push(lsnr.end);
}
lsns.extend(lines.iter().map(|(lsn, _)| *lsn));
// Analyze
let key_map = build_coordinate_compression_map(keys);
let lsn_map = build_coordinate_compression_map(lsns);
@@ -197,13 +144,10 @@ pub fn main() -> Result<()> {
println!(
"{}",
BeginSvg {
w: (key_map.len() + 10) as f32,
w: key_map.len() as f32,
h: stretch * lsn_map.len() as f32
}
);
let xmargin = 0.05; // Height-dependent margin to disambiguate overlapping deltas
for Layer {
filename,
key_range: keyr,
@@ -225,6 +169,7 @@ pub fn main() -> Result<()> {
let mut lsn_diff = (lsn_end - lsn_start) as f32;
let mut fill = Fill::None;
let mut ymargin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas
let xmargin = 0.05; // Height-dependent margin to disambiguate overlapping deltas
let mut lsn_offset = 0.0;
// Fill in and thicken rectangle if it's an
@@ -244,7 +189,7 @@ pub fn main() -> Result<()> {
println!(
" {}",
rectangle(
5.0 + key_start as f32 + stretch * xmargin,
key_start as f32 + stretch * xmargin,
stretch * (lsn_max as f32 - (lsn_end as f32 - ymargin - lsn_offset)),
key_diff as f32 - stretch * 2.0 * xmargin,
stretch * (lsn_diff - 2.0 * ymargin)
@@ -255,26 +200,6 @@ pub fn main() -> Result<()> {
.comment(filename)
);
}
for (lsn, kind) in lines {
let lsn_start = *lsn_map.get(&lsn).unwrap();
let lsn_end = lsn_start;
let stretch = 2.0;
let lsn_diff = 0.3;
let lsn_offset = -lsn_diff / 2.0;
let ymargin = 0.05;
println!(
"{}",
rectangle(
0.0f32 + stretch * xmargin,
stretch * (lsn_map.len() as f32 - (lsn_end as f32 - ymargin - lsn_offset)),
(key_map.len() + 10) as f32,
stretch * (lsn_diff - 2.0 * ymargin)
)
.fill(kind)
);
}
println!("{}", EndSvg);
eprintln!("num_images: {}", num_images);

View File

@@ -3,7 +3,7 @@ use std::collections::HashMap;
use anyhow::Context;
use camino::Utf8PathBuf;
use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
use pageserver::tenant::storage_layer::LayerName;
use pageserver::tenant::storage_layer::LayerFileName;
use pageserver::tenant::{metadata::TimelineMetadata, IndexPart};
use utils::lsn::Lsn;
@@ -19,7 +19,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?;
#[derive(serde::Serialize)]
struct Output<'a> {
layer_metadata: &'a HashMap<LayerName, IndexLayerMetadata>,
layer_metadata: &'a HashMap<LayerFileName, IndexLayerMetadata>,
disk_consistent_lsn: Lsn,
timeline_metadata: &'a TimelineMetadata,
}

View File

@@ -1,4 +1,3 @@
use bytes::{Buf, BufMut, Bytes};
use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE};
use tracing::warn;
@@ -62,84 +61,6 @@ pub fn encode_aux_file_key(path: &str) -> Key {
}
}
const AUX_FILE_ENCODING_VERSION: u8 = 0x01;
pub fn decode_file_value(val: &[u8]) -> anyhow::Result<Vec<(&str, &[u8])>> {
let mut ptr = val;
if ptr.is_empty() {
// empty value = no files
return Ok(Vec::new());
}
assert_eq!(
ptr.get_u8(),
AUX_FILE_ENCODING_VERSION,
"unsupported aux file value"
);
let mut files = vec![];
while ptr.has_remaining() {
let key_len = ptr.get_u32() as usize;
let key = &ptr[..key_len];
ptr.advance(key_len);
let val_len = ptr.get_u32() as usize;
let content = &ptr[..val_len];
ptr.advance(val_len);
let path = std::str::from_utf8(key)?;
files.push((path, content));
}
Ok(files)
}
/// Decode an aux file key-value pair into a list of files. The returned `Bytes` contains reference
/// to the original value slice. Be cautious about memory consumption.
pub fn decode_file_value_bytes(val: &Bytes) -> anyhow::Result<Vec<(String, Bytes)>> {
let mut ptr = val.clone();
if ptr.is_empty() {
// empty value = no files
return Ok(Vec::new());
}
assert_eq!(
ptr.get_u8(),
AUX_FILE_ENCODING_VERSION,
"unsupported aux file value"
);
let mut files = vec![];
while ptr.has_remaining() {
let key_len = ptr.get_u32() as usize;
let key = ptr.slice(..key_len);
ptr.advance(key_len);
let val_len = ptr.get_u32() as usize;
let content = ptr.slice(..val_len);
ptr.advance(val_len);
let path = std::str::from_utf8(&key)?.to_string();
files.push((path, content));
}
Ok(files)
}
pub fn encode_file_value(files: &[(&str, &[u8])]) -> anyhow::Result<Vec<u8>> {
if files.is_empty() {
// no files = empty value
return Ok(Vec::new());
}
let mut encoded = vec![];
encoded.put_u8(AUX_FILE_ENCODING_VERSION);
for (path, content) in files {
if path.len() > u32::MAX as usize {
anyhow::bail!("{} exceeds path size limit", path);
}
encoded.put_u32(path.len() as u32);
encoded.put_slice(path.as_bytes());
if content.len() > u32::MAX as usize {
anyhow::bail!("{} exceeds content size limit", path);
}
encoded.put_u32(content.len() as u32);
encoded.put_slice(content);
}
Ok(encoded)
}
#[cfg(test)]
mod tests {
use super::*;
@@ -188,21 +109,4 @@ mod tests {
encode_aux_file_key("other_file_not_supported").to_string()
);
}
#[test]
fn test_value_encoding() {
let files = vec![
("pg_logical/1.file", "1111".as_bytes()),
("pg_logical/2.file", "2222".as_bytes()),
];
assert_eq!(
files,
decode_file_value(&encode_file_value(&files).unwrap()).unwrap()
);
let files = vec![];
assert_eq!(
files,
decode_file_value(&encode_file_value(&files).unwrap()).unwrap()
);
}
}

View File

@@ -601,7 +601,7 @@ where
// add zenith.signal file
let mut zenith_signal = String::new();
if self.prev_record_lsn == Lsn(0) {
if self.timeline.is_ancestor_lsn(self.lsn) {
if self.lsn == self.timeline.get_ancestor_lsn() {
write!(zenith_signal, "PREV LSN: none")
.map_err(|e| BasebackupError::Server(e.into()))?;
} else {

View File

@@ -3,7 +3,6 @@
//! Main entry point for the Page Server executable.
use std::env::{var, VarError};
use std::io::Read;
use std::sync::Arc;
use std::time::Duration;
use std::{env, ops::ControlFlow, str::FromStr};
@@ -152,34 +151,37 @@ fn initialize_config(
workdir: &Utf8Path,
) -> anyhow::Result<ControlFlow<(), &'static PageServerConf>> {
let init = arg_matches.get_flag("init");
let update_config = init || arg_matches.get_flag("update-config");
let file_contents: Option<toml_edit::Document> = match std::fs::File::open(cfg_file_path) {
Ok(mut f) => {
if init {
anyhow::bail!("config file already exists: {cfg_file_path}");
}
let md = f.metadata().context("stat config file")?;
if md.is_file() {
let mut s = String::new();
f.read_to_string(&mut s).context("read config file")?;
Some(s.parse().context("parse config file toml")?)
} else {
anyhow::bail!("directory entry exists but is not a file: {cfg_file_path}");
}
}
Err(e) if e.kind() == std::io::ErrorKind::NotFound => None,
Err(e) => {
anyhow::bail!("open pageserver config: {e}: {cfg_file_path}");
let (mut toml, config_file_exists) = if cfg_file_path.is_file() {
if init {
anyhow::bail!(
"Config file '{cfg_file_path}' already exists, cannot init it, use --update-config to update it",
);
}
// Supplement the CLI arguments with the config file
let cfg_file_contents = std::fs::read_to_string(cfg_file_path)
.with_context(|| format!("Failed to read pageserver config at '{cfg_file_path}'"))?;
(
cfg_file_contents
.parse::<toml_edit::Document>()
.with_context(|| {
format!("Failed to parse '{cfg_file_path}' as pageserver config")
})?,
true,
)
} else if cfg_file_path.exists() {
anyhow::bail!("Config file '{cfg_file_path}' exists but is not a regular file");
} else {
// We're initializing the tenant, so there's no config file yet
(
DEFAULT_CONFIG_FILE
.parse::<toml_edit::Document>()
.context("could not parse built-in config file")?,
false,
)
};
let mut effective_config = file_contents.unwrap_or_else(|| {
DEFAULT_CONFIG_FILE
.parse()
.expect("unit tests ensure this works")
});
// Patch with overrides from the command line
if let Some(values) = arg_matches.get_many::<String>("config-override") {
for option_line in values {
let doc = toml_edit::Document::from_str(option_line).with_context(|| {
@@ -187,21 +189,22 @@ fn initialize_config(
})?;
for (key, item) in doc.iter() {
effective_config.insert(key, item.clone());
if config_file_exists && update_config && key == "id" && toml.contains_key(key) {
anyhow::bail!("Pageserver config file exists at '{cfg_file_path}' and has node id already, it cannot be overridden");
}
toml.insert(key, item.clone());
}
}
}
debug!("Resulting toml: {effective_config}");
// Construct the runtime representation
let conf = PageServerConf::parse_and_validate(&effective_config, workdir)
debug!("Resulting toml: {toml}");
let conf = PageServerConf::parse_and_validate(&toml, workdir)
.context("Failed to parse pageserver configuration")?;
if init {
if update_config {
info!("Writing pageserver config to '{cfg_file_path}'");
std::fs::write(cfg_file_path, effective_config.to_string())
std::fs::write(cfg_file_path, toml.to_string())
.with_context(|| format!("Failed to write pageserver config to '{cfg_file_path}'"))?;
info!("Config successfully written to '{cfg_file_path}'")
}
@@ -755,13 +758,18 @@ fn cli() -> Command {
// See `settings.md` for more details on the extra configuration patameters pageserver can process
.arg(
Arg::new("config-override")
.long("config-override")
.short('c')
.num_args(1)
.action(ArgAction::Append)
.help("Additional configuration overrides of the ones from the toml config file (or new ones to add there). \
Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"),
)
.arg(
Arg::new("update-config")
.long("update-config")
.action(ArgAction::SetTrue)
.help("Update the config file when started"),
)
.arg(
Arg::new("enabled-features")
.long("enabled-features")

View File

@@ -38,7 +38,7 @@ use deleter::DeleterMessage;
use list_writer::ListWriterQueueMessage;
use validator::ValidatorQueueMessage;
use crate::{config::PageServerConf, tenant::storage_layer::LayerName};
use crate::{config::PageServerConf, tenant::storage_layer::LayerFileName};
// TODO: configurable for how long to wait before executing deletions
@@ -479,7 +479,7 @@ impl DeletionQueueClient {
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
current_generation: Generation,
layers: Vec<(LayerName, LayerFileMetadata)>,
layers: Vec<(LayerFileName, LayerFileMetadata)>,
) -> Result<(), DeletionQueueError> {
if current_generation.is_none() {
debug!("Enqueuing deletions in legacy mode, skipping queue");
@@ -511,7 +511,7 @@ impl DeletionQueueClient {
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
current_generation: Generation,
layers: Vec<(LayerName, LayerFileMetadata)>,
layers: Vec<(LayerFileName, LayerFileMetadata)>,
) -> Result<(), DeletionQueueError> {
metrics::DELETION_QUEUE
.keys_submitted
@@ -734,20 +734,20 @@ mod test {
use crate::{
control_plane_client::RetryForeverError,
repository::Key,
tenant::{harness::TenantHarness, storage_layer::DeltaLayerName},
tenant::{harness::TenantHarness, storage_layer::DeltaFileName},
};
use super::*;
pub const TIMELINE_ID: TimelineId =
TimelineId::from_array(hex!("11223344556677881122334455667788"));
pub const EXAMPLE_LAYER_NAME: LayerName = LayerName::Delta(DeltaLayerName {
pub const EXAMPLE_LAYER_NAME: LayerFileName = LayerFileName::Delta(DeltaFileName {
key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF),
lsn_range: Lsn(0x00000000016B59D8)..Lsn(0x00000000016B5A51),
});
// When you need a second layer in a test.
pub const EXAMPLE_LAYER_NAME_ALT: LayerName = LayerName::Delta(DeltaLayerName {
pub const EXAMPLE_LAYER_NAME_ALT: LayerFileName = LayerFileName::Delta(DeltaFileName {
key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF),
lsn_range: Lsn(0x00000000016B5A51)..Lsn(0x00000000016B5A61),
});
@@ -797,7 +797,7 @@ mod test {
/// Returns remote layer file name, suitable for use in assert_remote_files
fn write_remote_layer(
&self,
file_name: LayerName,
file_name: LayerFileName,
gen: Generation,
) -> anyhow::Result<String> {
let tenant_shard_id = self.harness.tenant_shard_id;
@@ -952,7 +952,7 @@ mod test {
let client = ctx.deletion_queue.new_client();
client.recover(HashMap::new())?;
let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
let tenant_shard_id = ctx.harness.tenant_shard_id;
let content: Vec<u8> = "victim1 contents".into();

View File

@@ -34,7 +34,7 @@ use crate::deletion_queue::TEMP_SUFFIX;
use crate::metrics;
use crate::tenant::remote_timeline_client::remote_layer_path;
use crate::tenant::remote_timeline_client::LayerFileMetadata;
use crate::tenant::storage_layer::LayerName;
use crate::tenant::storage_layer::LayerFileName;
use crate::virtual_file::on_fatal_io_error;
use crate::virtual_file::MaybeFatalIo;
@@ -59,7 +59,7 @@ pub(super) struct DeletionOp {
// `layers` and `objects` are both just lists of objects. `layers` is used if you do not
// have a config object handy to project it to a remote key, and need the consuming worker
// to do it for you.
pub(super) layers: Vec<(LayerName, LayerFileMetadata)>,
pub(super) layers: Vec<(LayerFileName, LayerFileMetadata)>,
pub(super) objects: Vec<RemotePath>,
/// The _current_ generation of the Tenant shard attachment in which we are enqueuing

View File

@@ -64,7 +64,7 @@ use crate::{
mgr::TenantManager,
remote_timeline_client::LayerFileMetadata,
secondary::SecondaryTenant,
storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName},
storage_layer::{AsLayerDesc, EvictionError, Layer, LayerFileName},
},
};
@@ -540,12 +540,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
js.spawn(async move {
layer
.secondary_tenant
.evict_layer(
tenant_manager.get_conf(),
layer.timeline_id,
layer.name,
layer.metadata,
)
.evict_layer(tenant_manager.get_conf(), layer.timeline_id, layer.name)
.await;
Ok(file_size)
});
@@ -604,7 +599,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
pub(crate) struct EvictionSecondaryLayer {
pub(crate) secondary_tenant: Arc<SecondaryTenant>,
pub(crate) timeline_id: TimelineId,
pub(crate) name: LayerName,
pub(crate) name: LayerFileName,
pub(crate) metadata: LayerFileMetadata,
}
@@ -637,9 +632,9 @@ impl EvictionLayer {
}
}
pub(crate) fn get_name(&self) -> LayerName {
pub(crate) fn get_name(&self) -> LayerFileName {
match self {
Self::Attached(l) => l.layer_desc().layer_name(),
Self::Attached(l) => l.layer_desc().filename(),
Self::Secondary(sl) => sl.name.clone(),
}
}

View File

@@ -420,6 +420,25 @@ paths:
description: Tenant scheduled to load successfully
/v1/tenant/{tenant_id}/synthetic_size:
parameters:
- name: tenant_id
in: path
required: true
schema:
type: string
get:
description: |
Calculate tenant's synthetic size
responses:
"200":
description: Tenant's synthetic size
content:
application/json:
schema:
$ref: "#/components/schemas/SyntheticSizeResponse"
# This route has no handler. TODO: remove?
/v1/tenant/{tenant_id}/size:
parameters:
- name: tenant_id
in: path
@@ -449,9 +468,19 @@ paths:
content:
application/json:
schema:
$ref: "#/components/schemas/SyntheticSizeResponse"
text/html:
description: SVG representation of the tenant and it's timelines.
type: object
required:
- id
- size
properties:
id:
type: string
format: hex
size:
type: integer
nullable: true
description: |
Size metric in bytes or null if inputs_only=true was given.
"401":
description: Unauthorized Error
content:
@@ -900,9 +929,6 @@ components:
format: hex
size:
type: integer
nullable: true
description: |
Size metric in bytes or null if inputs_only=true was given.
segment_sizes:
type: array
items:

View File

@@ -63,7 +63,6 @@ use crate::tenant::remote_timeline_client::list_remote_timelines;
use crate::tenant::secondary::SecondaryController;
use crate::tenant::size::ModelInputs;
use crate::tenant::storage_layer::LayerAccessStatsReset;
use crate::tenant::storage_layer::LayerName;
use crate::tenant::timeline::CompactFlags;
use crate::tenant::timeline::Timeline;
use crate::tenant::SpawnMode;
@@ -1229,15 +1228,13 @@ async fn layer_download_handler(
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let layer_file_name = get_request_param(&request, "layer_file_name")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let layer_name = LayerName::from_str(layer_file_name)
.map_err(|s| ApiError::BadRequest(anyhow::anyhow!(s)))?;
let state = get_state(&request);
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let downloaded = timeline
.download_layer(&layer_name)
.download_layer(layer_file_name)
.await
.map_err(ApiError::InternalServerError)?;
@@ -1261,14 +1258,11 @@ async fn evict_timeline_layer_handler(
let layer_file_name = get_request_param(&request, "layer_file_name")?;
let state = get_state(&request);
let layer_name = LayerName::from_str(layer_file_name)
.map_err(|s| ApiError::BadRequest(anyhow::anyhow!(s)))?;
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let evicted = timeline
.evict_layer(&layer_name)
.evict_layer(layer_file_name)
.await
.map_err(ApiError::InternalServerError)?;
@@ -1833,75 +1827,6 @@ async fn timeline_download_remote_layers_handler_get(
json_response(StatusCode::OK, info)
}
async fn timeline_detach_ancestor_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
use crate::tenant::timeline::detach_ancestor::Options;
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let span = tracing::info_span!("detach_ancestor", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id);
async move {
let mut options = Options::default();
let rewrite_concurrency =
parse_query_param::<_, std::num::NonZeroUsize>(&request, "rewrite_concurrency")?;
let copy_concurrency =
parse_query_param::<_, std::num::NonZeroUsize>(&request, "copy_concurrency")?;
[
(&mut options.rewrite_concurrency, rewrite_concurrency),
(&mut options.copy_concurrency, copy_concurrency),
]
.into_iter()
.filter_map(|(target, val)| val.map(|val| (target, val)))
.for_each(|(target, val)| *target = val);
let state = get_state(&request);
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
let ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download);
let ctx = &ctx;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(|e| ApiError::NotFound(e.into()))?;
let (_guard, prepared) = timeline
.prepare_to_detach_from_ancestor(&tenant, options, ctx)
.await
.map_err(|e| ApiError::InternalServerError(e.into()))?;
let res = state
.tenant_manager
.complete_detaching_timeline_ancestor(tenant_shard_id, timeline_id, prepared, ctx)
.await;
match res {
Ok(reparented_timelines) => {
let resp = pageserver_api::models::detach_ancestor::AncestorDetached {
reparented_timelines,
};
json_response(StatusCode::OK, resp)
}
Err(e) => Err(ApiError::InternalServerError(
e.context("timeline detach completion"),
)),
}
}
.instrument(span)
.await
}
async fn deletion_queue_flush(
r: Request<Body>,
cancel: CancellationToken,
@@ -2590,10 +2515,6 @@ pub fn make_router(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_remote_layers",
|r| api_handler(r, timeline_download_remote_layers_handler_get),
)
.put(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/detach_ancestor",
|r| api_handler(r, timeline_detach_ancestor_handler),
)
.delete("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
api_handler(r, timeline_delete_handler)
})

View File

@@ -1512,80 +1512,29 @@ static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy
});
pub(crate) struct TenantManagerMetrics {
tenant_slots_attached: UIntGauge,
tenant_slots_secondary: UIntGauge,
tenant_slots_inprogress: UIntGauge,
pub(crate) tenant_slots: UIntGauge,
pub(crate) tenant_slot_writes: IntCounter,
pub(crate) unexpected_errors: IntCounter,
}
impl TenantManagerMetrics {
/// Helpers for tracking slots. Note that these do not track the lifetime of TenantSlot objects
/// exactly: they track the lifetime of the slots _in the tenant map_.
pub(crate) fn slot_inserted(&self, slot: &TenantSlot) {
match slot {
TenantSlot::Attached(_) => {
self.tenant_slots_attached.inc();
}
TenantSlot::Secondary(_) => {
self.tenant_slots_secondary.inc();
}
TenantSlot::InProgress(_) => {
self.tenant_slots_inprogress.inc();
}
}
}
pub(crate) fn slot_removed(&self, slot: &TenantSlot) {
match slot {
TenantSlot::Attached(_) => {
self.tenant_slots_attached.dec();
}
TenantSlot::Secondary(_) => {
self.tenant_slots_secondary.dec();
}
TenantSlot::InProgress(_) => {
self.tenant_slots_inprogress.dec();
}
}
}
#[cfg(all(debug_assertions, not(test)))]
pub(crate) fn slots_total(&self) -> u64 {
self.tenant_slots_attached.get()
+ self.tenant_slots_secondary.get()
+ self.tenant_slots_inprogress.get()
}
}
pub(crate) static TENANT_MANAGER: Lazy<TenantManagerMetrics> = Lazy::new(|| {
let tenant_slots = register_uint_gauge_vec!(
TenantManagerMetrics {
tenant_slots: register_uint_gauge!(
"pageserver_tenant_manager_slots",
"How many slots currently exist, including all attached, secondary and in-progress operations",
&["mode"]
)
.expect("failed to define a metric");
TenantManagerMetrics {
tenant_slots_attached: tenant_slots
.get_metric_with_label_values(&["attached"])
.unwrap(),
tenant_slots_secondary: tenant_slots
.get_metric_with_label_values(&["secondary"])
.unwrap(),
tenant_slots_inprogress: tenant_slots
.get_metric_with_label_values(&["inprogress"])
.unwrap(),
tenant_slot_writes: register_int_counter!(
"pageserver_tenant_manager_slot_writes",
"Writes to a tenant slot, including all of create/attach/detach/delete"
)
.expect("failed to define a metric"),
unexpected_errors: register_int_counter!(
"pageserver_tenant_manager_unexpected_errors_total",
"Number of unexpected conditions encountered: nonzero value indicates a non-fatal bug."
)
.expect("failed to define a metric"),
}
.expect("failed to define a metric"),
tenant_slot_writes: register_int_counter!(
"pageserver_tenant_manager_slot_writes",
"Writes to a tenant slot, including all of create/attach/detach/delete"
)
.expect("failed to define a metric"),
unexpected_errors: register_int_counter!(
"pageserver_tenant_manager_unexpected_errors_total",
"Number of unexpected conditions encountered: nonzero value indicates a non-fatal bug."
)
.expect("failed to define a metric"),
}
});
pub(crate) struct DeletionQueueMetrics {
@@ -2326,7 +2275,6 @@ use std::time::{Duration, Instant};
use crate::context::{PageContentKind, RequestContext};
use crate::task_mgr::TaskKind;
use crate::tenant::mgr::TenantSlot;
/// Maintain a per timeline gauge in addition to the global gauge.
struct PerTimelineRemotePhysicalSizeGauge {
@@ -2929,8 +2877,6 @@ pub fn preinitialize_metrics() {
&WALRECEIVER_CANDIDATES_REMOVED,
&tokio_epoll_uring::THREAD_LOCAL_LAUNCH_FAILURES,
&tokio_epoll_uring::THREAD_LOCAL_LAUNCH_SUCCESSES,
&REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
&REMOTE_ONDEMAND_DOWNLOADED_BYTES,
]
.into_iter()
.for_each(|c| {

View File

@@ -1384,7 +1384,6 @@ impl PageServerHandler {
}
}
#[async_trait::async_trait]
impl<IO> postgres_backend::Handler<IO> for PageServerHandler
where
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,

View File

@@ -10,9 +10,9 @@ use super::tenant::{PageReconstructError, Timeline};
use crate::context::RequestContext;
use crate::keyspace::{KeySpace, KeySpaceAccum};
use crate::metrics::WAL_INGEST;
use crate::repository::*;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
use crate::walrecord::NeonWalRecord;
use crate::{aux_file, repository::*};
use anyhow::{ensure, Context};
use bytes::{Buf, Bytes, BytesMut};
use enum_map::Enum;
@@ -24,7 +24,6 @@ use pageserver_api::key::{
AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
};
use pageserver_api::keyspace::SparseKeySpace;
use pageserver_api::models::AuxFilePolicy;
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::BLCKSZ;
@@ -671,7 +670,7 @@ impl Timeline {
self.get(CHECKPOINT_KEY, lsn, ctx).await
}
async fn list_aux_files_v1(
pub(crate) async fn list_aux_files(
&self,
lsn: Lsn,
ctx: &RequestContext,
@@ -689,63 +688,6 @@ impl Timeline {
}
}
async fn list_aux_files_v2(
&self,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
let kv = self
.scan(KeySpace::single(Key::metadata_aux_key_range()), lsn, ctx)
.await
.context("scan")?;
let mut result = HashMap::new();
for (_, v) in kv {
let v = v.context("get value")?;
let v = aux_file::decode_file_value_bytes(&v).context("value decode")?;
for (fname, content) in v {
result.insert(fname, content);
}
}
Ok(result)
}
pub(crate) async fn list_aux_files(
&self,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
match self.get_switch_aux_file_policy() {
AuxFilePolicy::V1 => self.list_aux_files_v1(lsn, ctx).await,
AuxFilePolicy::V2 => self.list_aux_files_v2(lsn, ctx).await,
AuxFilePolicy::CrossValidation => {
let v1_result = self.list_aux_files_v1(lsn, ctx).await;
let v2_result = self.list_aux_files_v2(lsn, ctx).await;
match (v1_result, v2_result) {
(Ok(v1), Ok(v2)) => {
if v1 != v2 {
tracing::error!(
"unmatched aux file v1 v2 result:\nv1 {v1:?}\nv2 {v2:?}"
);
return Err(PageReconstructError::Other(anyhow::anyhow!(
"unmatched aux file v1 v2 result"
)));
}
Ok(v1)
}
(Ok(_), Err(v2)) => {
tracing::error!("aux file v1 returns Ok while aux file v2 returns an err");
Err(v2)
}
(Err(v1), Ok(_)) => {
tracing::error!("aux file v2 returns Ok while aux file v1 returns an err");
Err(v1)
}
(Err(_), Err(v2)) => Err(v2),
}
}
}
}
/// Does the same as get_current_logical_size but counted on demand.
/// Used to initialize the logical size tracking on startup.
///
@@ -1447,9 +1389,6 @@ impl<'a> DatadirModification<'a> {
}
pub fn init_aux_dir(&mut self) -> anyhow::Result<()> {
if let AuxFilePolicy::V2 = self.tline.get_switch_aux_file_policy() {
return Ok(());
}
let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
files: HashMap::new(),
})?;
@@ -1465,122 +1404,90 @@ impl<'a> DatadirModification<'a> {
content: &[u8],
ctx: &RequestContext,
) -> anyhow::Result<()> {
let policy = self.tline.get_switch_aux_file_policy();
if let AuxFilePolicy::V2 | AuxFilePolicy::CrossValidation = policy {
let key = aux_file::encode_aux_file_key(path);
// retrieve the key from the engine
let old_val = match self.get(key, ctx).await {
Ok(val) => Some(val),
Err(PageReconstructError::MissingKey(_)) => None,
Err(e) => return Err(e.into()),
};
let files = if let Some(ref old_val) = old_val {
aux_file::decode_file_value(old_val)?
} else {
Vec::new()
};
let new_files = if content.is_empty() {
files
.into_iter()
.filter(|(p, _)| &path != p)
.collect::<Vec<_>>()
} else {
files
.into_iter()
.filter(|(p, _)| &path != p)
.chain(std::iter::once((path, content)))
.collect::<Vec<_>>()
};
let new_val = aux_file::encode_file_value(&new_files)?;
self.put(key, Value::Image(new_val.into()));
}
let file_path = path.to_string();
let content = if content.is_empty() {
None
} else {
Some(Bytes::copy_from_slice(content))
};
if let AuxFilePolicy::V1 | AuxFilePolicy::CrossValidation = policy {
let file_path = path.to_string();
let content = if content.is_empty() {
None
let n_files;
let mut aux_files = self.tline.aux_files.lock().await;
if let Some(mut dir) = aux_files.dir.take() {
// We already updated aux files in `self`: emit a delta and update our latest value.
dir.upsert(file_path.clone(), content.clone());
n_files = dir.files.len();
if aux_files.n_deltas == MAX_AUX_FILE_DELTAS {
self.put(
AUX_FILES_KEY,
Value::Image(Bytes::from(
AuxFilesDirectory::ser(&dir).context("serialize")?,
)),
);
aux_files.n_deltas = 0;
} else {
Some(Bytes::copy_from_slice(content))
};
self.put(
AUX_FILES_KEY,
Value::WalRecord(NeonWalRecord::AuxFile { file_path, content }),
);
aux_files.n_deltas += 1;
}
aux_files.dir = Some(dir);
} else {
// Check if the AUX_FILES_KEY is initialized
match self.get(AUX_FILES_KEY, ctx).await {
Ok(dir_bytes) => {
let mut dir = AuxFilesDirectory::des(&dir_bytes)?;
// Key is already set, we may append a delta
self.put(
AUX_FILES_KEY,
Value::WalRecord(NeonWalRecord::AuxFile {
file_path: file_path.clone(),
content: content.clone(),
}),
);
dir.upsert(file_path, content);
n_files = dir.files.len();
aux_files.dir = Some(dir);
}
Err(
e @ (PageReconstructError::AncestorStopping(_)
| PageReconstructError::Cancelled
| PageReconstructError::AncestorLsnTimeout(_)),
) => {
// Important that we do not interpret a shutdown error as "not found" and thereby
// reset the map.
return Err(e.into());
}
// Note: we added missing key error variant in https://github.com/neondatabase/neon/pull/7393 but
// the original code assumes all other errors are missing keys. Therefore, we keep the code path
// the same for now, though in theory, we should only match the `MissingKey` variant.
Err(
PageReconstructError::Other(_)
| PageReconstructError::WalRedo(_)
| PageReconstructError::MissingKey { .. },
) => {
// Key is missing, we must insert an image as the basis for subsequent deltas.
let n_files;
let mut aux_files = self.tline.aux_files.lock().await;
if let Some(mut dir) = aux_files.dir.take() {
// We already updated aux files in `self`: emit a delta and update our latest value.
dir.upsert(file_path.clone(), content.clone());
n_files = dir.files.len();
if aux_files.n_deltas == MAX_AUX_FILE_DELTAS {
let mut dir = AuxFilesDirectory {
files: HashMap::new(),
};
dir.upsert(file_path, content);
self.put(
AUX_FILES_KEY,
Value::Image(Bytes::from(
AuxFilesDirectory::ser(&dir).context("serialize")?,
)),
);
aux_files.n_deltas = 0;
} else {
self.put(
AUX_FILES_KEY,
Value::WalRecord(NeonWalRecord::AuxFile { file_path, content }),
);
aux_files.n_deltas += 1;
}
aux_files.dir = Some(dir);
} else {
// Check if the AUX_FILES_KEY is initialized
match self.get(AUX_FILES_KEY, ctx).await {
Ok(dir_bytes) => {
let mut dir = AuxFilesDirectory::des(&dir_bytes)?;
// Key is already set, we may append a delta
self.put(
AUX_FILES_KEY,
Value::WalRecord(NeonWalRecord::AuxFile {
file_path: file_path.clone(),
content: content.clone(),
}),
);
dir.upsert(file_path, content);
n_files = dir.files.len();
aux_files.dir = Some(dir);
}
Err(
e @ (PageReconstructError::AncestorStopping(_)
| PageReconstructError::Cancelled
| PageReconstructError::AncestorLsnTimeout(_)),
) => {
// Important that we do not interpret a shutdown error as "not found" and thereby
// reset the map.
return Err(e.into());
}
// Note: we added missing key error variant in https://github.com/neondatabase/neon/pull/7393 but
// the original code assumes all other errors are missing keys. Therefore, we keep the code path
// the same for now, though in theory, we should only match the `MissingKey` variant.
Err(
PageReconstructError::Other(_)
| PageReconstructError::WalRedo(_)
| PageReconstructError::MissingKey { .. },
) => {
// Key is missing, we must insert an image as the basis for subsequent deltas.
let mut dir = AuxFilesDirectory {
files: HashMap::new(),
};
dir.upsert(file_path, content);
self.put(
AUX_FILES_KEY,
Value::Image(Bytes::from(
AuxFilesDirectory::ser(&dir).context("serialize")?,
)),
);
n_files = 1;
aux_files.dir = Some(dir);
}
n_files = 1;
aux_files.dir = Some(dir);
}
}
self.pending_directory_entries
.push((DirectoryKind::AuxFiles, n_files));
}
self.pending_directory_entries
.push((DirectoryKind::AuxFiles, n_files));
Ok(())
}

View File

@@ -33,6 +33,7 @@ impl Value {
}
}
#[cfg(test)]
#[derive(Debug, PartialEq)]
pub(crate) enum InvalidInput {
TooShortValue,
@@ -41,8 +42,10 @@ pub(crate) enum InvalidInput {
/// We could have a ValueRef where everything is `serde(borrow)`. Before implementing that, lets
/// use this type for querying if a slice looks some particular way.
#[cfg(test)]
pub(crate) struct ValueBytes;
#[cfg(test)]
impl ValueBytes {
pub(crate) fn will_init(raw: &[u8]) -> Result<bool, InvalidInput> {
if raw.len() < 12 {

View File

@@ -319,9 +319,6 @@ pub enum TaskKind {
// Eviction. One per timeline.
Eviction,
// Ingest housekeeping (flushing ephemeral layers on time threshold or disk pressure)
IngestHousekeeping,
/// See [`crate::disk_usage_eviction_task`].
DiskUsageEviction,
@@ -370,8 +367,6 @@ pub enum TaskKind {
#[cfg(test)]
UnitTest,
DetachAncestor,
}
#[derive(Default)]

View File

@@ -322,9 +322,6 @@ pub struct Tenant {
/// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance.
pub(crate) timeline_get_throttle:
Arc<throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>>,
/// An ongoing timeline detach must be checked during attempts to GC or compact a timeline.
ongoing_timeline_detach: std::sync::Mutex<Option<(TimelineId, utils::completion::Barrier)>>,
}
impl std::fmt::Debug for Tenant {
@@ -1679,34 +1676,6 @@ impl Tenant {
Ok(())
}
// Call through to all timelines to freeze ephemeral layers if needed. Usually
// this happens during ingest: this background housekeeping is for freezing layers
// that are open but haven't been written to for some time.
async fn ingest_housekeeping(&self) {
// Scan through the hashmap and collect a list of all the timelines,
// while holding the lock. Then drop the lock and actually perform the
// compactions. We don't want to block everything else while the
// compaction runs.
let timelines = {
self.timelines
.lock()
.unwrap()
.values()
.filter_map(|timeline| {
if timeline.is_active() {
Some(timeline.clone())
} else {
None
}
})
.collect::<Vec<_>>()
};
for timeline in &timelines {
timeline.maybe_freeze_ephemeral_layer().await;
}
}
pub fn current_state(&self) -> TenantState {
self.state.borrow().clone()
}
@@ -2560,7 +2529,6 @@ impl Tenant {
&crate::metrics::tenant_throttling::TIMELINE_GET,
)),
tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
ongoing_timeline_detach: std::sync::Mutex::default(),
}
}
@@ -3758,7 +3726,7 @@ pub(crate) mod harness {
image_layer_creation_check_threshold: Some(
tenant_conf.image_layer_creation_check_threshold,
),
switch_aux_file_policy: Some(tenant_conf.switch_aux_file_policy),
switch_to_aux_file_v2: Some(tenant_conf.switch_to_aux_file_v2),
}
}
}

View File

@@ -9,7 +9,6 @@
//! may lead to a data loss.
//!
use anyhow::bail;
use pageserver_api::models::AuxFilePolicy;
use pageserver_api::models::CompactionAlgorithm;
use pageserver_api::models::EvictionPolicy;
use pageserver_api::models::{self, ThrottleConfig};
@@ -371,9 +370,9 @@ pub struct TenantConf {
// Expresed in multiples of checkpoint distance.
pub image_layer_creation_check_threshold: u8,
/// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into
/// Switch to aux file v2. Switching this flag requires the user has not written any aux file into
/// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions.
pub switch_aux_file_policy: AuxFilePolicy,
pub switch_to_aux_file_v2: bool,
}
/// Same as TenantConf, but this struct preserves the information about
@@ -472,7 +471,7 @@ pub struct TenantConfOpt {
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub switch_aux_file_policy: Option<AuxFilePolicy>,
pub switch_to_aux_file_v2: Option<bool>,
}
impl TenantConfOpt {
@@ -530,9 +529,9 @@ impl TenantConfOpt {
image_layer_creation_check_threshold: self
.image_layer_creation_check_threshold
.unwrap_or(global_conf.image_layer_creation_check_threshold),
switch_aux_file_policy: self
.switch_aux_file_policy
.unwrap_or(global_conf.switch_aux_file_policy),
switch_to_aux_file_v2: self
.switch_to_aux_file_v2
.unwrap_or(global_conf.switch_to_aux_file_v2),
}
}
}
@@ -574,7 +573,7 @@ impl Default for TenantConf {
lazy_slru_download: false,
timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
switch_aux_file_policy: AuxFilePolicy::V1,
switch_to_aux_file_v2: false,
}
}
}
@@ -649,7 +648,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
lazy_slru_download: value.lazy_slru_download,
timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
switch_aux_file_policy: value.switch_aux_file_policy,
switch_to_aux_file_v2: value.switch_to_aux_file_v2,
}
}
}

View File

@@ -585,20 +585,9 @@ impl DeleteTenantFlow {
// FIXME: we should not be modifying this from outside of mgr.rs.
// This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)
// Update stats
match &removed {
TenantsMapRemoveResult::Occupied(slot) => {
crate::metrics::TENANT_MANAGER.slot_removed(slot);
}
TenantsMapRemoveResult::InProgress(barrier) => {
crate::metrics::TENANT_MANAGER
.slot_removed(&TenantSlot::InProgress(barrier.clone()));
}
TenantsMapRemoveResult::Vacant => {
// Nothing changed in map, no metric update
}
}
crate::metrics::TENANT_MANAGER
.tenant_slots
.set(locked.len() as u64);
match removed {
TenantsMapRemoveResult::Occupied(TenantSlot::Attached(tenant)) => {

View File

@@ -207,24 +207,6 @@ impl TimelineMetadata {
self.body.ancestor_lsn
}
/// When reparenting, the `ancestor_lsn` does not change.
pub fn reparent(&mut self, timeline: &TimelineId) {
assert!(self.body.ancestor_timeline.is_some());
// no assertion for redoing this: it's fine, we may have to repeat this multiple times over
self.body.ancestor_timeline = Some(*timeline);
}
pub fn detach_from_ancestor(&mut self, branchpoint: &(TimelineId, Lsn)) {
if let Some(ancestor) = self.body.ancestor_timeline {
assert_eq!(ancestor, branchpoint.0);
}
if self.body.ancestor_lsn != Lsn(0) {
assert_eq!(self.body.ancestor_lsn, branchpoint.1);
}
self.body.ancestor_timeline = None;
self.body.ancestor_lsn = Lsn(0);
}
pub fn latest_gc_cutoff_lsn(&self) -> Lsn {
self.body.latest_gc_cutoff_lsn
}

View File

@@ -56,7 +56,6 @@ use utils::id::{TenantId, TimelineId};
use super::delete::DeleteTenantError;
use super::secondary::SecondaryTenant;
use super::timeline::detach_ancestor::PreparedTimelineDetach;
use super::TenantSharedResources;
/// For a tenant that appears in TenantsMap, it may either be
@@ -247,7 +246,6 @@ impl TenantsMap {
}
}
#[cfg(all(debug_assertions, not(test)))]
pub(crate) fn len(&self) -> usize {
match self {
TenantsMap::Initializing => 0,
@@ -748,7 +746,6 @@ pub async fn init_tenant_mgr(
}
};
METRICS.slot_inserted(&slot);
tenants.insert(tenant_shard_id, slot);
}
@@ -756,7 +753,7 @@ pub async fn init_tenant_mgr(
let mut tenants_map = TENANTS.write().unwrap();
assert!(matches!(&*tenants_map, &TenantsMap::Initializing));
METRICS.tenant_slots.set(tenants.len() as u64);
*tenants_map = TenantsMap::Open(tenants);
Ok(TenantManager {
@@ -827,14 +824,6 @@ fn tenant_spawn(
async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
let mut join_set = JoinSet::new();
#[cfg(all(debug_assertions, not(test)))]
{
// Check that our metrics properly tracked the size of the tenants map. This is a convenient location to check,
// as it happens implicitly at the end of tests etc.
let m = tenants.read().unwrap();
debug_assert_eq!(METRICS.slots_total(), m.len() as u64);
}
// Atomically, 1. create the shutdown tasks and 2. prevent creation of new tenants.
let (total_in_progress, total_attached) = {
let mut m = tenants.write().unwrap();
@@ -2008,101 +1997,6 @@ impl TenantManager {
})
.collect())
}
/// Completes an earlier prepared timeline detach ancestor.
pub(crate) async fn complete_detaching_timeline_ancestor(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
prepared: PreparedTimelineDetach,
ctx: &RequestContext,
) -> Result<Vec<TimelineId>, anyhow::Error> {
struct RevertOnDropSlot(Option<SlotGuard>);
impl Drop for RevertOnDropSlot {
fn drop(&mut self) {
if let Some(taken) = self.0.take() {
taken.revert();
}
}
}
impl RevertOnDropSlot {
fn into_inner(mut self) -> SlotGuard {
self.0.take().unwrap()
}
}
impl std::ops::Deref for RevertOnDropSlot {
type Target = SlotGuard;
fn deref(&self) -> &Self::Target {
self.0.as_ref().unwrap()
}
}
let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
let slot_guard = RevertOnDropSlot(Some(slot_guard));
let tenant = {
let Some(old_slot) = slot_guard.get_old_value() else {
anyhow::bail!(
"Tenant not found when trying to complete detaching timeline ancestor"
);
};
let Some(tenant) = old_slot.get_attached() else {
anyhow::bail!("Tenant is not in attached state");
};
if !tenant.is_active() {
anyhow::bail!("Tenant is not active");
}
tenant.clone()
};
let timeline = tenant.get_timeline(timeline_id, true)?;
let reparented = timeline
.complete_detaching_timeline_ancestor(&tenant, prepared, ctx)
.await?;
let mut slot_guard = slot_guard.into_inner();
let (_guard, progress) = utils::completion::channel();
match tenant.shutdown(progress, ShutdownMode::Hard).await {
Ok(()) => {
slot_guard.drop_old_value()?;
}
Err(_barrier) => {
slot_guard.revert();
// this really should not happen, at all, unless shutdown was already going?
anyhow::bail!("Cannot restart Tenant, already shutting down");
}
}
let tenant_path = self.conf.tenant_path(&tenant_shard_id);
let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;
let shard_identity = config.shard;
let tenant = tenant_spawn(
self.conf,
tenant_shard_id,
&tenant_path,
self.resources.clone(),
AttachedTenantConf::try_from(config)?,
shard_identity,
None,
self.tenants,
SpawnMode::Eager,
ctx,
)?;
slot_guard.upsert(TenantSlot::Attached(tenant))?;
Ok(reparented)
}
}
#[derive(Debug, thiserror::Error)]
@@ -2534,13 +2428,10 @@ impl SlotGuard {
TenantsMap::Open(m) => m,
};
METRICS.slot_inserted(&new_value);
let replaced = m.insert(self.tenant_shard_id, new_value);
self.upserted = true;
if let Some(replaced) = replaced.as_ref() {
METRICS.slot_removed(replaced);
}
METRICS.tenant_slots.set(m.len() as u64);
replaced
};
@@ -2650,13 +2541,9 @@ impl Drop for SlotGuard {
}
if self.old_value_is_shutdown() {
METRICS.slot_removed(entry.get());
entry.remove();
} else {
let inserting = self.old_value.take().unwrap();
METRICS.slot_inserted(&inserting);
let replaced = entry.insert(inserting);
METRICS.slot_removed(&replaced);
entry.insert(self.old_value.take().unwrap());
}
}
Entry::Vacant(_) => {
@@ -2667,6 +2554,8 @@ impl Drop for SlotGuard {
);
}
}
METRICS.tenant_slots.set(m.len() as u64);
}
}
@@ -2746,9 +2635,7 @@ fn tenant_map_acquire_slot_impl(
}
_ => {
let (completion, barrier) = utils::completion::channel();
let inserting = TenantSlot::InProgress(barrier);
METRICS.slot_inserted(&inserting);
v.insert(inserting);
v.insert(TenantSlot::InProgress(barrier));
tracing::debug!("Vacant, inserted InProgress");
Ok(SlotGuard::new(*tenant_shard_id, None, completion))
}
@@ -2784,10 +2671,7 @@ fn tenant_map_acquire_slot_impl(
_ => {
// Happy case: the slot was not in any state that violated our mode
let (completion, barrier) = utils::completion::channel();
let in_progress = TenantSlot::InProgress(barrier);
METRICS.slot_inserted(&in_progress);
let old_value = o.insert(in_progress);
METRICS.slot_removed(&old_value);
let old_value = o.insert(TenantSlot::InProgress(barrier));
tracing::debug!("Occupied, replaced with InProgress");
Ok(SlotGuard::new(
*tenant_shard_id,

View File

@@ -240,7 +240,7 @@ use utils::id::{TenantId, TimelineId};
use self::index::IndexPart;
use super::metadata::MetadataUpdate;
use super::storage_layer::{Layer, LayerName, ResidentLayer};
use super::storage_layer::{Layer, LayerFileName, ResidentLayer};
use super::upload_queue::SetDeletedFlagProgress;
use super::Generation;
@@ -437,19 +437,6 @@ impl RemoteTimelineClient {
}
}
/// Returns true if this timeline was previously detached at this Lsn and the remote timeline
/// client is currently initialized.
pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool {
// technically this is a dirty read, but given how timeline detach ancestor is implemented
// via tenant restart, the lineage has always been uploaded.
self.upload_queue
.lock()
.unwrap()
.initialized_mut()
.map(|uq| uq.latest_lineage.is_previous_ancestor_lsn(lsn))
.unwrap_or(false)
}
fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) {
let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part {
current_remote_index_part
@@ -516,7 +503,7 @@ impl RemoteTimelineClient {
/// On success, returns the size of the downloaded file.
pub async fn download_layer_file(
&self,
layer_file_name: &LayerName,
layer_file_name: &LayerFileName,
layer_metadata: &LayerFileMetadata,
cancel: &CancellationToken,
ctx: &RequestContext,
@@ -583,7 +570,7 @@ impl RemoteTimelineClient {
// ahead of what's _actually_ on the remote during index upload.
upload_queue.latest_metadata = metadata.clone();
self.schedule_index_upload(upload_queue);
self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());
Ok(())
}
@@ -604,7 +591,7 @@ impl RemoteTimelineClient {
upload_queue.latest_metadata.apply(update);
self.schedule_index_upload(upload_queue);
self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());
Ok(())
}
@@ -624,14 +611,18 @@ impl RemoteTimelineClient {
let upload_queue = guard.initialized_mut()?;
if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
self.schedule_index_upload(upload_queue);
self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());
}
Ok(())
}
/// Launch an index-file upload operation in the background (internal function)
fn schedule_index_upload(self: &Arc<Self>, upload_queue: &mut UploadQueueInitialized) {
fn schedule_index_upload(
self: &Arc<Self>,
upload_queue: &mut UploadQueueInitialized,
metadata: TimelineMetadata,
) {
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
info!(
@@ -640,8 +631,12 @@ impl RemoteTimelineClient {
upload_queue.latest_files_changes_since_metadata_upload_scheduled,
);
let index_part = IndexPart::from(&*upload_queue);
let op = UploadOp::UploadMetadata(Box::new(index_part), disk_consistent_lsn);
let index_part = IndexPart::new(
upload_queue.latest_files.clone(),
disk_consistent_lsn,
metadata,
);
let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
self.metric_begin(&op);
upload_queue.queued_operations.push_back(op);
upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0;
@@ -650,67 +645,9 @@ impl RemoteTimelineClient {
self.launch_queued_tasks(upload_queue);
}
pub(crate) async fn schedule_reparenting_and_wait(
self: &Arc<Self>,
new_parent: &TimelineId,
) -> anyhow::Result<()> {
// FIXME: because of how Timeline::schedule_uploads works when called from layer flushing
// and reads the in-memory part we cannot do the detaching like this
let receiver = {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
let Some(prev) = upload_queue.latest_metadata.ancestor_timeline() else {
return Err(anyhow::anyhow!(
"cannot reparent without a current ancestor"
));
};
upload_queue.latest_metadata.reparent(new_parent);
upload_queue.latest_lineage.record_previous_ancestor(&prev);
self.schedule_index_upload(upload_queue);
self.schedule_barrier0(upload_queue)
};
Self::wait_completion0(receiver).await
}
/// Schedules uploading a new version of `index_part.json` with the given layers added,
/// detaching from ancestor and waits for it to complete.
///
/// This is used with `Timeline::detach_ancestor` functionality.
pub(crate) async fn schedule_adding_existing_layers_to_index_detach_and_wait(
self: &Arc<Self>,
layers: &[Layer],
adopted: (TimelineId, Lsn),
) -> anyhow::Result<()> {
let barrier = {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
upload_queue.latest_metadata.detach_from_ancestor(&adopted);
upload_queue.latest_lineage.record_detaching(&adopted);
for layer in layers {
upload_queue
.latest_files
.insert(layer.layer_desc().layer_name(), layer.metadata());
}
self.schedule_index_upload(upload_queue);
let barrier = self.schedule_barrier0(upload_queue);
self.launch_queued_tasks(upload_queue);
barrier
};
Self::wait_completion0(barrier).await
}
/// Launch an upload operation in the background; the file is added to be included in next
/// `index_part.json` upload.
/// Launch an upload operation in the background.
///
pub(crate) fn schedule_layer_file_upload(
self: &Arc<Self>,
layer: ResidentLayer,
@@ -732,15 +669,13 @@ impl RemoteTimelineClient {
upload_queue
.latest_files
.insert(layer.layer_desc().layer_name(), metadata.clone());
.insert(layer.layer_desc().filename(), metadata.clone());
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
info!(
gen=?metadata.generation,
shard=?metadata.shard,
"scheduled layer file upload {layer}",
"scheduled layer file upload {layer} gen={:?} shard={:?}",
metadata.generation, metadata.shard
);
let op = UploadOp::UploadLayer(layer, metadata);
self.metric_begin(&op);
upload_queue.queued_operations.push_back(op);
@@ -756,7 +691,7 @@ impl RemoteTimelineClient {
/// successfully.
pub fn schedule_layer_file_deletion(
self: &Arc<Self>,
names: &[LayerName],
names: &[LayerFileName],
) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
@@ -784,7 +719,7 @@ impl RemoteTimelineClient {
// the layer files as "dangling". this is fine, at worst case we create work for the
// scrubber.
let names = gc_layers.iter().map(|x| x.layer_desc().layer_name());
let names = gc_layers.iter().map(|x| x.layer_desc().filename());
self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);
@@ -799,10 +734,14 @@ impl RemoteTimelineClient {
self: &Arc<Self>,
upload_queue: &mut UploadQueueInitialized,
names: I,
) -> Vec<(LayerName, LayerFileMetadata)>
) -> Vec<(LayerFileName, LayerFileMetadata)>
where
I: IntoIterator<Item = LayerName>,
I: IntoIterator<Item = LayerFileName>,
{
// Deleting layers doesn't affect the values stored in TimelineMetadata,
// so we don't need update it. Just serialize it.
let metadata = upload_queue.latest_metadata.clone();
// Decorate our list of names with each name's metadata, dropping
// names that are unexpectedly missing from our metadata. This metadata
// is later used when physically deleting layers, to construct key paths.
@@ -841,7 +780,7 @@ impl RemoteTimelineClient {
// index_part update, because that needs to be uploaded before we can actually delete the
// files.
if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
self.schedule_index_upload(upload_queue);
self.schedule_index_upload(upload_queue, metadata);
}
with_metadata
@@ -851,7 +790,7 @@ impl RemoteTimelineClient {
/// `index_part.json` with [`Self::schedule_gc_update`] or [`Self::schedule_compaction_update`].
pub(crate) fn schedule_deletion_of_unlinked(
self: &Arc<Self>,
layers: Vec<(LayerName, LayerFileMetadata)>,
layers: Vec<(LayerFileName, LayerFileMetadata)>,
) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
@@ -864,7 +803,7 @@ impl RemoteTimelineClient {
fn schedule_deletion_of_unlinked0(
self: &Arc<Self>,
upload_queue: &mut UploadQueueInitialized,
mut with_metadata: Vec<(LayerName, LayerFileMetadata)>,
mut with_metadata: Vec<(LayerFileName, LayerFileMetadata)>,
) {
// Filter out any layers which were not created by this tenant shard. These are
// layers that originate from some ancestor shard after a split, and may still
@@ -933,7 +872,7 @@ impl RemoteTimelineClient {
self.schedule_layer_file_upload0(upload_queue, layer.clone());
}
let names = compacted_from.iter().map(|x| x.layer_desc().layer_name());
let names = compacted_from.iter().map(|x| x.layer_desc().filename());
self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);
self.launch_queued_tasks(upload_queue);
@@ -943,18 +882,12 @@ impl RemoteTimelineClient {
/// Wait for all previously scheduled uploads/deletions to complete
pub(crate) async fn wait_completion(self: &Arc<Self>) -> anyhow::Result<()> {
let receiver = {
let mut receiver = {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
self.schedule_barrier0(upload_queue)
};
Self::wait_completion0(receiver).await
}
async fn wait_completion0(
mut receiver: tokio::sync::watch::Receiver<()>,
) -> anyhow::Result<()> {
if receiver.changed().await.is_err() {
anyhow::bail!("wait_completion aborted because upload queue was stopped");
}
@@ -1070,7 +1003,8 @@ impl RemoteTimelineClient {
let deleted_at = Utc::now().naive_utc();
stopped.deleted_at = SetDeletedFlagProgress::InProgress(deleted_at);
let mut index_part = IndexPart::from(&stopped.upload_queue_for_deletion);
let mut index_part = IndexPart::try_from(&stopped.upload_queue_for_deletion)
.context("IndexPart serialize")?;
index_part.deleted_at = Some(deleted_at);
index_part
};
@@ -1151,93 +1085,6 @@ impl RemoteTimelineClient {
Ok(())
}
/// Uploads the given layer **without** adding it to be part of a future `index_part.json` upload.
///
/// This is not normally needed.
pub(crate) async fn upload_layer_file(
self: &Arc<Self>,
uploaded: &ResidentLayer,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let remote_path = remote_layer_path(
&self.tenant_shard_id.tenant_id,
&self.timeline_id,
self.tenant_shard_id.to_index(),
&uploaded.layer_desc().layer_name(),
uploaded.metadata().generation,
);
backoff::retry(
|| async {
upload::upload_timeline_layer(
&self.storage_impl,
uploaded.local_path(),
&remote_path,
uploaded.metadata().file_size(),
cancel,
)
.await
},
TimeoutOrCancel::caused_by_cancel,
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"upload a layer without adding it to latest files",
cancel,
)
.await
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
.and_then(|x| x)
.context("upload a layer without adding it to latest files")
}
/// Copies the `adopted` remote existing layer to the remote path of `adopted_as`. The layer is
/// not added to be part of a future `index_part.json` upload.
pub(crate) async fn copy_timeline_layer(
self: &Arc<Self>,
adopted: &Layer,
adopted_as: &Layer,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let source_remote_path = remote_layer_path(
&self.tenant_shard_id.tenant_id,
&adopted
.get_timeline_id()
.expect("Source timeline should be alive"),
self.tenant_shard_id.to_index(),
&adopted.layer_desc().layer_name(),
adopted.metadata().generation,
);
let target_remote_path = remote_layer_path(
&self.tenant_shard_id.tenant_id,
&self.timeline_id,
self.tenant_shard_id.to_index(),
&adopted_as.layer_desc().layer_name(),
adopted_as.metadata().generation,
);
backoff::retry(
|| async {
upload::copy_timeline_layer(
&self.storage_impl,
&source_remote_path,
&target_remote_path,
cancel,
)
.await
},
TimeoutOrCancel::caused_by_cancel,
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"copy timeline layer",
cancel,
)
.await
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
.and_then(|x| x)
.context("remote copy timeline layer")
}
async fn flush_deletion_queue(&self) -> Result<(), DeletionQueueError> {
match tokio::time::timeout(
DELETION_QUEUE_FLUSH_TIMEOUT,
@@ -1409,7 +1256,7 @@ impl RemoteTimelineClient {
while let Some(next_op) = upload_queue.queued_operations.front() {
// Can we run this task now?
let can_run_now = match next_op {
UploadOp::UploadLayer(..) => {
UploadOp::UploadLayer(_, _) => {
// Can always be scheduled.
true
}
@@ -1536,25 +1383,13 @@ impl RemoteTimelineClient {
let upload_result: anyhow::Result<()> = match &task.op {
UploadOp::UploadLayer(ref layer, ref layer_metadata) => {
let local_path = layer.local_path();
// We should only be uploading layers created by this `Tenant`'s lifetime, so
// the metadata in the upload should always match our current generation.
assert_eq!(layer_metadata.generation, self.generation);
let remote_path = remote_layer_path(
&self.tenant_shard_id.tenant_id,
&self.timeline_id,
layer_metadata.shard,
&layer.layer_desc().layer_name(),
layer_metadata.generation,
);
let path = layer.local_path();
upload::upload_timeline_layer(
self.conf,
&self.storage_impl,
local_path,
&remote_path,
layer_metadata.file_size(),
path,
layer_metadata,
self.generation,
&self.cancel,
)
.measure_remote_op(
@@ -1830,7 +1665,6 @@ impl RemoteTimelineClient {
latest_files: initialized.latest_files.clone(),
latest_files_changes_since_metadata_upload_scheduled: 0,
latest_metadata: initialized.latest_metadata.clone(),
latest_lineage: initialized.latest_lineage.clone(),
projected_remote_consistent_lsn: None,
visible_remote_consistent_lsn: initialized
.visible_remote_consistent_lsn
@@ -1916,14 +1750,14 @@ pub fn remote_layer_path(
tenant_id: &TenantId,
timeline_id: &TimelineId,
shard: ShardIndex,
layer_file_name: &LayerName,
layer_file_name: &LayerFileName,
generation: Generation,
) -> RemotePath {
// Generation-aware key format
let path = format!(
"tenants/{tenant_id}{0}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{1}{2}",
shard.get_suffix(),
layer_file_name,
layer_file_name.file_name(),
generation.get_suffix()
);
@@ -1984,6 +1818,29 @@ pub fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
}
}
/// Files on the remote storage are stored with paths, relative to the workdir.
/// That path includes in itself both tenant and timeline ids, allowing to have a unique remote storage path.
///
/// Errors if the path provided does not start from pageserver's workdir.
pub fn remote_path(
conf: &PageServerConf,
local_path: &Utf8Path,
generation: Generation,
) -> anyhow::Result<RemotePath> {
let stripped = local_path
.strip_prefix(&conf.workdir)
.context("Failed to strip workdir prefix")?;
let suffixed = format!("{0}{1}", stripped, generation.get_suffix());
RemotePath::new(Utf8Path::new(&suffixed)).with_context(|| {
format!(
"to resolve remote part of path {:?} for base {:?}",
local_path, conf.workdir
)
})
}
#[cfg(test)]
mod tests {
use super::*;
@@ -1991,7 +1848,6 @@ mod tests {
context::RequestContext,
tenant::{
harness::{TenantHarness, TIMELINE_ID},
storage_layer::layer::local_layer_path,
Tenant, Timeline,
},
DEFAULT_PG_VERSION,
@@ -2020,8 +1876,8 @@ mod tests {
TimelineMetadata::from_bytes(&metadata.to_bytes().unwrap()).unwrap()
}
fn assert_file_list(a: &HashSet<LayerName>, b: &[&str]) {
let mut avec: Vec<String> = a.iter().map(|x| x.to_string()).collect();
fn assert_file_list(a: &HashSet<LayerFileName>, b: &[&str]) {
let mut avec: Vec<String> = a.iter().map(|x| x.file_name()).collect();
avec.sort();
let mut bvec = b.to_vec();
@@ -2147,7 +2003,7 @@ mod tests {
.layer_metadata
.keys()
.map(|f| f.to_owned())
.collect::<HashSet<LayerName>>();
.collect::<HashSet<LayerFileName>>();
let initial_layer = {
assert!(initial_layers.len() == 1);
initial_layers.into_iter().next().unwrap()
@@ -2173,21 +2029,12 @@ mod tests {
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap(), dummy_contents("baz"))
]
.into_iter()
.map(|(name, contents): (LayerName, Vec<u8>)| {
let local_path = local_layer_path(
harness.conf,
&timeline.tenant_shard_id,
&timeline.timeline_id,
&name,
&generation,
);
std::fs::write(&local_path, &contents).unwrap();
.map(|(name, contents): (LayerFileName, Vec<u8>)| {
std::fs::write(timeline_path.join(name.file_name()), &contents).unwrap();
Layer::for_resident(
harness.conf,
&timeline,
local_path,
name,
LayerFileMetadata::new(contents.len() as u64, generation, shard),
)
@@ -2254,9 +2101,9 @@ mod tests {
.map(|f| f.to_owned())
.collect(),
&[
&initial_layer.to_string(),
&layers[0].layer_desc().layer_name().to_string(),
&layers[1].layer_desc().layer_name().to_string(),
&initial_layer.file_name(),
&layers[0].layer_desc().filename().file_name(),
&layers[1].layer_desc().filename().file_name(),
],
);
assert_eq!(index_part.metadata, metadata);
@@ -2270,7 +2117,7 @@ mod tests {
// keep using schedule_layer_file_deletion because we don't have a way to wait for the
// spawn_blocking started by the drop.
client
.schedule_layer_file_deletion(&[layers[0].layer_desc().layer_name()])
.schedule_layer_file_deletion(&[layers[0].layer_desc().filename()])
.unwrap();
{
let mut guard = client.upload_queue.lock().unwrap();
@@ -2288,9 +2135,9 @@ mod tests {
}
assert_remote_files(
&[
&initial_layer.to_string(),
&layers[0].layer_desc().layer_name().to_string(),
&layers[1].layer_desc().layer_name().to_string(),
&initial_layer.file_name(),
&layers[0].layer_desc().filename().file_name(),
&layers[1].layer_desc().filename().file_name(),
"index_part.json",
],
&remote_timeline_dir,
@@ -2303,9 +2150,9 @@ mod tests {
assert_remote_files(
&[
&initial_layer.to_string(),
&layers[1].layer_desc().layer_name().to_string(),
&layers[2].layer_desc().layer_name().to_string(),
&initial_layer.file_name(),
&layers[1].layer_desc().filename().file_name(),
&layers[2].layer_desc().filename().file_name(),
"index_part.json",
],
&remote_timeline_dir,
@@ -2324,22 +2171,19 @@ mod tests {
..
} = TestSetup::new("metrics").await.unwrap();
let client = timeline.remote_client.as_ref().unwrap();
let timeline_path = harness.timeline_path(&TIMELINE_ID);
let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
let local_path = local_layer_path(
harness.conf,
&timeline.tenant_shard_id,
&timeline.timeline_id,
&layer_file_name_1,
&harness.generation,
);
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
let content_1 = dummy_contents("foo");
std::fs::write(&local_path, &content_1).unwrap();
std::fs::write(
timeline_path.join(layer_file_name_1.file_name()),
&content_1,
)
.unwrap();
let layer_file_1 = Layer::for_resident(
harness.conf,
&timeline,
local_path,
layer_file_name_1.clone(),
LayerFileMetadata::new(content_1.len() as u64, harness.generation, harness.shard),
);
@@ -2408,7 +2252,12 @@ mod tests {
async fn inject_index_part(test_state: &TestSetup, generation: Generation) -> IndexPart {
// An empty IndexPart, just sufficient to ensure deserialization will succeed
let example_index_part = IndexPart::example();
let example_metadata = TimelineMetadata::example();
let example_index_part = IndexPart::new(
HashMap::new(),
example_metadata.disk_consistent_lsn(),
example_metadata,
);
let index_part_bytes = serde_json::to_vec(&example_index_part).unwrap();

View File

@@ -21,8 +21,7 @@ use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
use crate::tenant::storage_layer::layer::local_layer_path;
use crate::tenant::storage_layer::LayerName;
use crate::tenant::storage_layer::LayerFileName;
use crate::tenant::Generation;
use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
use crate::TEMP_FILE_SUFFIX;
@@ -48,7 +47,7 @@ pub async fn download_layer_file<'a>(
storage: &'a GenericRemoteStorage,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
layer_file_name: &'a LayerName,
layer_file_name: &'a LayerFileName,
layer_metadata: &'a LayerFileMetadata,
cancel: &CancellationToken,
ctx: &RequestContext,
@@ -56,13 +55,7 @@ pub async fn download_layer_file<'a>(
debug_assert_current_span_has_tenant_and_timeline_id();
let timeline_path = conf.timeline_path(&tenant_shard_id, &timeline_id);
let local_path = local_layer_path(
conf,
&tenant_shard_id,
&timeline_id,
layer_file_name,
&layer_metadata.generation,
);
let local_path = timeline_path.join(layer_file_name.file_name());
let remote_path = remote_layer_path(
&tenant_shard_id.tenant_id,

View File

@@ -6,10 +6,10 @@ use std::collections::HashMap;
use chrono::NaiveDateTime;
use serde::{Deserialize, Serialize};
use utils::id::TimelineId;
use utils::bin_ser::SerializeError;
use crate::tenant::metadata::TimelineMetadata;
use crate::tenant::storage_layer::LayerName;
use crate::tenant::storage_layer::LayerFileName;
use crate::tenant::upload_queue::UploadQueueInitialized;
use crate::tenant::Generation;
use pageserver_api::shard::ShardIndex;
@@ -76,7 +76,7 @@ pub struct IndexPart {
///
/// Older versions of `IndexPart` will not have this property or have only a part of metadata
/// that latest version stores.
pub layer_metadata: HashMap<LayerName, IndexLayerMetadata>,
pub layer_metadata: HashMap<LayerFileName, IndexLayerMetadata>,
// 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
// It's duplicated for convenience when reading the serialized structure, but is
@@ -85,9 +85,6 @@ pub struct IndexPart {
#[serde(rename = "metadata_bytes")]
pub metadata: TimelineMetadata,
#[serde(default)]
pub(crate) lineage: Lineage,
}
impl IndexPart {
@@ -100,23 +97,22 @@ impl IndexPart {
/// - 3: no longer deserialize `timeline_layers` (serialized format is the same, but timeline_layers
/// is always generated from the keys of `layer_metadata`)
/// - 4: timeline_layers is fully removed.
/// - 5: lineage was added
const LATEST_VERSION: usize = 5;
const LATEST_VERSION: usize = 4;
// Versions we may see when reading from a bucket.
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5];
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4];
pub const FILE_NAME: &'static str = "index_part.json";
fn new(
layers_and_metadata: &HashMap<LayerName, LayerFileMetadata>,
pub fn new(
layers_and_metadata: HashMap<LayerFileName, LayerFileMetadata>,
disk_consistent_lsn: Lsn,
metadata: TimelineMetadata,
lineage: Lineage,
) -> Self {
// Transform LayerFileMetadata into IndexLayerMetadata
let layer_metadata = layers_and_metadata
.iter()
.map(|(k, v)| (k.to_owned(), IndexLayerMetadata::from(v)))
.into_iter()
.map(|(k, v)| (k, IndexLayerMetadata::from(v)))
.collect();
Self {
@@ -125,7 +121,6 @@ impl IndexPart {
disk_consistent_lsn,
metadata,
deleted_at: None,
lineage,
}
}
@@ -146,26 +141,20 @@ impl IndexPart {
pub fn to_s3_bytes(&self) -> serde_json::Result<Vec<u8>> {
serde_json::to_vec(self)
}
#[cfg(test)]
pub(crate) fn example() -> Self {
let example_metadata = TimelineMetadata::example();
Self::new(
&HashMap::new(),
example_metadata.disk_consistent_lsn(),
example_metadata,
Default::default(),
)
}
}
impl From<&UploadQueueInitialized> for IndexPart {
fn from(uq: &UploadQueueInitialized) -> Self {
let disk_consistent_lsn = uq.latest_metadata.disk_consistent_lsn();
let metadata = uq.latest_metadata.clone();
let lineage = uq.latest_lineage.clone();
impl TryFrom<&UploadQueueInitialized> for IndexPart {
type Error = SerializeError;
Self::new(&uq.latest_files, disk_consistent_lsn, metadata, lineage)
fn try_from(upload_queue: &UploadQueueInitialized) -> Result<Self, Self::Error> {
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
let metadata = upload_queue.latest_metadata.clone();
Ok(Self::new(
upload_queue.latest_files.clone(),
disk_consistent_lsn,
metadata,
))
}
}
@@ -183,8 +172,8 @@ pub struct IndexLayerMetadata {
pub shard: ShardIndex,
}
impl From<&LayerFileMetadata> for IndexLayerMetadata {
fn from(other: &LayerFileMetadata) -> Self {
impl From<LayerFileMetadata> for IndexLayerMetadata {
fn from(other: LayerFileMetadata) -> Self {
IndexLayerMetadata {
file_size: other.file_size,
generation: other.generation,
@@ -193,76 +182,8 @@ impl From<&LayerFileMetadata> for IndexLayerMetadata {
}
}
/// Limited history of earlier ancestors.
///
/// A timeline can have more than 1 earlier ancestor, in the rare case that it was repeatedly
/// reparented by having an later timeline be detached from it's ancestor.
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)]
pub(crate) struct Lineage {
/// Has the `reparenting_history` been truncated to [`Lineage::REMEMBER_AT_MOST`].
#[serde(skip_serializing_if = "is_false", default)]
reparenting_history_truncated: bool,
/// Earlier ancestors, truncated when [`Self::reparenting_history_truncated`]
///
/// These are stored in case we want to support WAL based DR on the timeline. There can be many
/// of these and at most one [`Self::original_ancestor`]. There cannot be more reparentings
/// after [`Self::original_ancestor`] has been set.
#[serde(skip_serializing_if = "Vec::is_empty", default)]
reparenting_history: Vec<TimelineId>,
/// The ancestor from which this timeline has been detached from and when.
///
/// If you are adding support for detaching from a hierarchy, consider changing the ancestry
/// into a `Vec<(TimelineId, Lsn)>` to be a path instead.
#[serde(skip_serializing_if = "Option::is_none", default)]
original_ancestor: Option<(TimelineId, Lsn, NaiveDateTime)>,
}
fn is_false(b: &bool) -> bool {
!b
}
impl Lineage {
const REMEMBER_AT_MOST: usize = 100;
pub(crate) fn record_previous_ancestor(&mut self, old_ancestor: &TimelineId) {
if self.reparenting_history.last() == Some(old_ancestor) {
// do not re-record it
return;
}
let drop_oldest = self.reparenting_history.len() + 1 >= Self::REMEMBER_AT_MOST;
self.reparenting_history_truncated |= drop_oldest;
if drop_oldest {
self.reparenting_history.remove(0);
}
self.reparenting_history.push(*old_ancestor);
}
pub(crate) fn record_detaching(&mut self, branchpoint: &(TimelineId, Lsn)) {
assert!(self.original_ancestor.is_none());
self.original_ancestor =
Some((branchpoint.0, branchpoint.1, chrono::Utc::now().naive_utc()));
}
/// The queried lsn is most likely the basebackup lsn, and this answers question "is it allowed
/// to start a read/write primary at this lsn".
///
/// Returns true if the Lsn was previously a branch point.
pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool {
self.original_ancestor
.as_ref()
.is_some_and(|(_, ancestor_lsn, _)| lsn == *ancestor_lsn)
}
}
#[cfg(test)]
mod tests {
use std::str::FromStr;
use super::*;
#[test]
@@ -298,7 +219,6 @@ mod tests {
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
deleted_at: None,
lineage: Lineage::default(),
};
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
@@ -339,7 +259,6 @@ mod tests {
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
deleted_at: None,
lineage: Lineage::default(),
};
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
@@ -381,8 +300,7 @@ mod tests {
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
lineage: Lineage::default(),
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
};
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
@@ -427,7 +345,6 @@ mod tests {
])
.unwrap(),
deleted_at: None,
lineage: Lineage::default(),
};
let empty_layers_parsed = IndexPart::from_s3_bytes(empty_layers_json.as_bytes()).unwrap();
@@ -466,58 +383,11 @@ mod tests {
]),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
lineage: Lineage::default(),
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
};
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
assert_eq!(part, expected);
}
#[test]
fn v5_indexpart_is_parsed() {
let example = r#"{
"version":5,
"layer_metadata":{
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499":{"file_size":23289856,"generation":1},
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619":{"file_size":1015808,"generation":1}},
"disk_consistent_lsn":"0/15A7618",
"metadata_bytes":[226,88,25,241,0,46,0,4,0,0,0,0,1,90,118,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,78,244,32,0,0,0,0,1,78,244,32,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
"lineage":{
"original_ancestor":["e2bfd8c633d713d279e6fcd2bcc15b6d","0/15A7618","2024-05-07T18:52:36.322426563"],
"reparenting_history":["e1bfd8c633d713d279e6fcd2bcc15b6d"]
}
}"#;
let expected = IndexPart {
version: 5,
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), IndexLayerMetadata {
file_size: 23289856,
generation: Generation::new(1),
shard: ShardIndex::unsharded(),
}),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), IndexLayerMetadata {
file_size: 1015808,
generation: Generation::new(1),
shard: ShardIndex::unsharded(),
})
]),
disk_consistent_lsn: Lsn::from_str("0/15A7618").unwrap(),
metadata: TimelineMetadata::from_bytes(&[226,88,25,241,0,46,0,4,0,0,0,0,1,90,118,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,78,244,32,0,0,0,0,1,78,244,32,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
deleted_at: None,
lineage: Lineage {
reparenting_history_truncated: false,
reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
},
};
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
assert_eq!(part, expected);
}
fn parse_naive_datetime(s: &str) -> NaiveDateTime {
chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f").unwrap()
}
}

View File

@@ -12,13 +12,18 @@ use tokio_util::sync::CancellationToken;
use utils::backoff;
use super::Generation;
use crate::tenant::remote_timeline_client::{
index::IndexPart, remote_index_path, remote_initdb_archive_path,
remote_initdb_preserved_archive_path,
use crate::{
config::PageServerConf,
tenant::remote_timeline_client::{
index::IndexPart, remote_index_path, remote_initdb_archive_path,
remote_initdb_preserved_archive_path, remote_path,
},
};
use remote_storage::{GenericRemoteStorage, RemotePath, TimeTravelError};
use remote_storage::{GenericRemoteStorage, TimeTravelError};
use utils::id::{TenantId, TimelineId};
use super::index::LayerFileMetadata;
use tracing::info;
/// Serializes and uploads the given index part data to the remote storage.
@@ -60,10 +65,11 @@ pub(crate) async fn upload_index_part<'a>(
///
/// On an error, bumps the retries count and reschedules the entire task.
pub(super) async fn upload_timeline_layer<'a>(
conf: &'static PageServerConf,
storage: &'a GenericRemoteStorage,
local_path: &'a Utf8Path,
remote_path: &'a RemotePath,
metadata_size: u64,
source_path: &'a Utf8Path,
known_metadata: &'a LayerFileMetadata,
generation: Generation,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
fail_point!("before-upload-layer", |_| {
@@ -72,7 +78,8 @@ pub(super) async fn upload_timeline_layer<'a>(
pausable_failpoint!("before-upload-layer-pausable");
let source_file_res = fs::File::open(&local_path).await;
let storage_path = remote_path(conf, source_path, generation)?;
let source_file_res = fs::File::open(&source_path).await;
let source_file = match source_file_res {
Ok(source_file) => source_file,
Err(e) if e.kind() == ErrorKind::NotFound => {
@@ -83,49 +90,34 @@ pub(super) async fn upload_timeline_layer<'a>(
// it has been written to disk yet.
//
// This is tested against `test_compaction_delete_before_upload`
info!(path = %local_path, "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more.");
info!(path = %source_path, "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more.");
return Ok(());
}
Err(e) => Err(e).with_context(|| format!("open a source file for layer {local_path:?}"))?,
Err(e) => {
Err(e).with_context(|| format!("open a source file for layer {source_path:?}"))?
}
};
let fs_size = source_file
.metadata()
.await
.with_context(|| format!("get the source file metadata for layer {local_path:?}"))?
.with_context(|| format!("get the source file metadata for layer {source_path:?}"))?
.len();
let metadata_size = known_metadata.file_size();
if metadata_size != fs_size {
bail!("File {local_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}");
bail!("File {source_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}");
}
let fs_size = usize::try_from(fs_size)
.with_context(|| format!("convert {local_path:?} size {fs_size} usize"))?;
.with_context(|| format!("convert {source_path:?} size {fs_size} usize"))?;
let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE);
storage
.upload(reader, fs_size, remote_path, None, cancel)
.upload(reader, fs_size, &storage_path, None, cancel)
.await
.with_context(|| format!("upload layer from local path '{local_path}'"))
}
pub(super) async fn copy_timeline_layer(
storage: &GenericRemoteStorage,
source_path: &RemotePath,
target_path: &RemotePath,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
fail_point!("before-copy-layer", |_| {
bail!("failpoint before-copy-layer")
});
pausable_failpoint!("before-copy-layer-pausable");
storage
.copy_object(source_path, target_path, cancel)
.await
.with_context(|| format!("copy layer {source_path} to {target_path}"))
.with_context(|| format!("upload layer from local path '{source_path}'"))
}
/// Uploads the given `initdb` data to the remote storage.

View File

@@ -21,9 +21,8 @@ use self::{
use super::{
config::{SecondaryLocationConfig, TenantConfOpt},
mgr::TenantManager,
remote_timeline_client::LayerFileMetadata,
span::debug_assert_current_span_has_tenant_id,
storage_layer::{layer::local_layer_path, LayerName},
storage_layer::LayerFileName,
};
use pageserver_api::{
@@ -182,8 +181,7 @@ impl SecondaryTenant {
self: &Arc<Self>,
conf: &PageServerConf,
timeline_id: TimelineId,
name: LayerName,
metadata: LayerFileMetadata,
name: LayerFileName,
) {
debug_assert_current_span_has_tenant_id();
@@ -197,13 +195,9 @@ impl SecondaryTenant {
let now = SystemTime::now();
let local_path = local_layer_path(
conf,
&self.tenant_shard_id,
&timeline_id,
&name,
&metadata.generation,
);
let path = conf
.timeline_path(&self.tenant_shard_id, &timeline_id)
.join(name.file_name());
let this = self.clone();
@@ -214,7 +208,7 @@ impl SecondaryTenant {
// it, the secondary downloader could have seen an updated heatmap that
// resulted in a layer being deleted.
// Other local I/O errors are process-fatal: these should never happen.
let deleted = std::fs::remove_file(local_path);
let deleted = std::fs::remove_file(path);
let not_found = deleted
.as_ref()

View File

@@ -22,7 +22,7 @@ use crate::{
FAILED_REMOTE_OP_RETRIES,
},
span::debug_assert_current_span_has_tenant_id,
storage_layer::{layer::local_layer_path, LayerName},
storage_layer::LayerFileName,
tasks::{warn_when_period_overrun, BackgroundLoopKind},
},
virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile},
@@ -111,7 +111,7 @@ impl OnDiskState {
_conf: &'static PageServerConf,
_tenant_shard_id: &TenantShardId,
_imeline_id: &TimelineId,
_ame: LayerName,
_ame: LayerFileName,
metadata: LayerFileMetadata,
access_time: SystemTime,
) -> Self {
@@ -124,10 +124,10 @@ impl OnDiskState {
#[derive(Debug, Clone, Default)]
pub(super) struct SecondaryDetailTimeline {
pub(super) on_disk_layers: HashMap<LayerName, OnDiskState>,
pub(super) on_disk_layers: HashMap<LayerFileName, OnDiskState>,
/// We remember when layers were evicted, to prevent re-downloading them.
pub(super) evicted_at: HashMap<LayerName, SystemTime>,
pub(super) evicted_at: HashMap<LayerFileName, SystemTime>,
}
/// This state is written by the secondary downloader, it is opaque
@@ -621,12 +621,12 @@ impl<'a> TenantDownloader<'a> {
let layers_in_heatmap = heatmap_timeline
.layers
.iter()
.map(|l| (&l.name, l.metadata.generation))
.map(|l| &l.name)
.collect::<HashSet<_>>();
let layers_on_disk = timeline_state
.on_disk_layers
.iter()
.map(|l| (l.0, l.1.metadata.generation))
.map(|l| l.0)
.collect::<HashSet<_>>();
let mut layer_count = layers_on_disk.len();
@@ -637,24 +637,16 @@ impl<'a> TenantDownloader<'a> {
.sum();
// Remove on-disk layers that are no longer present in heatmap
for (layer_file_name, generation) in layers_on_disk.difference(&layers_in_heatmap) {
for layer in layers_on_disk.difference(&layers_in_heatmap) {
layer_count -= 1;
layer_byte_count -= timeline_state
.on_disk_layers
.get(layer_file_name)
.get(layer)
.unwrap()
.metadata
.file_size();
let local_path = local_layer_path(
self.conf,
self.secondary_state.get_tenant_shard_id(),
timeline_id,
layer_file_name,
generation,
);
delete_layers.push((*timeline_id, (*layer_file_name).clone(), local_path));
delete_layers.push((*timeline_id, (*layer).clone()));
}
progress.bytes_downloaded += layer_byte_count;
@@ -669,7 +661,11 @@ impl<'a> TenantDownloader<'a> {
}
// Execute accumulated deletions
for (timeline_id, layer_name, local_path) in delete_layers {
for (timeline_id, layer_name) in delete_layers {
let timeline_path = self
.conf
.timeline_path(self.secondary_state.get_tenant_shard_id(), &timeline_id);
let local_path = timeline_path.join(layer_name.to_string());
tracing::info!(timeline_id=%timeline_id, "Removing secondary local layer {layer_name} because it's absent in heatmap",);
tokio::fs::remove_file(&local_path)
@@ -758,6 +754,9 @@ impl<'a> TenantDownloader<'a> {
) -> Result<(), UpdateError> {
debug_assert_current_span_has_tenant_and_timeline_id();
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
let timeline_path = self
.conf
.timeline_path(tenant_shard_id, &timeline.timeline_id);
// Accumulate updates to the state
let mut touched = Vec::new();
@@ -807,14 +806,10 @@ impl<'a> TenantDownloader<'a> {
if cfg!(debug_assertions) {
// Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think
// are already present on disk are really there.
let local_path = local_layer_path(
self.conf,
tenant_shard_id,
&timeline.timeline_id,
&layer.name,
&layer.metadata.generation,
);
let local_path = self
.conf
.timeline_path(tenant_shard_id, &timeline.timeline_id)
.join(layer.name.file_name());
match tokio::fs::metadata(&local_path).await {
Ok(meta) => {
tracing::debug!(
@@ -908,13 +903,7 @@ impl<'a> TenantDownloader<'a> {
};
if downloaded_bytes != layer.metadata.file_size {
let local_path = local_layer_path(
self.conf,
tenant_shard_id,
&timeline.timeline_id,
&layer.name,
&layer.metadata.generation,
);
let local_path = timeline_path.join(layer.name.to_string());
tracing::warn!(
"Downloaded layer {} with unexpected size {} != {}. Removing download.",
@@ -997,7 +986,7 @@ async fn init_timeline_state(
// As we iterate through layers found on disk, we will look up their metadata from this map.
// Layers not present in metadata will be discarded.
let heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> =
let heatmap_metadata: HashMap<&LayerFileName, &HeatMapLayer> =
heatmap.layers.iter().map(|l| (&l.name, l)).collect();
while let Some(dentry) = dir
@@ -1034,7 +1023,7 @@ async fn init_timeline_state(
continue;
}
match LayerName::from_str(file_name) {
match LayerFileName::from_str(file_name) {
Ok(name) => {
let remote_meta = heatmap_metadata.get(&name);
match remote_meta {

View File

@@ -1,6 +1,8 @@
use std::time::SystemTime;
use crate::tenant::{remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerName};
use crate::tenant::{
remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerFileName,
};
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr, TimestampSeconds};
@@ -29,7 +31,7 @@ pub(crate) struct HeatMapTimeline {
#[serde_as]
#[derive(Serialize, Deserialize)]
pub(crate) struct HeatMapLayer {
pub(super) name: LayerName,
pub(super) name: LayerFileName,
pub(super) metadata: IndexLayerMetadata,
#[serde_as(as = "TimestampSeconds<i64>")]
@@ -40,7 +42,7 @@ pub(crate) struct HeatMapLayer {
impl HeatMapLayer {
pub(crate) fn new(
name: LayerName,
name: LayerFileName,
metadata: IndexLayerMetadata,
access_time: SystemTime,
) -> Self {

View File

@@ -1,11 +1,11 @@
//! Common traits and structs for layers
pub mod delta_layer;
mod filename;
pub mod image_layer;
pub(crate) mod inmemory_layer;
pub(crate) mod layer;
mod layer_desc;
mod layer_name;
use crate::context::{AccessStatsBehavior, RequestContext};
use crate::repository::Value;
@@ -34,10 +34,10 @@ use utils::rate_limit::RateLimit;
use utils::{id::TimelineId, lsn::Lsn};
pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef};
pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
pub use image_layer::{ImageLayer, ImageLayerWriter};
pub use inmemory_layer::InMemoryLayer;
pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
pub use layer_name::{DeltaLayerName, ImageLayerName, LayerName};
pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
@@ -646,8 +646,8 @@ pub mod tests {
use super::*;
impl From<DeltaLayerName> for PersistentLayerDesc {
fn from(value: DeltaLayerName) -> Self {
impl From<DeltaFileName> for PersistentLayerDesc {
fn from(value: DeltaFileName) -> Self {
PersistentLayerDesc::new_delta(
TenantShardId::from([0; 18]),
TimelineId::from_array([0; 16]),
@@ -658,8 +658,8 @@ pub mod tests {
}
}
impl From<ImageLayerName> for PersistentLayerDesc {
fn from(value: ImageLayerName) -> Self {
impl From<ImageFileName> for PersistentLayerDesc {
fn from(value: ImageFileName) -> Self {
PersistentLayerDesc::new_img(
TenantShardId::from([0; 18]),
TimelineId::from_array([0; 16]),
@@ -670,11 +670,11 @@ pub mod tests {
}
}
impl From<LayerName> for PersistentLayerDesc {
fn from(value: LayerName) -> Self {
impl From<LayerFileName> for PersistentLayerDesc {
fn from(value: LayerFileName) -> Self {
match value {
LayerName::Delta(d) => Self::from(d),
LayerName::Image(i) => Self::from(i),
LayerFileName::Delta(d) => Self::from(d),
LayerFileName::Image(i) => Self::from(i),
}
}
}

View File

@@ -57,7 +57,6 @@ use std::fs::File;
use std::io::SeekFrom;
use std::ops::Range;
use std::os::unix::fs::FileExt;
use std::str::FromStr;
use std::sync::Arc;
use tokio::sync::OnceCell;
use tracing::*;
@@ -69,8 +68,7 @@ use utils::{
};
use super::{
AsLayerDesc, LayerAccessStats, LayerName, PersistentLayerDesc, ResidentLayer,
ValuesReconstructState,
AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer, ValuesReconstructState,
};
///
@@ -311,13 +309,13 @@ impl DeltaLayer {
.and_then(|res| res)?;
// not production code
let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap();
let expected_layer_name = self.layer_desc().layer_name();
let actual_filename = path.file_name().unwrap().to_owned();
let expected_filename = self.layer_desc().filename().file_name();
if actual_layer_name != expected_layer_name {
if actual_filename != expected_filename {
println!("warning: filename does not match what is expected from in-file summary");
println!("actual: {:?}", actual_layer_name.to_string());
println!("expected: {:?}", expected_layer_name.to_string());
println!("actual: {:?}", actual_filename);
println!("expected: {:?}", expected_filename);
}
Ok(Arc::new(loaded))
@@ -1141,15 +1139,15 @@ impl DeltaLayerInner {
Ok(all_keys)
}
/// Using the given writer, write out a version which has the earlier Lsns than `until`.
///
/// Return the amount of key value records pushed to the writer.
/// Using the given writer, write out a truncated version, where LSNs higher than the
/// truncate_at are missing.
#[cfg(test)]
pub(super) async fn copy_prefix(
&self,
writer: &mut DeltaLayerWriter,
until: Lsn,
truncate_at: Lsn,
ctx: &RequestContext,
) -> anyhow::Result<usize> {
) -> anyhow::Result<()> {
use crate::tenant::vectored_blob_io::{
BlobMeta, VectoredReadBuilder, VectoredReadExtended,
};
@@ -1213,8 +1211,6 @@ impl DeltaLayerInner {
// FIXME: buffering of DeltaLayerWriter
let mut per_blob_copy = Vec::new();
let mut records = 0;
while let Some(item) = stream.try_next().await? {
tracing::debug!(?item, "popped");
let offset = item
@@ -1233,7 +1229,7 @@ impl DeltaLayerInner {
prev = Option::from(item);
let actionable = actionable.filter(|x| x.0.lsn < until);
let actionable = actionable.filter(|x| x.0.lsn < truncate_at);
let builder = if let Some((meta, offsets)) = actionable {
// extend or create a new builder
@@ -1301,7 +1297,7 @@ impl DeltaLayerInner {
let will_init = crate::repository::ValueBytes::will_init(data)
.inspect_err(|_e| {
#[cfg(feature = "testing")]
tracing::error!(data=?utils::Hex(data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value");
tracing::error!(data=?utils::Hex(data), err=?_e, "failed to parse will_init out of serialized value");
})
.unwrap_or(false);
@@ -1318,10 +1314,7 @@ impl DeltaLayerInner {
)
.await;
per_blob_copy = tmp;
res?;
records += 1;
}
buffer = Some(res.buf);
@@ -1333,7 +1326,7 @@ impl DeltaLayerInner {
"with the sentinel above loop should had handled all"
);
Ok(records)
Ok(())
}
pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> {
@@ -1406,6 +1399,7 @@ impl DeltaLayerInner {
Ok(())
}
#[cfg(test)]
fn stream_index_forwards<'a, R>(
&'a self,
reader: &'a DiskBtreeReader<R, DELTA_KEY_SIZE>,

View File

@@ -2,42 +2,40 @@
//! Helper functions for dealing with filenames of the image and delta layer files.
//!
use crate::repository::Key;
use std::borrow::Cow;
use std::cmp::Ordering;
use std::fmt;
use std::ops::Range;
use std::str::FromStr;
use regex::Regex;
use utils::lsn::Lsn;
use super::PersistentLayerDesc;
// Note: Timeline::load_layer_map() relies on this sort order
#[derive(PartialEq, Eq, Clone, Hash)]
pub struct DeltaLayerName {
pub struct DeltaFileName {
pub key_range: Range<Key>,
pub lsn_range: Range<Lsn>,
}
impl std::fmt::Debug for DeltaLayerName {
impl std::fmt::Debug for DeltaFileName {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
use super::RangeDisplayDebug;
f.debug_struct("DeltaLayerName")
f.debug_struct("DeltaFileName")
.field("key_range", &RangeDisplayDebug(&self.key_range))
.field("lsn_range", &self.lsn_range)
.finish()
}
}
impl PartialOrd for DeltaLayerName {
impl PartialOrd for DeltaFileName {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl Ord for DeltaLayerName {
impl Ord for DeltaFileName {
fn cmp(&self, other: &Self) -> Ordering {
let mut cmp = self.key_range.start.cmp(&other.key_range.start);
if cmp != Ordering::Equal {
@@ -57,14 +55,16 @@ impl Ord for DeltaLayerName {
}
}
/// Represents the region of the LSN-Key space covered by a DeltaLayer
/// Represents the filename of a DeltaLayer
///
/// ```text
/// <key start>-<key end>__<LSN start>-<LSN end>
/// ```
impl DeltaLayerName {
/// Parse the part of a delta layer's file name that represents the LayerName. Returns None
/// if the filename does not match the expected pattern.
impl DeltaFileName {
///
/// Parse a string as a delta file name. Returns None if the filename does not
/// match the expected pattern.
///
pub fn parse_str(fname: &str) -> Option<Self> {
let mut parts = fname.split("__");
let mut key_parts = parts.next()?.split('-');
@@ -74,19 +74,10 @@ impl DeltaLayerName {
let key_end_str = key_parts.next()?;
let lsn_start_str = lsn_parts.next()?;
let lsn_end_str = lsn_parts.next()?;
if parts.next().is_some() || key_parts.next().is_some() || key_parts.next().is_some() {
return None;
}
if key_start_str.len() != 36
|| key_end_str.len() != 36
|| lsn_start_str.len() != 16
|| lsn_end_str.len() != 16
{
return None;
}
let key_start = Key::from_hex(key_start_str).ok()?;
let key_end = Key::from_hex(key_end_str).ok()?;
@@ -103,14 +94,14 @@ impl DeltaLayerName {
// or panic?
}
Some(DeltaLayerName {
Some(DeltaFileName {
key_range: key_start..key_end,
lsn_range: start_lsn..end_lsn,
})
}
}
impl fmt::Display for DeltaLayerName {
impl fmt::Display for DeltaFileName {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
@@ -124,29 +115,29 @@ impl fmt::Display for DeltaLayerName {
}
#[derive(PartialEq, Eq, Clone, Hash)]
pub struct ImageLayerName {
pub struct ImageFileName {
pub key_range: Range<Key>,
pub lsn: Lsn,
}
impl std::fmt::Debug for ImageLayerName {
impl std::fmt::Debug for ImageFileName {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
use super::RangeDisplayDebug;
f.debug_struct("ImageLayerName")
f.debug_struct("ImageFileName")
.field("key_range", &RangeDisplayDebug(&self.key_range))
.field("lsn", &self.lsn)
.finish()
}
}
impl PartialOrd for ImageLayerName {
impl PartialOrd for ImageFileName {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl Ord for ImageLayerName {
impl Ord for ImageFileName {
fn cmp(&self, other: &Self) -> Ordering {
let mut cmp = self.key_range.start.cmp(&other.key_range.start);
if cmp != Ordering::Equal {
@@ -162,7 +153,7 @@ impl Ord for ImageLayerName {
}
}
impl ImageLayerName {
impl ImageFileName {
pub fn lsn_as_range(&self) -> Range<Lsn> {
// Saves from having to copypaste this all over
PersistentLayerDesc::image_layer_lsn_range(self.lsn)
@@ -170,14 +161,16 @@ impl ImageLayerName {
}
///
/// Represents the part of the Key-LSN space covered by an ImageLayer
/// Represents the filename of an ImageLayer
///
/// ```text
/// <key start>-<key end>__<LSN>
/// ```
impl ImageLayerName {
/// Parse a string as then LayerName part of an image layer file name. Returns None if the
/// filename does not match the expected pattern.
impl ImageFileName {
///
/// Parse a string as an image file name. Returns None if the filename does not
/// match the expected pattern.
///
pub fn parse_str(fname: &str) -> Option<Self> {
let mut parts = fname.split("__");
let mut key_parts = parts.next()?.split('-');
@@ -189,23 +182,19 @@ impl ImageLayerName {
return None;
}
if key_start_str.len() != 36 || key_end_str.len() != 36 || lsn_str.len() != 16 {
return None;
}
let key_start = Key::from_hex(key_start_str).ok()?;
let key_end = Key::from_hex(key_end_str).ok()?;
let lsn = Lsn::from_hex(lsn_str).ok()?;
Some(ImageLayerName {
Some(ImageFileName {
key_range: key_start..key_end,
lsn,
})
}
}
impl fmt::Display for ImageLayerName {
impl fmt::Display for ImageFileName {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
@@ -216,24 +205,21 @@ impl fmt::Display for ImageLayerName {
)
}
}
/// LayerName is the logical identity of a layer within a LayerMap at a moment in time. The
/// LayerName is not a unique filename, as the same LayerName may have multiple physical incarnations
/// over time (e.g. across shard splits or compression). The physical filenames of layers in local
/// storage and object names in remote storage consist of the LayerName plus some extra qualifiers
/// that uniquely identify the physical incarnation of a layer (see [crate::tenant::remote_timeline_client::remote_layer_path])
/// and [`crate::tenant::storage_layer::layer::local_layer_path`])
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
pub enum LayerName {
Image(ImageLayerName),
Delta(DeltaLayerName),
pub enum LayerFileName {
Image(ImageFileName),
Delta(DeltaFileName),
}
impl LayerName {
impl LayerFileName {
pub fn file_name(&self) -> String {
self.to_string()
}
/// Determines if this layer file is considered to be in future meaning we will discard these
/// layers during timeline initialization from the given disk_consistent_lsn.
pub(crate) fn is_in_future(&self, disk_consistent_lsn: Lsn) -> bool {
use LayerName::*;
use LayerFileName::*;
match self {
Image(file_name) if file_name.lsn > disk_consistent_lsn => true,
Delta(file_name) if file_name.lsn_range.end > disk_consistent_lsn + 1 => true,
@@ -242,7 +228,7 @@ impl LayerName {
}
pub(crate) fn kind(&self) -> &'static str {
use LayerName::*;
use LayerFileName::*;
match self {
Delta(_) => "delta",
Image(_) => "image",
@@ -250,7 +236,7 @@ impl LayerName {
}
}
impl fmt::Display for LayerName {
impl fmt::Display for LayerFileName {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Self::Image(fname) => write!(f, "{fname}"),
@@ -259,36 +245,23 @@ impl fmt::Display for LayerName {
}
}
impl From<ImageLayerName> for LayerName {
fn from(fname: ImageLayerName) -> Self {
impl From<ImageFileName> for LayerFileName {
fn from(fname: ImageFileName) -> Self {
Self::Image(fname)
}
}
impl From<DeltaLayerName> for LayerName {
fn from(fname: DeltaLayerName) -> Self {
impl From<DeltaFileName> for LayerFileName {
fn from(fname: DeltaFileName) -> Self {
Self::Delta(fname)
}
}
impl FromStr for LayerName {
impl FromStr for LayerFileName {
type Err = String;
/// Conversion from either a physical layer filename, or the string-ization of
/// Self. When loading a physical layer filename, we drop any extra information
/// not needed to build Self.
fn from_str(value: &str) -> Result<Self, Self::Err> {
let gen_suffix_regex = Regex::new("^(?<base>.+)(?<gen>-v1-[0-9a-f]{8})$").unwrap();
let file_name: Cow<str> = match gen_suffix_regex.captures(value) {
Some(captures) => captures
.name("base")
.expect("Non-optional group")
.as_str()
.into(),
None => value.into(),
};
let delta = DeltaLayerName::parse_str(&file_name);
let image = ImageLayerName::parse_str(&file_name);
let delta = DeltaFileName::parse_str(value);
let image = ImageFileName::parse_str(value);
let ok = match (delta, image) {
(None, None) => {
return Err(format!(
@@ -303,7 +276,7 @@ impl FromStr for LayerName {
}
}
impl serde::Serialize for LayerName {
impl serde::Serialize for LayerFileName {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
@@ -315,19 +288,19 @@ impl serde::Serialize for LayerName {
}
}
impl<'de> serde::Deserialize<'de> for LayerName {
impl<'de> serde::Deserialize<'de> for LayerFileName {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
deserializer.deserialize_string(LayerNameVisitor)
deserializer.deserialize_string(LayerFileNameVisitor)
}
}
struct LayerNameVisitor;
struct LayerFileNameVisitor;
impl<'de> serde::de::Visitor<'de> for LayerNameVisitor {
type Value = LayerName;
impl<'de> serde::de::Visitor<'de> for LayerFileNameVisitor {
type Value = LayerFileName;
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
write!(
@@ -342,42 +315,3 @@ impl<'de> serde::de::Visitor<'de> for LayerNameVisitor {
v.parse().map_err(|e| E::custom(e))
}
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn image_layer_parse() -> anyhow::Result<()> {
let expected = LayerName::Image(ImageLayerName {
key_range: Key::from_i128(0)
..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(),
lsn: Lsn::from_hex("00000000014FED58").unwrap(),
});
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-v1-00000001").map_err(|s| anyhow::anyhow!(s))?;
assert_eq!(parsed, expected,);
// Omitting generation suffix is valid
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").map_err(|s| anyhow::anyhow!(s))?;
assert_eq!(parsed, expected,);
Ok(())
}
#[test]
fn delta_layer_parse() -> anyhow::Result<()> {
let expected = LayerName::Delta(DeltaLayerName {
key_range: Key::from_i128(0)
..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(),
lsn_range: Lsn::from_hex("00000000014FED58").unwrap()
..Lsn::from_hex("000000000154C481").unwrap(),
});
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-v1-00000001").map_err(|s| anyhow::anyhow!(s))?;
assert_eq!(parsed, expected);
// Omitting generation suffix is valid
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").map_err(|s| anyhow::anyhow!(s))?;
assert_eq!(parsed, expected);
Ok(())
}
}

View File

@@ -54,7 +54,6 @@ use std::fs::File;
use std::io::SeekFrom;
use std::ops::Range;
use std::os::unix::prelude::FileExt;
use std::str::FromStr;
use std::sync::Arc;
use tokio::sync::OnceCell;
use tokio_stream::StreamExt;
@@ -66,10 +65,8 @@ use utils::{
lsn::Lsn,
};
use super::layer_name::ImageLayerName;
use super::{
AsLayerDesc, Layer, LayerName, PersistentLayerDesc, ResidentLayer, ValuesReconstructState,
};
use super::filename::ImageFileName;
use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer, ValuesReconstructState};
///
/// Header stored in the beginning of the file
@@ -234,7 +231,7 @@ impl ImageLayer {
conf: &PageServerConf,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
fname: &ImageLayerName,
fname: &ImageFileName,
) -> Utf8PathBuf {
let rand_string: String = rand::thread_rng()
.sample_iter(&Alphanumeric)
@@ -270,13 +267,13 @@ impl ImageLayer {
.and_then(|res| res)?;
// not production code
let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap();
let expected_layer_name = self.layer_desc().layer_name();
let actual_filename = path.file_name().unwrap().to_owned();
let expected_filename = self.layer_desc().filename().file_name();
if actual_layer_name != expected_layer_name {
if actual_filename != expected_filename {
println!("warning: filename does not match what is expected from in-file summary");
println!("actual: {:?}", actual_layer_name.to_string());
println!("expected: {:?}", expected_layer_name.to_string());
println!("actual: {:?}", actual_filename);
println!("expected: {:?}", expected_filename);
}
Ok(loaded)
@@ -638,7 +635,7 @@ impl ImageLayerWriterInner {
conf,
timeline_id,
tenant_shard_id,
&ImageLayerName {
&ImageFileName {
key_range: key_range.clone(),
lsn,
},

View File

@@ -4,13 +4,12 @@ use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::{
HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
};
use pageserver_api::shard::{ShardIndex, TenantShardId};
use pageserver_api::shard::ShardIndex;
use std::ops::Range;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::{Arc, Weak};
use std::time::{Duration, SystemTime};
use tracing::Instrument;
use utils::id::TimelineId;
use utils::lsn::Lsn;
use utils::sync::heavier_once_cell;
@@ -25,7 +24,7 @@ use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};
use super::delta_layer::{self, DeltaEntry};
use super::image_layer;
use super::{
AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerName, PersistentLayerDesc,
AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerFileName, PersistentLayerDesc,
ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
};
@@ -124,42 +123,14 @@ impl PartialEq for Layer {
}
}
pub(crate) fn local_layer_path(
conf: &PageServerConf,
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
layer_file_name: &LayerName,
_generation: &Generation,
) -> Utf8PathBuf {
let timeline_path = conf.timeline_path(tenant_shard_id, timeline_id);
timeline_path.join(layer_file_name.to_string())
// TODO: switch to enabling new-style layer paths after next release
// if generation.is_none() {
// // Without a generation, we may only use legacy path style
// timeline_path.join(layer_file_name.to_string())
// } else {
// timeline_path.join(format!("{}-v1{}", layer_file_name, generation.get_suffix()))
// }
}
impl Layer {
/// Creates a layer value for a file we know to not be resident.
pub(crate) fn for_evicted(
conf: &'static PageServerConf,
timeline: &Arc<Timeline>,
file_name: LayerName,
file_name: LayerFileName,
metadata: LayerFileMetadata,
) -> Self {
let local_path = local_layer_path(
conf,
&timeline.tenant_shard_id,
&timeline.timeline_id,
&file_name,
&metadata.generation,
);
let desc = PersistentLayerDesc::from_filename(
timeline.tenant_shard_id,
timeline.timeline_id,
@@ -172,7 +143,6 @@ impl Layer {
let owner = Layer(Arc::new(LayerInner::new(
conf,
timeline,
local_path,
access_stats,
desc,
None,
@@ -189,8 +159,7 @@ impl Layer {
pub(crate) fn for_resident(
conf: &'static PageServerConf,
timeline: &Arc<Timeline>,
local_path: Utf8PathBuf,
file_name: LayerName,
file_name: LayerFileName,
metadata: LayerFileMetadata,
) -> ResidentLayer {
let desc = PersistentLayerDesc::from_filename(
@@ -215,7 +184,6 @@ impl Layer {
LayerInner::new(
conf,
timeline,
local_path,
access_stats,
desc,
Some(inner),
@@ -257,19 +225,9 @@ impl Layer {
LayerResidenceStatus::Resident,
LayerResidenceEventReason::LayerCreate,
);
let local_path = local_layer_path(
conf,
&timeline.tenant_shard_id,
&timeline.timeline_id,
&desc.layer_name(),
&timeline.generation,
);
LayerInner::new(
conf,
timeline,
local_path,
access_stats,
desc,
Some(inner),
@@ -452,13 +410,6 @@ impl Layer {
self.0.metadata()
}
pub(crate) fn get_timeline_id(&self) -> Option<TimelineId> {
self.0
.timeline
.upgrade()
.map(|timeline| timeline.timeline_id)
}
/// Traditional debug dumping facility
#[allow(unused)]
pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> anyhow::Result<()> {
@@ -690,7 +641,7 @@ impl Drop for LayerInner {
let span = tracing::info_span!(parent: None, "layer_delete", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id);
let path = std::mem::take(&mut self.path);
let file_name = self.layer_desc().layer_name();
let file_name = self.layer_desc().filename();
let file_size = self.layer_desc().file_size;
let timeline = self.timeline.clone();
let meta = self.metadata();
@@ -758,17 +709,19 @@ impl Drop for LayerInner {
}
impl LayerInner {
#[allow(clippy::too_many_arguments)]
fn new(
conf: &'static PageServerConf,
timeline: &Arc<Timeline>,
local_path: Utf8PathBuf,
access_stats: LayerAccessStats,
desc: PersistentLayerDesc,
downloaded: Option<Arc<DownloadedLayer>>,
generation: Generation,
shard: ShardIndex,
) -> Self {
let path = conf
.timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id)
.join(desc.filename().to_string());
let (inner, version, init_status) = if let Some(inner) = downloaded {
let version = inner.version;
let resident = ResidentOrWantedEvicted::Resident(inner);
@@ -783,10 +736,8 @@ impl LayerInner {
LayerInner {
conf,
debug_str: {
format!("timelines/{}/{}", timeline.timeline_id, desc.layer_name()).into()
},
path: local_path,
debug_str: { format!("timelines/{}/{}", timeline.timeline_id, desc.filename()).into() },
path,
desc,
timeline: Arc::downgrade(timeline),
have_remote_client: timeline.remote_client.is_some(),
@@ -1123,7 +1074,7 @@ impl LayerInner {
let result = client
.download_layer_file(
&self.desc.layer_name(),
&self.desc.filename(),
&self.metadata(),
&timeline.cancel,
ctx,
@@ -1260,7 +1211,7 @@ impl LayerInner {
}
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
let layer_name = self.desc.layer_name().to_string();
let layer_file_name = self.desc.filename().file_name();
let resident = self
.inner
@@ -1274,7 +1225,7 @@ impl LayerInner {
let lsn_range = &self.desc.lsn_range;
HistoricLayerInfo::Delta {
layer_file_name: layer_name,
layer_file_name,
layer_file_size: self.desc.file_size,
lsn_start: lsn_range.start,
lsn_end: lsn_range.end,
@@ -1285,7 +1236,7 @@ impl LayerInner {
let lsn = self.desc.image_layer_lsn();
HistoricLayerInfo::Image {
layer_file_name: layer_name,
layer_file_name,
layer_file_size: self.desc.file_size,
lsn_start: lsn,
remote: !resident,
@@ -1846,23 +1797,25 @@ impl ResidentLayer {
}
}
/// Returns the amount of keys and values written to the writer.
pub(crate) async fn copy_delta_prefix(
/// FIXME: truncate is bad name because we are not truncating anything, but copying the
/// filtered parts.
#[cfg(test)]
pub(super) async fn copy_delta_prefix(
&self,
writer: &mut super::delta_layer::DeltaLayerWriter,
until: Lsn,
truncate_at: Lsn,
ctx: &RequestContext,
) -> anyhow::Result<usize> {
) -> anyhow::Result<()> {
use LayerKind::*;
let owner = &self.owner.0;
match self.downloaded.get(owner, ctx).await? {
Delta(ref d) => d
.copy_prefix(writer, until, ctx)
.copy_prefix(writer, truncate_at, ctx)
.await
.with_context(|| format!("copy_delta_prefix until {until} of {self}")),
Image(_) => anyhow::bail!(format!("cannot copy_lsn_prefix of image layer {self}")),
.with_context(|| format!("truncate {self}")),
Image(_) => anyhow::bail!(format!("cannot truncate image layer {self}")),
}
}

View File

@@ -5,7 +5,7 @@ use utils::{id::TimelineId, lsn::Lsn};
use crate::repository::Key;
use super::{DeltaLayerName, ImageLayerName, LayerName};
use super::{DeltaFileName, ImageFileName, LayerFileName};
use serde::{Deserialize, Serialize};
@@ -51,7 +51,7 @@ impl PersistentLayerDesc {
}
pub fn short_id(&self) -> impl Display {
self.layer_name()
self.filename()
}
#[cfg(test)]
@@ -103,14 +103,14 @@ impl PersistentLayerDesc {
pub fn from_filename(
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
filename: LayerName,
filename: LayerFileName,
file_size: u64,
) -> Self {
match filename {
LayerName::Image(i) => {
LayerFileName::Image(i) => {
Self::new_img(tenant_shard_id, timeline_id, i.key_range, i.lsn, file_size)
}
LayerName::Delta(d) => Self::new_delta(
LayerFileName::Delta(d) => Self::new_delta(
tenant_shard_id,
timeline_id,
d.key_range,
@@ -132,34 +132,34 @@ impl PersistentLayerDesc {
lsn..(lsn + 1)
}
/// Get a delta layer name for this layer.
/// Get a delta file name for this layer.
///
/// Panic: if this is not a delta layer.
pub fn delta_layer_name(&self) -> DeltaLayerName {
pub fn delta_file_name(&self) -> DeltaFileName {
assert!(self.is_delta);
DeltaLayerName {
DeltaFileName {
key_range: self.key_range.clone(),
lsn_range: self.lsn_range.clone(),
}
}
/// Get a image layer name for this layer.
/// Get a delta file name for this layer.
///
/// Panic: if this is not an image layer, or the lsn range is invalid
pub fn image_layer_name(&self) -> ImageLayerName {
pub fn image_file_name(&self) -> ImageFileName {
assert!(!self.is_delta);
assert!(self.lsn_range.start + 1 == self.lsn_range.end);
ImageLayerName {
ImageFileName {
key_range: self.key_range.clone(),
lsn: self.lsn_range.start,
}
}
pub fn layer_name(&self) -> LayerName {
pub fn filename(&self) -> LayerFileName {
if self.is_delta {
self.delta_layer_name().into()
self.delta_file_name().into()
} else {
self.image_layer_name().into()
self.image_file_name().into()
}
}

View File

@@ -2,7 +2,6 @@
//! such as compaction and GC
use std::ops::ControlFlow;
use std::str::FromStr;
use std::sync::Arc;
use std::time::{Duration, Instant};
@@ -10,11 +9,9 @@ use crate::context::{DownloadBehavior, RequestContext};
use crate::metrics::TENANT_TASK_EVENTS;
use crate::task_mgr;
use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
use crate::tenant::config::defaults::DEFAULT_COMPACTION_PERIOD;
use crate::tenant::throttle::Stats;
use crate::tenant::timeline::CompactionError;
use crate::tenant::{Tenant, TenantState};
use rand::Rng;
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::{backoff, completion};
@@ -47,7 +44,6 @@ pub(crate) enum BackgroundLoopKind {
Compaction,
Gc,
Eviction,
IngestHouseKeeping,
ConsumptionMetricsCollectMetrics,
ConsumptionMetricsSyntheticSizeWorker,
InitialLogicalSizeCalculation,
@@ -136,30 +132,6 @@ pub fn start_background_loops(
}
},
);
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::IngestHousekeeping,
Some(tenant_shard_id),
None,
&format!("ingest housekeeping for tenant {tenant_shard_id}"),
false,
{
let tenant = Arc::clone(tenant);
let background_jobs_can_start = background_jobs_can_start.cloned();
async move {
let cancel = task_mgr::shutdown_token();
tokio::select! {
_ = cancel.cancelled() => { return Ok(()) },
_ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
};
ingest_housekeeping_loop(tenant, cancel)
.instrument(info_span!("ingest_housekeeping_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
.await;
Ok(())
}
},
);
}
///
@@ -407,61 +379,6 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
}
async fn ingest_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
async {
loop {
tokio::select! {
_ = cancel.cancelled() => {
return;
},
tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
ControlFlow::Break(()) => return,
ControlFlow::Continue(()) => (),
},
}
// We run ingest housekeeping with the same frequency as compaction: it is not worth
// having a distinct setting. But we don't run it in the same task, because compaction
// blocks on acquiring the background job semaphore.
let period = tenant.get_compaction_period();
// If compaction period is set to zero (to disable it), then we will use a reasonable default
let period = if period == Duration::ZERO {
humantime::Duration::from_str(DEFAULT_COMPACTION_PERIOD)
.unwrap()
.into()
} else {
period
};
// Jitter the period by +/- 5%
let period =
rand::thread_rng().gen_range((period * (95)) / 100..(period * (105)) / 100);
// Always sleep first: we do not need to do ingest housekeeping early in the lifetime of
// a tenant, since it won't have started writing any ephemeral files yet.
if tokio::time::timeout(period, cancel.cancelled())
.await
.is_ok()
{
break;
}
let started_at = Instant::now();
tenant.ingest_housekeeping().await;
warn_when_period_overrun(
started_at.elapsed(),
period,
BackgroundLoopKind::IngestHouseKeeping,
);
}
}
.await;
TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
}
async fn wait_for_active_tenant(tenant: &Arc<Tenant>) -> ControlFlow<()> {
// if the tenant has a proper status already, no need to wait for anything
if tenant.current_state() == TenantState::Active {
@@ -503,6 +420,8 @@ pub(crate) async fn random_init_delay(
period: Duration,
cancel: &CancellationToken,
) -> Result<(), Cancelled> {
use rand::Rng;
if period == Duration::ZERO {
return Ok(());
}

View File

@@ -1,6 +1,5 @@
mod compaction;
pub mod delete;
pub(crate) mod detach_ancestor;
mod eviction_task;
mod init;
pub mod layer_manager;
@@ -23,9 +22,8 @@ use pageserver_api::{
},
keyspace::{KeySpaceAccum, SparseKeyPartitioning},
models::{
AuxFilePolicy, CompactionAlgorithm, DownloadRemoteLayersTaskInfo,
DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo,
TimelineState,
CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, TimelineState,
},
reltag::BlockNumber,
shard::{ShardIdentity, ShardNumber, TenantShardId},
@@ -60,7 +58,6 @@ use std::{
ops::ControlFlow,
};
use crate::tenant::timeline::init::LocalLayerFileMetadata;
use crate::tenant::{
layer_map::{LayerMap, SearchResult},
metadata::TimelineMetadata,
@@ -75,7 +72,7 @@ use crate::{
disk_usage_eviction_task::finite_f32,
tenant::storage_layer::{
AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer,
LayerAccessStatsReset, LayerName, ResidentLayer, ValueReconstructResult,
LayerAccessStatsReset, LayerFileName, ResidentLayer, ValueReconstructResult,
ValueReconstructState, ValuesReconstructState,
},
};
@@ -865,13 +862,9 @@ impl Timeline {
// Initialise the reconstruct state for the key with the cache
// entry returned above.
let mut reconstruct_state = ValuesReconstructState::new();
// Only add the cached image to the reconstruct state when it exists.
if cached_page_img.is_some() {
let mut key_state = VectoredValueReconstructState::default();
key_state.img = cached_page_img;
reconstruct_state.keys.insert(key, Ok(key_state));
}
let mut key_state = VectoredValueReconstructState::default();
key_state.img = cached_page_img;
reconstruct_state.keys.insert(key, Ok(key_state));
let vectored_res = self
.get_vectored_impl(keyspace.clone(), lsn, reconstruct_state, ctx)
@@ -1083,7 +1076,7 @@ impl Timeline {
// We should generalize this into Keyspace::contains in the future.
for range in &keyspace.ranges {
if range.start.field1 < METADATA_KEY_BEGIN_PREFIX
|| range.end.field1 > METADATA_KEY_END_PREFIX
|| range.end.field1 >= METADATA_KEY_END_PREFIX
{
return Err(GetVectoredError::Other(anyhow::anyhow!(
"only metadata keyspace can be scanned"
@@ -1220,17 +1213,11 @@ impl Timeline {
}
reconstruct_timer.stop_and_record();
// For aux file keys (v1 or v2) the vectored read path does not return an error
// when they're missing. Instead they are omitted from the resulting btree
// (this is a requirement, not a bug). Skip updating the metric in these cases
// to avoid infinite results.
if !results.is_empty() {
// Note that this is an approximation. Tracking the exact number of layers visited
// per key requires virtually unbounded memory usage and is inefficient
// (i.e. segment tree tracking each range queried from a layer)
crate::metrics::VEC_READ_NUM_LAYERS_VISITED
.observe(layers_visited as f64 / results.len() as f64);
}
// Note that this is an approximation. Tracking the exact number of layers visited
// per key requires virtually unbounded memory usage and is inefficient
// (i.e. segment tree tracking each range queried from a layer)
crate::metrics::VEC_READ_NUM_LAYERS_VISITED
.observe(layers_visited as f64 / results.len() as f64);
Ok(results)
}
@@ -1507,21 +1494,15 @@ impl Timeline {
/// Flush to disk all data that was written with the put_* functions
#[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> {
self.freeze_and_flush0().await
}
// This exists to provide a non-span creating version of `freeze_and_flush` we can call without
// polluting the span hierarchy.
pub(crate) async fn freeze_and_flush0(&self) -> anyhow::Result<()> {
let to_lsn = self.freeze_inmem_layer(false).await;
self.flush_frozen_layers_and_wait(to_lsn).await
}
// Check if an open ephemeral layer should be closed: this provides
// background enforcement of checkpoint interval if there is no active WAL receiver, to avoid keeping
// an ephemeral layer open forever when idle. It also freezes layers if the global limit on
// ephemeral layer bytes has been breached.
pub(super) async fn maybe_freeze_ephemeral_layer(&self) {
/// If there is no writer, and conditions for rolling the latest layer are met, then freeze it.
///
/// This is for use in background housekeeping, to provide guarantees of layers closing eventually
/// even if there are no ongoing writes to drive that.
async fn maybe_freeze_ephemeral_layer(&self) {
let Ok(_write_guard) = self.write_lock.try_lock() else {
// If the write lock is held, there is an active wal receiver: rolling open layers
// is their responsibility while they hold this lock.
@@ -1548,11 +1529,13 @@ impl Timeline {
// we are a sharded tenant and have skipped some WAL
let last_freeze_ts = *self.last_freeze_ts.read().unwrap();
if last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() {
// Only do this if have been layer-less longer than get_checkpoint_timeout, so that a shard
// without any data ingested (yet) doesn't write a remote index as soon as it
// This should be somewhat rare, so we log it at INFO level.
//
// We checked for checkpoint timeout so that a shard without any
// data ingested (yet) doesn't write a remote index as soon as it
// sees its LSN advance: we only do this if we've been layer-less
// for some time.
tracing::debug!(
tracing::info!(
"Advancing disk_consistent_lsn past WAL ingest gap {} -> {}",
disk_consistent_lsn,
last_record_lsn
@@ -1642,6 +1625,11 @@ impl Timeline {
(guard, permit)
};
// Prior to compaction, check if an open ephemeral layer should be closed: this provides
// background enforcement of checkpoint interval if there is no active WAL receiver, to avoid keeping
// an ephemeral layer open forever when idle.
self.maybe_freeze_ephemeral_layer().await;
// this wait probably never needs any "long time spent" logging, because we already nag if
// compaction task goes over it's period (20s) which is quite often in production.
let (_guard, _permit) = tokio::select! {
@@ -1911,7 +1899,7 @@ impl Timeline {
#[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
pub(crate) async fn download_layer(
&self,
layer_file_name: &LayerName,
layer_file_name: &str,
) -> anyhow::Result<Option<bool>> {
let Some(layer) = self.find_layer(layer_file_name).await else {
return Ok(None);
@@ -1929,10 +1917,7 @@ impl Timeline {
/// Evict just one layer.
///
/// Returns `Ok(None)` in the case where the layer could not be found by its `layer_file_name`.
pub(crate) async fn evict_layer(
&self,
layer_file_name: &LayerName,
) -> anyhow::Result<Option<bool>> {
pub(crate) async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
let _gate = self
.gate
.enter()
@@ -2006,12 +1991,13 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;
// Private functions
impl Timeline {
pub(crate) fn get_switch_aux_file_policy(&self) -> AuxFilePolicy {
#[allow(dead_code)]
pub(crate) fn get_switch_to_aux_file_v2(&self) -> bool {
let tenant_conf = self.tenant_conf.load();
tenant_conf
.tenant_conf
.switch_aux_file_policy
.unwrap_or(self.conf.default_tenant_conf.switch_aux_file_policy)
.switch_to_aux_file_v2
.unwrap_or(self.conf.default_tenant_conf.switch_to_aux_file_v2)
}
pub(crate) fn get_lazy_slru_download(&self) -> bool {
@@ -2393,13 +2379,13 @@ impl Timeline {
index_part: Option<IndexPart>,
) -> anyhow::Result<()> {
use init::{Decision::*, Discovered, DismissedLayer};
use LayerName::*;
use LayerFileName::*;
let mut guard = self.layers.write().await;
let timer = self.metrics.load_layer_map_histo.start_timer();
// Scan timeline directory and create ImageLayerName and DeltaFilename
// Scan timeline directory and create ImageFileName and DeltaFilename
// structs representing all files on disk
let timeline_path = self
.conf
@@ -2423,8 +2409,8 @@ impl Timeline {
for discovered in discovered {
let (name, kind) = match discovered {
Discovered::Layer(layer_file_name, local_path, file_size) => {
discovered_layers.push((layer_file_name, local_path, file_size));
Discovered::Layer(file_name, file_size) => {
discovered_layers.push((file_name, file_size));
continue;
}
Discovered::Metadata => {
@@ -2474,30 +2460,31 @@ impl Timeline {
Ok(UseRemote { local, remote }) => {
// Remote is authoritative, but we may still choose to retain
// the local file if the contents appear to match
if local.metadata.file_size() == remote.file_size() {
if local.file_size() == remote.file_size() {
// Use the local file, but take the remote metadata so that we pick up
// the correct generation.
UseLocal(
LocalLayerFileMetadata {
metadata: remote,
local_path: local.local_path
}
)
UseLocal(remote)
} else {
init::cleanup_local_file_for_remote(&local, &remote)?;
path.push(name.file_name());
init::cleanup_local_file_for_remote(&path, &local, &remote)?;
path.pop();
UseRemote { local, remote }
}
}
Ok(decision) => decision,
Err(DismissedLayer::Future { local }) => {
if let Some(local) = local {
init::cleanup_future_layer(&local.local_path, &name, disk_consistent_lsn)?;
if local.is_some() {
path.push(name.file_name());
init::cleanup_future_layer(&path, &name, disk_consistent_lsn)?;
path.pop();
}
needs_cleanup.push(name);
continue;
}
Err(DismissedLayer::LocalOnly(local)) => {
init::cleanup_local_only_file(&name, &local)?;
path.push(name.file_name());
init::cleanup_local_only_file(&path, &name, &local)?;
path.pop();
// this file never existed remotely, we will have to do rework
continue;
}
@@ -2511,9 +2498,9 @@ impl Timeline {
tracing::debug!(layer=%name, ?decision, "applied");
let layer = match decision {
UseLocal(local) => {
total_physical_size += local.metadata.file_size();
Layer::for_resident(conf, &this, local.local_path, name, local.metadata).drop_eviction_guard()
UseLocal(m) => {
total_physical_size += m.file_size();
Layer::for_resident(conf, &this, name, m).drop_eviction_guard()
}
Evicted(remote) | UseRemote { remote, .. } => {
Layer::for_evicted(conf, &this, name, remote)
@@ -2994,11 +2981,11 @@ impl Timeline {
}
}
async fn find_layer(&self, layer_name: &LayerName) -> Option<Layer> {
async fn find_layer(&self, layer_file_name: &str) -> Option<Layer> {
let guard = self.layers.read().await;
for historic_layer in guard.layer_map().iter_historic_layers() {
let historic_layer_name = historic_layer.layer_name();
if layer_name == &historic_layer_name {
let historic_layer_name = historic_layer.filename().file_name();
if layer_file_name == historic_layer_name {
return Some(guard.get_from_desc(&historic_layer));
}
}
@@ -3027,8 +3014,8 @@ impl Timeline {
let last_activity_ts = layer.access_stats().latest_activity_or_now();
HeatMapLayer::new(
layer.layer_desc().layer_name(),
(&layer.metadata()).into(),
layer.layer_desc().filename(),
layer.metadata().into(),
last_activity_ts,
)
});
@@ -3037,18 +3024,6 @@ impl Timeline {
Some(HeatMapTimeline::new(self.timeline_id, layers))
}
/// Returns true if the given lsn is or was an ancestor branchpoint.
pub(crate) fn is_ancestor_lsn(&self, lsn: Lsn) -> bool {
// upon timeline detach, we set the ancestor_lsn to Lsn::INVALID and the store the original
// branchpoint in the value in IndexPart::lineage
self.ancestor_lsn == lsn
|| (self.ancestor_lsn == Lsn::INVALID
&& self
.remote_client
.as_ref()
.is_some_and(|rtc| rtc.is_previous_ancestor_lsn(lsn)))
}
}
type TraversalId = Arc<str>;
@@ -3186,7 +3161,7 @@ impl Timeline {
if let Some(open_layer) = &layers.open_layer {
let start_lsn = open_layer.get_lsn_range().start;
if cont_lsn > start_lsn {
//info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.layer_name().display());
//info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.filename().display());
// Get all the data needed to reconstruct the page version from this layer.
// But if we have an older cached page image, no need to go past that.
let lsn_floor = max(cached_lsn + 1, start_lsn);
@@ -3215,7 +3190,7 @@ impl Timeline {
for frozen_layer in layers.frozen_layers.iter().rev() {
let start_lsn = frozen_layer.get_lsn_range().start;
if cont_lsn > start_lsn {
//info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.layer_name().display());
//info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display());
let lsn_floor = max(cached_lsn + 1, start_lsn);
let frozen_layer = frozen_layer.clone();
@@ -3542,7 +3517,7 @@ impl Timeline {
Ok(ancestor)
}
pub(crate) fn get_ancestor_timeline(&self) -> anyhow::Result<Arc<Timeline>> {
fn get_ancestor_timeline(&self) -> anyhow::Result<Arc<Timeline>> {
let ancestor = self.ancestor_timeline.as_ref().with_context(|| {
format!(
"Ancestor is missing. Timeline id: {} Ancestor id {:?}",
@@ -4242,7 +4217,7 @@ impl Timeline {
// Maybe flush `key_rest_accum`
if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS
|| (last_key_in_range && key_request_accum.raw_size() > 0)
|| last_key_in_range
{
let results = self
.get_vectored(key_request_accum.consume_keyspace(), lsn, ctx)
@@ -4358,48 +4333,6 @@ impl Timeline {
_ = self.cancel.cancelled() => {}
)
}
/// Detach this timeline from its ancestor by copying all of ancestors layers as this
/// Timelines layers up to the ancestor_lsn.
///
/// Requires a timeline that:
/// - has an ancestor to detach from
/// - the ancestor does not have an ancestor -- follows from the original RFC limitations, not
/// a technical requirement
///
/// After the operation has been started, it cannot be canceled. Upon restart it needs to be
/// polled again until completion.
///
/// During the operation all timelines sharing the data with this timeline will be reparented
/// from our ancestor to be branches of this timeline.
pub(crate) async fn prepare_to_detach_from_ancestor(
self: &Arc<Timeline>,
tenant: &crate::tenant::Tenant,
options: detach_ancestor::Options,
ctx: &RequestContext,
) -> Result<
(
completion::Completion,
detach_ancestor::PreparedTimelineDetach,
),
detach_ancestor::Error,
> {
detach_ancestor::prepare(self, tenant, options, ctx).await
}
/// Completes the ancestor detach. This method is to be called while holding the
/// TenantManager's tenant slot, so during this method we cannot be deleted nor can any
/// timeline be deleted. After this method returns successfully, tenant must be reloaded.
///
/// Pageserver receiving a SIGKILL during this operation is not supported (yet).
pub(crate) async fn complete_detaching_timeline_ancestor(
self: &Arc<Timeline>,
tenant: &crate::tenant::Tenant,
prepared: detach_ancestor::PreparedTimelineDetach,
ctx: &RequestContext,
) -> Result<Vec<TimelineId>, anyhow::Error> {
detach_ancestor::complete(self, tenant, prepared, ctx).await
}
}
/// Top-level failure to compact.
@@ -4508,24 +4441,6 @@ impl Timeline {
Ok(())
}
async fn rewrite_layers(
self: &Arc<Self>,
replace_layers: Vec<(Layer, ResidentLayer)>,
drop_layers: Vec<Layer>,
) -> anyhow::Result<()> {
let mut guard = self.layers.write().await;
guard.rewrite_layers(&replace_layers, &drop_layers, &self.metrics);
let upload_layers: Vec<_> = replace_layers.into_iter().map(|r| r.1).collect();
if let Some(remote_client) = self.remote_client.as_ref() {
remote_client.schedule_compaction_update(&drop_layers, &upload_layers)?;
}
Ok(())
}
/// Schedules the uploads of the given image layers
fn upload_new_image_layers(
self: &Arc<Self>,
@@ -4684,8 +4599,6 @@ impl Timeline {
retain_lsns: Vec<Lsn>,
new_gc_cutoff: Lsn,
) -> anyhow::Result<GcResult> {
// FIXME: if there is an ongoing detach_from_ancestor, we should just skip gc
let now = SystemTime::now();
let mut result: GcResult = GcResult::default();
@@ -4739,7 +4652,7 @@ impl Timeline {
if l.get_lsn_range().end > horizon_cutoff {
debug!(
"keeping {} because it's newer than horizon_cutoff {}",
l.layer_name(),
l.filename(),
horizon_cutoff,
);
result.layers_needed_by_cutoff += 1;
@@ -4750,7 +4663,7 @@ impl Timeline {
if l.get_lsn_range().end > pitr_cutoff {
debug!(
"keeping {} because it's newer than pitr_cutoff {}",
l.layer_name(),
l.filename(),
pitr_cutoff,
);
result.layers_needed_by_pitr += 1;
@@ -4769,7 +4682,7 @@ impl Timeline {
if &l.get_lsn_range().start <= retain_lsn {
debug!(
"keeping {} because it's still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}",
l.layer_name(),
l.filename(),
retain_lsn,
l.is_incremental(),
);
@@ -4800,7 +4713,7 @@ impl Timeline {
if !layers
.image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))
{
debug!("keeping {} because it is the latest layer", l.layer_name());
debug!("keeping {} because it is the latest layer", l.filename());
result.layers_not_updated += 1;
continue 'outer;
}
@@ -4808,7 +4721,7 @@ impl Timeline {
// We didn't find any reason to keep this file, so remove it.
debug!(
"garbage collecting {} is_dropped: xx is_incremental: {}",
l.layer_name(),
l.filename(),
l.is_incremental(),
);
layers_to_remove.push(l);

View File

@@ -15,8 +15,7 @@ use anyhow::{anyhow, Context};
use enumset::EnumSet;
use fail::fail_point;
use itertools::Itertools;
use pageserver_api::keyspace::ShardedRange;
use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
use pageserver_api::shard::{ShardIdentity, TenantShardId};
use tokio_util::sync::CancellationToken;
use tracing::{debug, info, info_span, trace, warn, Instrument};
use utils::id::TimelineId;
@@ -94,7 +93,7 @@ impl Timeline {
// Define partitioning schema if needed
// FIXME: the match should only cover repartitioning, not the next steps
let partition_count = match self
match self
.repartition(
self.get_last_record_lsn(),
self.get_compaction_target_size(),
@@ -147,7 +146,6 @@ impl Timeline {
assert!(sparse_layers.is_empty());
self.upload_new_image_layers(dense_layers)?;
dense_partitioning.parts.len()
}
Err(err) => {
// no partitioning? This is normal, if the timeline was just created
@@ -159,150 +157,9 @@ impl Timeline {
if !self.cancel.is_cancelled() {
tracing::error!("could not compact, repartitioning keyspace failed: {err:?}");
}
1
}
};
if self.shard_identity.count >= ShardCount::new(2) {
// Limit the number of layer rewrites to the number of partitions: this means its
// runtime should be comparable to a full round of image layer creations, rather than
// being potentially much longer.
let rewrite_max = partition_count;
self.compact_shard_ancestors(rewrite_max, ctx).await?;
}
Ok(())
}
/// Check for layers that are elegible to be rewritten:
/// - Shard splitting: After a shard split, ancestor layers beyond pitr_interval, so that
/// we don't indefinitely retain keys in this shard that aren't needed.
/// - For future use: layers beyond pitr_interval that are in formats we would
/// rather not maintain compatibility with indefinitely.
///
/// Note: this phase may read and write many gigabytes of data: use rewrite_max to bound
/// how much work it will try to do in each compaction pass.
async fn compact_shard_ancestors(
self: &Arc<Self>,
rewrite_max: usize,
_ctx: &RequestContext,
) -> anyhow::Result<()> {
let mut drop_layers = Vec::new();
let layers_to_rewrite: Vec<Layer> = Vec::new();
// We will use the PITR cutoff as a condition for rewriting layers.
let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.pitr;
let layers = self.layers.read().await;
for layer_desc in layers.layer_map().iter_historic_layers() {
let layer = layers.get_from_desc(&layer_desc);
if layer.metadata().shard.shard_count == self.shard_identity.count {
// This layer does not belong to a historic ancestor, no need to re-image it.
continue;
}
// This layer was created on an ancestor shard: check if it contains any data for this shard.
let sharded_range = ShardedRange::new(layer_desc.get_key_range(), &self.shard_identity);
let layer_local_page_count = sharded_range.page_count();
let layer_raw_page_count = ShardedRange::raw_size(&layer_desc.get_key_range());
if layer_local_page_count == 0 {
// This ancestral layer only covers keys that belong to other shards.
// We include the full metadata in the log: if we had some critical bug that caused
// us to incorrectly drop layers, this would simplify manually debugging + reinstating those layers.
info!(%layer, old_metadata=?layer.metadata(),
"dropping layer after shard split, contains no keys for this shard.",
);
if cfg!(debug_assertions) {
// Expensive, exhaustive check of keys in this layer: this guards against ShardedRange's calculations being
// wrong. If ShardedRange claims the local page count is zero, then no keys in this layer
// should be !is_key_disposable()
let range = layer_desc.get_key_range();
let mut key = range.start;
while key < range.end {
debug_assert!(self.shard_identity.is_key_disposable(&key));
key = key.next();
}
}
drop_layers.push(layer);
continue;
} else if layer_local_page_count != u32::MAX
&& layer_local_page_count == layer_raw_page_count
{
debug!(%layer,
"layer is entirely shard local ({} keys), no need to filter it",
layer_local_page_count
);
continue;
}
// Don't bother re-writing a layer unless it will at least halve its size
if layer_local_page_count != u32::MAX
&& layer_local_page_count > layer_raw_page_count / 2
{
debug!(%layer,
"layer is already mostly local ({}/{}), not rewriting",
layer_local_page_count,
layer_raw_page_count
);
}
// Don't bother re-writing a layer if it is within the PITR window: it will age-out eventually
// without incurring the I/O cost of a rewrite.
if layer_desc.get_lsn_range().end >= pitr_cutoff {
debug!(%layer, "Skipping rewrite of layer still in PITR window ({} >= {})",
layer_desc.get_lsn_range().end, pitr_cutoff);
continue;
}
if layer_desc.is_delta() {
// We do not yet implement rewrite of delta layers
debug!(%layer, "Skipping rewrite of delta layer");
continue;
}
// Only rewrite layers if they would have different remote paths: either they belong to this
// shard but an old generation, or they belonged to another shard. This also implicitly
// guarantees that the layer is persistent in remote storage (as only remote persistent
// layers are carried across shard splits, any local-only layer would be in the current generation)
if layer.metadata().generation == self.generation
&& layer.metadata().shard.shard_count == self.shard_identity.count
{
debug!(%layer, "Skipping rewrite, is not from old generation");
continue;
}
if layers_to_rewrite.len() >= rewrite_max {
tracing::info!(%layer, "Will rewrite layer on a future compaction, already rewrote {}",
layers_to_rewrite.len()
);
continue;
}
// Fall through: all our conditions for doing a rewrite passed.
// TODO: implement rewriting
tracing::debug!(%layer, "Would rewrite layer");
}
// Drop the layers read lock: we will acquire it for write in [`Self::rewrite_layers`]
drop(layers);
// TODO: collect layers to rewrite
let replace_layers = Vec::new();
// Update the LayerMap so that readers will use the new layers, and enqueue it for writing to remote storage
self.rewrite_layers(replace_layers, drop_layers).await?;
if let Some(remote_client) = self.remote_client.as_ref() {
// We wait for all uploads to complete before finishing this compaction stage. This is not
// necessary for correctness, but it simplifies testing, and avoids proceeding with another
// Timeline's compaction while this timeline's uploads may be generating lots of disk I/O
// load.
remote_client.wait_completion().await?;
}
Ok(())
}

View File

@@ -422,10 +422,6 @@ impl DeleteTimelineFlow {
pub(crate) fn is_finished(&self) -> bool {
matches!(self, Self::Finished)
}
pub(crate) fn is_not_started(&self) -> bool {
matches!(self, Self::NotStarted)
}
}
struct DeletionGuard(OwnedMutexGuard<DeleteTimelineFlow>);

View File

@@ -1,540 +0,0 @@
use std::sync::Arc;
use super::{layer_manager::LayerManager, Timeline};
use crate::{
context::{DownloadBehavior, RequestContext},
task_mgr::TaskKind,
tenant::{
storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer},
Tenant,
},
virtual_file::{MaybeFatalIo, VirtualFile},
};
use tokio_util::sync::CancellationToken;
use tracing::Instrument;
use utils::{completion, generation::Generation, id::TimelineId, lsn::Lsn};
#[derive(Debug, thiserror::Error)]
pub(crate) enum Error {
#[error("no ancestors")]
NoAncestor,
#[error("too many ancestors")]
TooManyAncestors,
#[error("shutting down, please retry later")]
ShuttingDown,
#[error("flushing failed")]
FlushAncestor(#[source] anyhow::Error),
#[error("layer download failed")]
RewrittenDeltaDownloadFailed(#[source] anyhow::Error),
#[error("copying LSN prefix locally failed")]
CopyDeltaPrefix(#[source] anyhow::Error),
#[error("upload rewritten layer")]
UploadRewritten(#[source] anyhow::Error),
#[error("ancestor is already being detached by: {}", .0)]
OtherTimelineDetachOngoing(TimelineId),
#[error("remote copying layer failed")]
CopyFailed(#[source] anyhow::Error),
#[error("unexpected error")]
Unexpected(#[source] anyhow::Error),
}
pub(crate) struct PreparedTimelineDetach {
layers: Vec<Layer>,
}
/// TODO: this should be part of PageserverConf because we cannot easily modify cplane arguments.
#[derive(Debug)]
pub(crate) struct Options {
pub(crate) rewrite_concurrency: std::num::NonZeroUsize,
pub(crate) copy_concurrency: std::num::NonZeroUsize,
}
impl Default for Options {
fn default() -> Self {
Self {
rewrite_concurrency: std::num::NonZeroUsize::new(2).unwrap(),
copy_concurrency: std::num::NonZeroUsize::new(10).unwrap(),
}
}
}
/// See [`Timeline::prepare_to_detach_from_ancestor`]
pub(super) async fn prepare(
detached: &Arc<Timeline>,
tenant: &Tenant,
options: Options,
ctx: &RequestContext,
) -> Result<(completion::Completion, PreparedTimelineDetach), Error> {
use Error::*;
if detached.remote_client.as_ref().is_none() {
unimplemented!("no new code for running without remote storage");
}
let Some((ancestor, ancestor_lsn)) = detached
.ancestor_timeline
.as_ref()
.map(|tl| (tl.clone(), detached.ancestor_lsn))
else {
return Err(NoAncestor);
};
if !ancestor_lsn.is_valid() {
return Err(NoAncestor);
}
if ancestor.ancestor_timeline.is_some() {
// non-technical requirement; we could flatten N ancestors just as easily but we chose
// not to
return Err(TooManyAncestors);
}
// before we acquire the gate, we must mark the ancestor as having a detach operation
// ongoing which will block other concurrent detach operations so we don't get to ackward
// situations where there would be two branches trying to reparent earlier branches.
let (guard, barrier) = completion::channel();
{
let mut guard = tenant.ongoing_timeline_detach.lock().unwrap();
if let Some((tl, other)) = guard.as_ref() {
if !other.is_ready() {
return Err(OtherTimelineDetachOngoing(*tl));
}
}
*guard = Some((detached.timeline_id, barrier));
}
let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?;
if ancestor_lsn >= ancestor.get_disk_consistent_lsn() {
let span =
tracing::info_span!("freeze_and_flush", ancestor_timeline_id=%ancestor.timeline_id);
async {
let started_at = std::time::Instant::now();
let freeze_and_flush = ancestor.freeze_and_flush0();
let mut freeze_and_flush = std::pin::pin!(freeze_and_flush);
let res =
tokio::time::timeout(std::time::Duration::from_secs(1), &mut freeze_and_flush)
.await;
let res = match res {
Ok(res) => res,
Err(_elapsed) => {
tracing::info!("freezing and flushing ancestor is still ongoing");
freeze_and_flush.await
}
};
res.map_err(FlushAncestor)?;
// we do not need to wait for uploads to complete but we do need `struct Layer`,
// copying delta prefix is unsupported currently for `InMemoryLayer`.
tracing::info!(
elapsed_ms = started_at.elapsed().as_millis(),
"froze and flushed the ancestor"
);
Ok(())
}
.instrument(span)
.await?;
}
let end_lsn = ancestor_lsn + 1;
let (filtered_layers, straddling_branchpoint, rest_of_historic) = {
// we do not need to start from our layers, because they can only be layers that come
// *after* ancestor_lsn
let layers = tokio::select! {
guard = ancestor.layers.read() => guard,
_ = detached.cancel.cancelled() => {
return Err(ShuttingDown);
}
_ = ancestor.cancel.cancelled() => {
return Err(ShuttingDown);
}
};
// between retries, these can change if compaction or gc ran in between. this will mean
// we have to redo work.
partition_work(ancestor_lsn, &layers)
};
// TODO: layers are already sorted by something: use that to determine how much of remote
// copies are already done.
tracing::info!(filtered=%filtered_layers, to_rewrite = straddling_branchpoint.len(), historic=%rest_of_historic.len(), "collected layers");
// TODO: copying and lsn prefix copying could be done at the same time with a single fsync after
let mut new_layers: Vec<Layer> =
Vec::with_capacity(straddling_branchpoint.len() + rest_of_historic.len());
{
tracing::debug!(to_rewrite = %straddling_branchpoint.len(), "copying prefix of delta layers");
let mut tasks = tokio::task::JoinSet::new();
let mut wrote_any = false;
let limiter = Arc::new(tokio::sync::Semaphore::new(
options.rewrite_concurrency.get(),
));
for layer in straddling_branchpoint {
let limiter = limiter.clone();
let timeline = detached.clone();
let ctx = ctx.detached_child(TaskKind::DetachAncestor, DownloadBehavior::Download);
tasks.spawn(async move {
let _permit = limiter.acquire().await;
let copied =
upload_rewritten_layer(end_lsn, &layer, &timeline, &timeline.cancel, &ctx)
.await?;
Ok(copied)
});
}
while let Some(res) = tasks.join_next().await {
match res {
Ok(Ok(Some(copied))) => {
wrote_any = true;
tracing::info!(layer=%copied, "rewrote and uploaded");
new_layers.push(copied);
}
Ok(Ok(None)) => {}
Ok(Err(e)) => return Err(e),
Err(je) => return Err(Unexpected(je.into())),
}
}
// FIXME: the fsync should be mandatory, after both rewrites and copies
if wrote_any {
let timeline_dir = VirtualFile::open(
&detached
.conf
.timeline_path(&detached.tenant_shard_id, &detached.timeline_id),
)
.await
.fatal_err("VirtualFile::open for timeline dir fsync");
timeline_dir
.sync_all()
.await
.fatal_err("VirtualFile::sync_all timeline dir");
}
}
let mut tasks = tokio::task::JoinSet::new();
let limiter = Arc::new(tokio::sync::Semaphore::new(options.copy_concurrency.get()));
for adopted in rest_of_historic {
let limiter = limiter.clone();
let timeline = detached.clone();
tasks.spawn(
async move {
let _permit = limiter.acquire().await;
let owned =
remote_copy(&adopted, &timeline, timeline.generation, &timeline.cancel).await?;
tracing::info!(layer=%owned, "remote copied");
Ok(owned)
}
.in_current_span(),
);
}
while let Some(res) = tasks.join_next().await {
match res {
Ok(Ok(owned)) => {
new_layers.push(owned);
}
Ok(Err(failed)) => {
return Err(failed);
}
Err(je) => return Err(Unexpected(je.into())),
}
}
// TODO: fsync directory again if we hardlinked something
let prepared = PreparedTimelineDetach { layers: new_layers };
Ok((guard, prepared))
}
fn partition_work(
ancestor_lsn: Lsn,
source_layermap: &LayerManager,
) -> (usize, Vec<Layer>, Vec<Layer>) {
let mut straddling_branchpoint = vec![];
let mut rest_of_historic = vec![];
let mut later_by_lsn = 0;
for desc in source_layermap.layer_map().iter_historic_layers() {
// off by one chances here:
// - start is inclusive
// - end is exclusive
if desc.lsn_range.start > ancestor_lsn {
later_by_lsn += 1;
continue;
}
let target = if desc.lsn_range.start <= ancestor_lsn
&& desc.lsn_range.end > ancestor_lsn
&& desc.is_delta
{
// TODO: image layer at Lsn optimization
&mut straddling_branchpoint
} else {
&mut rest_of_historic
};
target.push(source_layermap.get_from_desc(&desc));
}
(later_by_lsn, straddling_branchpoint, rest_of_historic)
}
async fn upload_rewritten_layer(
end_lsn: Lsn,
layer: &Layer,
target: &Arc<Timeline>,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<Option<Layer>, Error> {
use Error::UploadRewritten;
let copied = copy_lsn_prefix(end_lsn, layer, target, ctx).await?;
let Some(copied) = copied else {
return Ok(None);
};
// FIXME: better shuttingdown error
target
.remote_client
.as_ref()
.unwrap()
.upload_layer_file(&copied, cancel)
.await
.map_err(UploadRewritten)?;
Ok(Some(copied.into()))
}
async fn copy_lsn_prefix(
end_lsn: Lsn,
layer: &Layer,
target_timeline: &Arc<Timeline>,
ctx: &RequestContext,
) -> Result<Option<ResidentLayer>, Error> {
use Error::{CopyDeltaPrefix, RewrittenDeltaDownloadFailed};
tracing::debug!(%layer, %end_lsn, "copying lsn prefix");
let mut writer = DeltaLayerWriter::new(
target_timeline.conf,
target_timeline.timeline_id,
target_timeline.tenant_shard_id,
layer.layer_desc().key_range.start,
layer.layer_desc().lsn_range.start..end_lsn,
)
.await
.map_err(CopyDeltaPrefix)?;
let resident = layer
.download_and_keep_resident()
.await
// likely shutdown
.map_err(RewrittenDeltaDownloadFailed)?;
let records = resident
.copy_delta_prefix(&mut writer, end_lsn, ctx)
.await
.map_err(CopyDeltaPrefix)?;
drop(resident);
tracing::debug!(%layer, records, "copied records");
if records == 0 {
drop(writer);
// TODO: we might want to store an empty marker in remote storage for this
// layer so that we will not needlessly walk `layer` on repeated attempts.
Ok(None)
} else {
// reuse the key instead of adding more holes between layers by using the real
// highest key in the layer.
let reused_highest_key = layer.layer_desc().key_range.end;
let copied = writer
.finish(reused_highest_key, target_timeline, ctx)
.await
.map_err(CopyDeltaPrefix)?;
tracing::debug!(%layer, %copied, "new layer produced");
Ok(Some(copied))
}
}
/// Creates a new Layer instance for the adopted layer, and ensures it is found from the remote
/// storage on successful return without the adopted layer being added to `index_part.json`.
async fn remote_copy(
adopted: &Layer,
adoptee: &Arc<Timeline>,
generation: Generation,
cancel: &CancellationToken,
) -> Result<Layer, Error> {
use Error::CopyFailed;
// depending if Layer::keep_resident we could hardlink
let mut metadata = adopted.metadata();
debug_assert!(metadata.generation <= generation);
metadata.generation = generation;
let owned = crate::tenant::storage_layer::Layer::for_evicted(
adoptee.conf,
adoptee,
adopted.layer_desc().layer_name(),
metadata,
);
// FIXME: better shuttingdown error
adoptee
.remote_client
.as_ref()
.unwrap()
.copy_timeline_layer(adopted, &owned, cancel)
.await
.map(move |()| owned)
.map_err(CopyFailed)
}
/// See [`Timeline::complete_detaching_timeline_ancestor`].
pub(super) async fn complete(
detached: &Arc<Timeline>,
tenant: &Tenant,
prepared: PreparedTimelineDetach,
_ctx: &RequestContext,
) -> Result<Vec<TimelineId>, anyhow::Error> {
let rtc = detached
.remote_client
.as_ref()
.expect("has to have a remote timeline client for timeline ancestor detach");
let PreparedTimelineDetach { layers } = prepared;
let ancestor = detached
.get_ancestor_timeline()
.expect("must still have a ancestor");
let ancestor_lsn = detached.get_ancestor_lsn();
// publish the prepared layers before we reparent any of the timelines, so that on restart
// reparented timelines find layers. also do the actual detaching.
//
// if we crash after this operation, we will at least come up having detached a timeline, but
// we cannot go back and reparent the timelines which would had been reparented in normal
// execution.
//
// this is not perfect, but it avoids us a retry happening after a compaction or gc on restart
// which could give us a completely wrong layer combination.
rtc.schedule_adding_existing_layers_to_index_detach_and_wait(
&layers,
(ancestor.timeline_id, ancestor_lsn),
)
.await?;
let mut tasks = tokio::task::JoinSet::new();
// because we are now keeping the slot in progress, it is unlikely that there will be any
// timeline deletions during this time. if we raced one, then we'll just ignore it.
tenant
.timelines
.lock()
.unwrap()
.values()
.filter_map(|tl| {
if Arc::ptr_eq(tl, detached) {
return None;
}
if !tl.is_active() {
return None;
}
let tl_ancestor = tl.ancestor_timeline.as_ref()?;
let is_same = Arc::ptr_eq(&ancestor, tl_ancestor);
let is_earlier = tl.get_ancestor_lsn() <= ancestor_lsn;
let is_deleting = tl
.delete_progress
.try_lock()
.map(|flow| !flow.is_not_started())
.unwrap_or(true);
if is_same && is_earlier && !is_deleting {
Some(tl.clone())
} else {
None
}
})
.for_each(|timeline| {
// important in this scope: we are holding the Tenant::timelines lock
let span = tracing::info_span!("reparent", reparented=%timeline.timeline_id);
let new_parent = detached.timeline_id;
tasks.spawn(
async move {
let res = timeline
.remote_client
.as_ref()
.expect("reparented has to have remote client because detached has one")
.schedule_reparenting_and_wait(&new_parent)
.await;
match res {
Ok(()) => Some(timeline),
Err(e) => {
// with the use of tenant slot, we no longer expect these.
tracing::warn!("reparenting failed: {e:#}");
None
}
}
}
.instrument(span),
);
});
let reparenting_candidates = tasks.len();
let mut reparented = Vec::with_capacity(tasks.len());
while let Some(res) = tasks.join_next().await {
match res {
Ok(Some(timeline)) => {
tracing::info!(reparented=%timeline.timeline_id, "reparenting done");
reparented.push(timeline.timeline_id);
}
Ok(None) => {
// lets just ignore this for now. one or all reparented timelines could had
// started deletion, and that is fine.
}
Err(je) if je.is_cancelled() => unreachable!("not used"),
Err(je) if je.is_panic() => {
// ignore; it's better to continue with a single reparenting failing (or even
// all of them) in order to get to the goal state.
//
// these timelines will never be reparentable, but they can be always detached as
// separate tree roots.
}
Err(je) => tracing::error!("unexpected join error: {je:?}"),
}
}
if reparenting_candidates != reparented.len() {
tracing::info!("failed to reparent some candidates");
}
Ok(reparented)
}

View File

@@ -6,13 +6,13 @@ use crate::{
self,
index::{IndexPart, LayerFileMetadata},
},
storage_layer::LayerName,
storage_layer::LayerFileName,
Generation,
},
METADATA_FILE_NAME,
};
use anyhow::Context;
use camino::{Utf8Path, Utf8PathBuf};
use camino::Utf8Path;
use pageserver_api::shard::ShardIndex;
use std::{collections::HashMap, str::FromStr};
use utils::lsn::Lsn;
@@ -20,7 +20,7 @@ use utils::lsn::Lsn;
/// Identified files in the timeline directory.
pub(super) enum Discovered {
/// The only one we care about
Layer(LayerName, Utf8PathBuf, u64),
Layer(LayerFileName, u64),
/// Old ephmeral files from previous launches, should be removed
Ephemeral(String),
/// Old temporary timeline files, unsure what these really are, should be removed
@@ -43,10 +43,10 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result<Vec<Discovere
let direntry = direntry?;
let file_name = direntry.file_name().to_string();
let discovered = match LayerName::from_str(&file_name) {
let discovered = match LayerFileName::from_str(&file_name) {
Ok(file_name) => {
let file_size = direntry.metadata()?.len();
Discovered::Layer(file_name, direntry.path().to_owned(), file_size)
Discovered::Layer(file_name, file_size)
}
Err(_) => {
if file_name == METADATA_FILE_NAME {
@@ -72,28 +72,6 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result<Vec<Discovere
Ok(ret)
}
/// Whereas `LayerFileMetadata` describes the metadata we would store in remote storage,
/// this structure extends it with metadata describing the layer's presence in local storage.
#[derive(Clone, Debug)]
pub(super) struct LocalLayerFileMetadata {
pub(super) metadata: LayerFileMetadata,
pub(super) local_path: Utf8PathBuf,
}
impl LocalLayerFileMetadata {
pub fn new(
local_path: Utf8PathBuf,
file_size: u64,
generation: Generation,
shard: ShardIndex,
) -> Self {
Self {
local_path,
metadata: LayerFileMetadata::new(file_size, generation, shard),
}
}
}
/// Decision on what to do with a layer file after considering its local and remote metadata.
#[derive(Clone, Debug)]
pub(super) enum Decision {
@@ -102,11 +80,11 @@ pub(super) enum Decision {
/// The layer is present locally, but local metadata does not match remote; we must
/// delete it and treat it as evicted.
UseRemote {
local: LocalLayerFileMetadata,
local: LayerFileMetadata,
remote: LayerFileMetadata,
},
/// The layer is present locally, and metadata matches.
UseLocal(LocalLayerFileMetadata),
UseLocal(LayerFileMetadata),
}
/// A layer needs to be left out of the layer map.
@@ -114,42 +92,39 @@ pub(super) enum Decision {
pub(super) enum DismissedLayer {
/// The related layer is is in future compared to disk_consistent_lsn, it must not be loaded.
Future {
/// `None` if the layer is only known through [`IndexPart`].
local: Option<LocalLayerFileMetadata>,
/// The local metadata. `None` if the layer is only known through [`IndexPart`].
local: Option<LayerFileMetadata>,
},
/// The layer only exists locally.
///
/// In order to make crash safe updates to layer map, we must dismiss layers which are only
/// found locally or not yet included in the remote `index_part.json`.
LocalOnly(LocalLayerFileMetadata),
LocalOnly(LayerFileMetadata),
}
/// Merges local discoveries and remote [`IndexPart`] to a collection of decisions.
pub(super) fn reconcile(
discovered: Vec<(LayerName, Utf8PathBuf, u64)>,
discovered: Vec<(LayerFileName, u64)>,
index_part: Option<&IndexPart>,
disk_consistent_lsn: Lsn,
generation: Generation,
shard: ShardIndex,
) -> Vec<(LayerName, Result<Decision, DismissedLayer>)> {
) -> Vec<(LayerFileName, Result<Decision, DismissedLayer>)> {
use Decision::*;
// name => (local_metadata, remote_metadata)
type Collected =
HashMap<LayerName, (Option<LocalLayerFileMetadata>, Option<LayerFileMetadata>)>;
// name => (local, remote)
type Collected = HashMap<LayerFileName, (Option<LayerFileMetadata>, Option<LayerFileMetadata>)>;
let mut discovered = discovered
.into_iter()
.map(|(layer_name, local_path, file_size)| {
.map(|(name, file_size)| {
(
layer_name,
name,
// The generation and shard here will be corrected to match IndexPart in the merge below, unless
// it is not in IndexPart, in which case using our current generation makes sense
// because it will be uploaded in this generation.
(
Some(LocalLayerFileMetadata::new(
local_path, file_size, generation, shard,
)),
Some(LayerFileMetadata::new(file_size, generation, shard)),
None,
),
)
@@ -178,7 +153,7 @@ pub(super) fn reconcile(
Err(DismissedLayer::Future { local })
} else {
match (local, remote) {
(Some(local), Some(remote)) if local.metadata != remote => {
(Some(local), Some(remote)) if local != remote => {
Ok(UseRemote { local, remote })
}
(Some(x), Some(_)) => Ok(UseLocal(x)),
@@ -202,12 +177,12 @@ pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> anyhow::Result<()> {
}
pub(super) fn cleanup_local_file_for_remote(
local: &LocalLayerFileMetadata,
path: &Utf8Path,
local: &LayerFileMetadata,
remote: &LayerFileMetadata,
) -> anyhow::Result<()> {
let local_size = local.metadata.file_size();
let local_size = local.file_size();
let remote_size = remote.file_size();
let path = &local.local_path;
let file_name = path.file_name().expect("must be file path");
tracing::warn!("removing local file {file_name:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
@@ -224,7 +199,7 @@ pub(super) fn cleanup_local_file_for_remote(
pub(super) fn cleanup_future_layer(
path: &Utf8Path,
name: &LayerName,
name: &LayerFileName,
disk_consistent_lsn: Lsn,
) -> anyhow::Result<()> {
// future image layers are allowed to be produced always for not yet flushed to disk
@@ -236,14 +211,12 @@ pub(super) fn cleanup_future_layer(
}
pub(super) fn cleanup_local_only_file(
name: &LayerName,
local: &LocalLayerFileMetadata,
path: &Utf8Path,
name: &LayerFileName,
local: &LayerFileMetadata,
) -> anyhow::Result<()> {
let kind = name.kind();
tracing::info!(
"found local-only {kind} layer {name}, metadata {:?}",
local.metadata
);
std::fs::remove_file(&local.local_path)?;
tracing::info!("found local-only {kind} layer {name}, metadata {local:?}");
std::fs::remove_file(path)?;
Ok(())
}

View File

@@ -205,24 +205,6 @@ impl LayerManager {
updates.flush();
}
/// Called when compaction is completed.
pub(crate) fn rewrite_layers(
&mut self,
rewrite_layers: &[(Layer, ResidentLayer)],
drop_layers: &[Layer],
_metrics: &TimelineMetrics,
) {
let mut updates = self.layer_map.batch_update();
// TODO: implement rewrites (currently this code path only used for drops)
assert!(rewrite_layers.is_empty());
for l in drop_layers {
Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr);
}
updates.flush();
}
/// Called when garbage collect has selected the layers to be removed.
pub(crate) fn finish_gc_timeline(&mut self, gc_layers: &[Layer]) {
let mut updates = self.layer_map.batch_update();
@@ -294,7 +276,7 @@ impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
// A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
self.0
.get(&desc.key())
.with_context(|| format!("get layer from desc: {}", desc.layer_name()))
.with_context(|| format!("get layer from desc: {}", desc.filename()))
.expect("not found")
.clone()
}

View File

@@ -1,9 +1,8 @@
use super::storage_layer::LayerName;
use super::storage_layer::LayerFileName;
use super::storage_layer::ResidentLayer;
use crate::tenant::metadata::TimelineMetadata;
use crate::tenant::remote_timeline_client::index::IndexPart;
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
use crate::tenant::remote_timeline_client::index::Lineage;
use std::collections::{HashMap, VecDeque};
use std::fmt::Debug;
@@ -46,7 +45,7 @@ pub(crate) struct UploadQueueInitialized {
/// All layer files stored in the remote storage, taking into account all
/// in-progress and queued operations
pub(crate) latest_files: HashMap<LayerName, LayerFileMetadata>,
pub(crate) latest_files: HashMap<LayerFileName, LayerFileMetadata>,
/// How many file uploads or deletions been scheduled, since the
/// last (scheduling of) metadata index upload?
@@ -57,9 +56,6 @@ pub(crate) struct UploadQueueInitialized {
/// DANGER: do not return to outside world, e.g., safekeepers.
pub(crate) latest_metadata: TimelineMetadata,
/// Part of the flattened "next" `index_part.json`.
pub(crate) latest_lineage: Lineage,
/// `disk_consistent_lsn` from the last metadata file that was successfully
/// uploaded. `Lsn(0)` if nothing was uploaded yet.
/// Unlike `latest_files` or `latest_metadata`, this value is never ahead.
@@ -93,7 +89,7 @@ pub(crate) struct UploadQueueInitialized {
/// Putting this behind a testing feature to catch problems in tests, but assuming we could have a
/// bug causing leaks, then it's better to not leave this enabled for production builds.
#[cfg(feature = "testing")]
pub(crate) dangling_files: HashMap<LayerName, Generation>,
pub(crate) dangling_files: HashMap<LayerFileName, Generation>,
/// Set to true when we have inserted the `UploadOp::Shutdown` into the `inprogress_tasks`.
pub(crate) shutting_down: bool,
@@ -175,7 +171,6 @@ impl UploadQueue {
latest_files: HashMap::new(),
latest_files_changes_since_metadata_upload_scheduled: 0,
latest_metadata: metadata.clone(),
latest_lineage: Lineage::default(),
projected_remote_consistent_lsn: None,
visible_remote_consistent_lsn: Arc::new(AtomicLsn::new(0)),
// what follows are boring default initializations
@@ -223,7 +218,6 @@ impl UploadQueue {
latest_files: files,
latest_files_changes_since_metadata_upload_scheduled: 0,
latest_metadata: index_part.metadata.clone(),
latest_lineage: index_part.lineage.clone(),
projected_remote_consistent_lsn: Some(index_part.metadata.disk_consistent_lsn()),
visible_remote_consistent_lsn: Arc::new(
index_part.metadata.disk_consistent_lsn().into(),
@@ -287,7 +281,7 @@ pub(crate) struct UploadTask {
/// for timeline deletion, which skips this queue and goes directly to DeletionQueue.
#[derive(Debug)]
pub(crate) struct Delete {
pub(crate) layers: Vec<(LayerName, LayerFileMetadata)>,
pub(crate) layers: Vec<(LayerFileName, LayerFileMetadata)>,
}
#[derive(Debug)]
@@ -296,7 +290,7 @@ pub(crate) enum UploadOp {
UploadLayer(ResidentLayer, LayerFileMetadata),
/// Upload the metadata file
UploadMetadata(Box<IndexPart>, Lsn),
UploadMetadata(IndexPart, Lsn),
/// Delete layer files
Delete(Delete),

View File

@@ -14,8 +14,7 @@ OBJS = \
relsize_cache.o \
walproposer.o \
walproposer_pg.o \
control_plane_connector.o \
walsender_hooks.o
control_plane_connector.o
PG_CPPFLAGS = -I$(libpq_srcdir)
SHLIB_LINK_INTERNAL = $(libpq)

View File

@@ -49,7 +49,7 @@ char *neon_auth_token;
int readahead_buffer_size = 128;
int flush_every_n_requests = 8;
int neon_protocol_version = 2;
int neon_protocol_version = 1;
static int n_reconnect_attempts = 0;
static int max_reconnect_attempts = 60;
@@ -860,7 +860,7 @@ pg_init_libpagestore(void)
"Version of compute<->page server protocol",
NULL,
&neon_protocol_version,
2, /* use protocol version 2 */
1, /* default to old protocol for now */
1, /* min */
2, /* max */
PGC_SU_BACKEND,

View File

@@ -34,7 +34,6 @@
#include "walproposer.h"
#include "pagestore_client.h"
#include "control_plane_connector.h"
#include "walsender_hooks.h"
PG_MODULE_MAGIC;
void _PG_init(void);
@@ -266,6 +265,7 @@ LogicalSlotsMonitorMain(Datum main_arg)
}
}
void
_PG_init(void)
{
@@ -279,7 +279,6 @@ _PG_init(void)
pg_init_libpagestore();
pg_init_walproposer();
WalSender_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
InitLogicalReplicationMonitor();

View File

@@ -36,7 +36,10 @@
static NeonWALReadResult NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
static NeonWALReadResult NeonWALReaderReadMsg(NeonWALReader *state);
static void NeonWALReaderResetRemote(NeonWALReader *state);
static bool NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
static bool neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, TimeLineID *tli_p);
static void neon_wal_segment_close(NeonWALReader *state);
static bool is_wal_segment_exists(XLogSegNo segno, int segsize,
TimeLineID tli);
@@ -79,9 +82,8 @@ struct NeonWALReader
XLogRecPtr req_lsn;
Size req_len;
Size req_progress;
char donor_conninfo[MAXCONNINFO];
WalProposer *wp; /* we learn donor through walproposer */
char donor_name[64]; /* saved donor safekeeper name for logging */
XLogRecPtr donor_lsn;
/* state of connection to safekeeper */
NeonWALReaderRemoteState rem_state;
WalProposerConn *wp_conn;
@@ -105,7 +107,7 @@ struct NeonWALReader
/* palloc and initialize NeonWALReader */
NeonWALReader *
NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix)
NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix)
{
NeonWALReader *reader;
@@ -121,6 +123,8 @@ NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_
reader->seg.ws_tli = 0;
reader->segcxt.ws_segsize = wal_segment_size;
reader->wp = wp;
reader->rem_state = RS_NONE;
if (log_prefix)
@@ -200,16 +204,21 @@ NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size cou
{
if (state->rem_state == RS_NONE)
{
if (!NeonWALReaderUpdateDonor(state))
XLogRecPtr donor_lsn;
/* no connection yet; start one */
Safekeeper *donor = GetDonor(state->wp, &donor_lsn);
if (donor == NULL)
{
snprintf(state->err_msg, sizeof(state->err_msg),
"failed to establish remote connection to fetch WAL: no donor available");
return NEON_WALREAD_ERROR;
}
/* no connection yet; start one */
nwr_log(LOG, "establishing connection to %s, lsn=%X/%X to fetch WAL", state->donor_name, LSN_FORMAT_ARGS(state->donor_lsn));
state->wp_conn = libpqwp_connect_start(state->donor_conninfo);
snprintf(state->donor_name, sizeof(state->donor_name), "%s:%s", donor->host, donor->port);
nwr_log(LOG, "establishing connection to %s, flush_lsn %X/%X to fetch WAL",
state->donor_name, LSN_FORMAT_ARGS(donor_lsn));
state->wp_conn = libpqwp_connect_start(donor->conninfo);
if (PQstatus(state->wp_conn->pg_conn) == CONNECTION_BAD)
{
snprintf(state->err_msg, sizeof(state->err_msg),
@@ -242,22 +251,10 @@ NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size cou
{
/* connection successfully established */
char start_repl_query[128];
term_t term = pg_atomic_read_u64(&GetWalpropShmemState()->mineLastElectedTerm);
/*
* Set elected walproposer's term to pull only data from
* its history. Note: for logical walsender it means we
* might stream WAL not yet committed by safekeepers. It
* would be cleaner to fix this.
*
* mineLastElectedTerm shouldn't be 0 at this point
* because we checked above that donor exists and it
* appears only after successfull election.
*/
Assert(term > 0);
snprintf(start_repl_query, sizeof(start_repl_query),
"START_REPLICATION PHYSICAL %X/%X (term='" UINT64_FORMAT "')",
LSN_FORMAT_ARGS(startptr), term);
LSN_FORMAT_ARGS(startptr), state->wp->propTerm);
nwr_log(LOG, "connection to %s to fetch WAL succeeded, running %s",
state->donor_name, start_repl_query);
if (!libpqwp_send_query(state->wp_conn, start_repl_query))
@@ -407,10 +404,6 @@ NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size cou
state->req_lsn = InvalidXLogRecPtr;
state->req_len = 0;
state->req_progress = 0;
/* Update the current segment info. */
state->seg.ws_tli = tli;
return NEON_WALREAD_SUCCESS;
}
}
@@ -533,7 +526,7 @@ err:
}
/* reset remote connection and request in progress */
void
static void
NeonWALReaderResetRemote(NeonWALReader *state)
{
state->req_lsn = InvalidXLogRecPtr;
@@ -698,25 +691,13 @@ NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size coun
return true;
}
XLogRecPtr
NeonWALReaderGetRemLsn(NeonWALReader *state)
{
return state->rem_lsn;
}
const WALOpenSegment *
NeonWALReaderGetSegment(NeonWALReader *state)
{
return &state->seg;
}
/*
* Copy of vanilla wal_segment_open, but returns false in case of error instead
* of ERROR, with errno set.
*
* XLogReaderRoutine->segment_open callback for local pg_wal files
*/
bool
static bool
neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo,
TimeLineID *tli_p)
{
@@ -743,7 +724,7 @@ is_wal_segment_exists(XLogSegNo segno, int segsize, TimeLineID tli)
}
/* copy of vanilla wal_segment_close with NeonWALReader */
void
static void
neon_wal_segment_close(NeonWALReader *state)
{
if (state->seg.ws_file >= 0)
@@ -759,19 +740,3 @@ NeonWALReaderErrMsg(NeonWALReader *state)
{
return state->err_msg;
}
/*
* Returns true if there is a donor, and false otherwise
*/
bool
NeonWALReaderUpdateDonor(NeonWALReader *state)
{
WalproposerShmemState *wps = GetWalpropShmemState();
SpinLockAcquire(&wps->mutex);
memcpy(state->donor_name, wps->donor_name, sizeof(state->donor_name));
memcpy(state->donor_conninfo, wps->donor_conninfo, sizeof(state->donor_conninfo));
state->donor_lsn = wps->donor_lsn;
SpinLockRelease(&wps->mutex);
return state->donor_name[0] != '\0';
}

View File

@@ -19,19 +19,12 @@ typedef enum
NEON_WALREAD_ERROR,
} NeonWALReadResult;
extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix);
extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix);
extern void NeonWALReaderFree(NeonWALReader *state);
extern void NeonWALReaderResetRemote(NeonWALReader *state);
extern NeonWALReadResult NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
extern pgsocket NeonWALReaderSocket(NeonWALReader *state);
extern uint32 NeonWALReaderEvents(NeonWALReader *state);
extern bool NeonWALReaderIsRemConnEstablished(NeonWALReader *state);
extern char *NeonWALReaderErrMsg(NeonWALReader *state);
extern XLogRecPtr NeonWALReaderGetRemLsn(NeonWALReader *state);
extern const WALOpenSegment *NeonWALReaderGetSegment(NeonWALReader *state);
extern bool neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, TimeLineID *tli_p);
extern void neon_wal_segment_close(NeonWALReader *state);
extern bool NeonWALReaderUpdateDonor(NeonWALReader *state);
#endif /* __NEON_WALREADER_H__ */

View File

@@ -80,7 +80,7 @@ static int CompareLsn(const void *a, const void *b);
static char *FormatSafekeeperState(Safekeeper *sk);
static void AssertEventsOkForState(uint32 events, Safekeeper *sk);
static char *FormatEvents(WalProposer *wp, uint32 events);
static void UpdateDonorShmem(WalProposer *wp);
WalProposer *
WalProposerCreate(WalProposerConfig *config, walproposer_api api)
@@ -922,8 +922,7 @@ static void
DetermineEpochStartLsn(WalProposer *wp)
{
TermHistory *dth;
int n_ready = 0;
WalproposerShmemState *walprop_shared;
int n_ready = 0;
wp->propEpochStartLsn = InvalidXLogRecPtr;
wp->donorEpoch = 0;
@@ -965,18 +964,16 @@ DetermineEpochStartLsn(WalProposer *wp)
if (n_ready < wp->quorum)
{
/*
* This is a rare case that can be triggered if safekeeper has voted
* and disconnected. In this case, its state will not be SS_IDLE and
* its vote cannot be used, because we clean up `voteResponse` in
* `ShutdownConnection`.
* This is a rare case that can be triggered if safekeeper has voted and disconnected.
* In this case, its state will not be SS_IDLE and its vote cannot be used, because
* we clean up `voteResponse` in `ShutdownConnection`.
*/
wp_log(FATAL, "missing majority of votes, collected %d, expected %d, got %d", wp->n_votes, wp->quorum, n_ready);
}
/*
* If propEpochStartLsn is 0, it means flushLsn is 0 everywhere, we are
* bootstrapping and nothing was committed yet. Start streaming then from
* the basebackup LSN.
* If propEpochStartLsn is 0, it means flushLsn is 0 everywhere, we are bootstrapping
* and nothing was committed yet. Start streaming then from the basebackup LSN.
*/
if (wp->propEpochStartLsn == InvalidXLogRecPtr && !wp->config->syncSafekeepers)
{
@@ -987,12 +984,11 @@ DetermineEpochStartLsn(WalProposer *wp)
}
wp_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn));
}
pg_atomic_write_u64(&wp->api.get_shmem_state(wp)->propEpochStartLsn, wp->propEpochStartLsn);
/*
* Safekeepers are setting truncateLsn after timelineStartLsn is known, so
* it should never be zero at this point, if we know timelineStartLsn.
*
* Safekeepers are setting truncateLsn after timelineStartLsn is known, so it
* should never be zero at this point, if we know timelineStartLsn.
*
* timelineStartLsn can be zero only on the first syncSafekeepers run.
*/
Assert((wp->truncateLsn != InvalidXLogRecPtr) ||
@@ -1026,9 +1022,10 @@ DetermineEpochStartLsn(WalProposer *wp)
* since which we are going to write according to the consensus. If not,
* we must bail out, as clog and other non rel data is inconsistent.
*/
walprop_shared = wp->api.get_shmem_state(wp);
if (!wp->config->syncSafekeepers)
{
WalproposerShmemState *walprop_shared = wp->api.get_shmem_state(wp);
/*
* Basebackup LSN always points to the beginning of the record (not
* the page), as StartupXLOG most probably wants it this way.
@@ -1043,7 +1040,7 @@ DetermineEpochStartLsn(WalProposer *wp)
* compute (who could generate WAL) is ok.
*/
if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term ==
pg_atomic_read_u64(&walprop_shared->mineLastElectedTerm))))
walprop_shared->mineLastElectedTerm)))
{
/*
* Panic to restart PG as we need to retake basebackup.
@@ -1057,8 +1054,8 @@ DetermineEpochStartLsn(WalProposer *wp)
LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp)));
}
}
walprop_shared->mineLastElectedTerm = wp->propTerm;
}
pg_atomic_write_u64(&walprop_shared->mineLastElectedTerm, wp->propTerm);
}
/*
@@ -1108,13 +1105,9 @@ SendProposerElected(Safekeeper *sk)
{
/* safekeeper is empty or no common point, start from the beginning */
sk->startStreamingAt = wp->propTermHistory.entries[0].lsn;
wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u",
sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries);
/*
* wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline
* is created manually (test_s3_wal_replay)
*/
wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u" ,
sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries);
/* wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline is created manually (test_s3_wal_replay) */
Assert(sk->startStreamingAt == wp->timelineStartLsn || wp->timelineStartLsn == InvalidXLogRecPtr);
}
else
@@ -1184,12 +1177,6 @@ StartStreaming(Safekeeper *sk)
sk->active_state = SS_ACTIVE_SEND;
sk->streamingAt = sk->startStreamingAt;
/*
* Donors can only be in SS_ACTIVE state, so we potentially update the
* donor when we switch one to SS_ACTIVE.
*/
UpdateDonorShmem(sk->wp);
/* event set will be updated inside SendMessageToNode */
SendMessageToNode(sk);
}
@@ -1581,17 +1568,17 @@ GetAcknowledgedByQuorumWALPosition(WalProposer *wp)
* none if it doesn't exist. donor_lsn is set to end position of the donor to
* the best of our knowledge.
*/
static void
UpdateDonorShmem(WalProposer *wp)
Safekeeper *
GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn)
{
Safekeeper *donor = NULL;
int i;
XLogRecPtr donor_lsn = InvalidXLogRecPtr;
*donor_lsn = InvalidXLogRecPtr;
if (wp->n_votes < wp->quorum)
{
wp_log(WARNING, "UpdateDonorShmem called before elections are won");
return;
wp_log(WARNING, "GetDonor called before elections are won");
return NULL;
}
/*
@@ -1602,7 +1589,7 @@ UpdateDonorShmem(WalProposer *wp)
if (wp->safekeeper[wp->donor].state >= SS_IDLE)
{
donor = &wp->safekeeper[wp->donor];
donor_lsn = wp->propEpochStartLsn;
*donor_lsn = wp->propEpochStartLsn;
}
/*
@@ -1614,19 +1601,13 @@ UpdateDonorShmem(WalProposer *wp)
{
Safekeeper *sk = &wp->safekeeper[i];
if (sk->state == SS_ACTIVE && sk->appendResponse.flushLsn > donor_lsn)
if (sk->state == SS_ACTIVE && sk->appendResponse.flushLsn > *donor_lsn)
{
donor = sk;
donor_lsn = sk->appendResponse.flushLsn;
*donor_lsn = sk->appendResponse.flushLsn;
}
}
if (donor == NULL)
{
wp_log(WARNING, "UpdateDonorShmem didn't find a suitable donor, skipping");
return;
}
wp->api.update_donor(wp, donor, donor_lsn);
return donor;
}
/*
@@ -1636,7 +1617,7 @@ static void
HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk)
{
XLogRecPtr candidateTruncateLsn;
XLogRecPtr newCommitLsn;
XLogRecPtr newCommitLsn;
newCommitLsn = GetAcknowledgedByQuorumWALPosition(wp);
if (newCommitLsn > wp->commitLsn)
@@ -1646,7 +1627,7 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk)
BroadcastAppendRequest(wp);
}
/*
/*
* Unlock syncrep waiters, update ps_feedback, CheckGracefulShutdown().
* The last one will terminate the process if the shutdown is requested
* and WAL is committed by the quorum. BroadcastAppendRequest() should be

View File

@@ -284,19 +284,14 @@ typedef struct PageserverFeedback
typedef struct WalproposerShmemState
{
pg_atomic_uint64 propEpochStartLsn;
char donor_name[64];
char donor_conninfo[MAXCONNINFO];
XLogRecPtr donor_lsn;
slock_t mutex;
pg_atomic_uint64 mineLastElectedTerm;
term_t mineLastElectedTerm;
pg_atomic_uint64 backpressureThrottlingTime;
pg_atomic_uint64 currentClusterSize;
/* last feedback from each shard */
PageserverFeedback shard_ps_feedback[MAX_SHARDS];
int num_shards;
int num_shards;
/* aggregated feedback with min LSNs across shards */
PageserverFeedback min_ps_feedback;
@@ -470,9 +465,6 @@ typedef struct walproposer_api
/* Get pointer to the latest available WAL. */
XLogRecPtr (*get_flush_rec_ptr) (WalProposer *wp);
/* Update current donor info in WalProposer Shmem */
void (*update_donor) (WalProposer *wp, Safekeeper *donor, XLogRecPtr donor_lsn);
/* Get current time. */
TimestampTz (*get_current_timestamp) (WalProposer *wp);
@@ -505,7 +497,7 @@ typedef struct walproposer_api
*
* On success, the data is placed in *buf. It is valid until the next call
* to this function.
*
*
* Returns PG_ASYNC_READ_FAIL on closed connection.
*/
PGAsyncReadResult (*conn_async_read) (Safekeeper *sk, char **buf, int *amount);
@@ -553,14 +545,13 @@ typedef struct walproposer_api
* Returns 0 if timeout is reached, 1 if some event happened. Updates
* events mask to indicate events and sets sk to the safekeeper which has
* an event.
*
*
* On timeout, events is set to WL_NO_EVENTS. On socket event, events is
* set to WL_SOCKET_READABLE and/or WL_SOCKET_WRITEABLE. When socket is
* closed, events is set to WL_SOCKET_READABLE.
*
* WL_SOCKET_WRITEABLE is usually set only when we need to flush the
* buffer. It can be returned only if caller asked for this event in the
* last *_event_set call.
*
* WL_SOCKET_WRITEABLE is usually set only when we need to flush the buffer.
* It can be returned only if caller asked for this event in the last *_event_set call.
*/
int (*wait_event_set) (WalProposer *wp, long timeout, Safekeeper **sk, uint32 *events);
@@ -580,9 +571,9 @@ typedef struct walproposer_api
void (*finish_sync_safekeepers) (WalProposer *wp, XLogRecPtr lsn);
/*
* Called after every AppendResponse from the safekeeper. Used to
* propagate backpressure feedback and to confirm WAL persistence (has
* been commited on the quorum of safekeepers).
* Called after every AppendResponse from the safekeeper. Used to propagate
* backpressure feedback and to confirm WAL persistence (has been commited
* on the quorum of safekeepers).
*/
void (*process_safekeeper_feedback) (WalProposer *wp, Safekeeper *sk);
@@ -725,14 +716,12 @@ extern void WalProposerBroadcast(WalProposer *wp, XLogRecPtr startpos, XLogRecPt
extern void WalProposerPoll(WalProposer *wp);
extern void WalProposerFree(WalProposer *wp);
extern WalproposerShmemState *GetWalpropShmemState();
/*
* WaitEventSet API doesn't allow to remove socket, so walproposer_pg uses it to
* recreate set from scratch, hence the export.
*/
extern void SafekeeperStateDesiredEvents(Safekeeper *sk, uint32 *sk_events, uint32 *nwr_events);
extern TimeLineID walprop_pg_get_timeline_id(void);
extern Safekeeper *GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn);
#define WPEVENT 1337 /* special log level for walproposer internal

View File

@@ -85,6 +85,7 @@ static void walprop_pg_init_standalone_sync_safekeepers(void);
static void walprop_pg_init_walsender(void);
static void walprop_pg_init_bgworker(void);
static TimestampTz walprop_pg_get_current_timestamp(WalProposer *wp);
static TimeLineID walprop_pg_get_timeline_id(void);
static void walprop_pg_load_libpqwalreceiver(void);
static process_interrupts_callback_t PrevProcessInterruptsCallback;
@@ -93,8 +94,6 @@ static shmem_startup_hook_type prev_shmem_startup_hook_type;
static shmem_request_hook_type prev_shmem_request_hook = NULL;
static void walproposer_shmem_request(void);
#endif
static void WalproposerShmemInit_SyncSafekeeper(void);
static void StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd);
static void WalSndLoop(WalProposer *wp);
@@ -137,7 +136,6 @@ WalProposerSync(int argc, char *argv[])
WalProposer *wp;
init_walprop_config(true);
WalproposerShmemInit_SyncSafekeeper();
walprop_pg_init_standalone_sync_safekeepers();
walprop_pg_load_libpqwalreceiver();
@@ -283,8 +281,6 @@ WalproposerShmemInit(void)
{
memset(walprop_shared, 0, WalproposerShmemSize());
SpinLockInit(&walprop_shared->mutex);
pg_atomic_init_u64(&walprop_shared->propEpochStartLsn, 0);
pg_atomic_init_u64(&walprop_shared->mineLastElectedTerm, 0);
pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0);
pg_atomic_init_u64(&walprop_shared->currentClusterSize, 0);
}
@@ -293,17 +289,6 @@ WalproposerShmemInit(void)
return found;
}
static void
WalproposerShmemInit_SyncSafekeeper(void)
{
walprop_shared = palloc(WalproposerShmemSize());
memset(walprop_shared, 0, WalproposerShmemSize());
SpinLockInit(&walprop_shared->mutex);
pg_atomic_init_u64(&walprop_shared->propEpochStartLsn, 0);
pg_atomic_init_u64(&walprop_shared->mineLastElectedTerm, 0);
pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0);
}
#define BACK_PRESSURE_DELAY 10000L // 0.01 sec
static bool
@@ -414,13 +399,6 @@ nwp_shmem_startup_hook(void)
WalproposerShmemInit();
}
WalproposerShmemState *
GetWalpropShmemState()
{
Assert(walprop_shared != NULL);
return walprop_shared;
}
static WalproposerShmemState *
walprop_pg_get_shmem_state(WalProposer *wp)
{
@@ -453,15 +431,14 @@ record_pageserver_feedback(PageserverFeedback *ps_feedback)
for (int i = 0; i < walprop_shared->num_shards; i++)
{
PageserverFeedback *feedback = &walprop_shared->shard_ps_feedback[i];
if (feedback->present)
{
if (min_feedback.last_received_lsn == InvalidXLogRecPtr || feedback->last_received_lsn < min_feedback.last_received_lsn)
min_feedback.last_received_lsn = feedback->last_received_lsn;
if (min_feedback.disk_consistent_lsn == InvalidXLogRecPtr || feedback->disk_consistent_lsn < min_feedback.disk_consistent_lsn)
min_feedback.disk_consistent_lsn = feedback->disk_consistent_lsn;
if (min_feedback.remote_consistent_lsn == InvalidXLogRecPtr || feedback->remote_consistent_lsn < min_feedback.remote_consistent_lsn)
min_feedback.remote_consistent_lsn = feedback->remote_consistent_lsn;
}
@@ -574,7 +551,6 @@ static void
walprop_sigusr2(SIGNAL_ARGS)
{
int save_errno = errno;
got_SIGUSR2 = true;
SetLatch(MyLatch);
errno = save_errno;
@@ -622,7 +598,7 @@ walprop_pg_get_current_timestamp(WalProposer *wp)
return GetCurrentTimestamp();
}
TimeLineID
static TimeLineID
walprop_pg_get_timeline_id(void)
{
#if PG_VERSION_NUM >= 150000
@@ -641,20 +617,6 @@ walprop_pg_load_libpqwalreceiver(void)
wpg_log(ERROR, "libpqwalreceiver didn't initialize correctly");
}
static void
walprop_pg_update_donor(WalProposer *wp, Safekeeper *donor, XLogRecPtr donor_lsn)
{
WalproposerShmemState *wps = wp->api.get_shmem_state(wp);
char donor_name[64];
pg_snprintf(donor_name, sizeof(donor_name), "%s:%s", donor->host, donor->port);
SpinLockAcquire(&wps->mutex);
memcpy(wps->donor_name, donor_name, sizeof(donor_name));
memcpy(wps->donor_conninfo, donor->conninfo, sizeof(donor->conninfo));
wps->donor_lsn = donor_lsn;
SpinLockRelease(&wps->mutex);
}
/* Helper function */
static bool
ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking)
@@ -755,6 +717,7 @@ walprop_connect_start(Safekeeper *sk)
{
Assert(sk->conn == NULL);
sk->conn = libpqwp_connect_start(sk->conninfo);
}
static WalProposerConnectPollStatusType
@@ -1128,7 +1091,7 @@ static void
StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd)
{
XLogRecPtr FlushPtr;
__attribute__((unused)) TimeLineID currTLI;
__attribute__((unused)) TimeLineID currTLI;
#if PG_VERSION_NUM < 150000
if (ThisTimeLineID == 0)
@@ -1332,13 +1295,116 @@ XLogBroadcastWalProposer(WalProposer *wp)
}
}
/*
Used to download WAL before basebackup for logical walsenders from sk, no longer
needed because walsender always uses neon_walreader.
*/
/* Download WAL before basebackup for logical walsenders from sk, if needed */
static bool
WalProposerRecovery(WalProposer *wp, Safekeeper *sk)
{
char *err;
WalReceiverConn *wrconn;
WalRcvStreamOptions options;
char conninfo[MAXCONNINFO];
TimeLineID timeline;
XLogRecPtr startpos;
XLogRecPtr endpos;
startpos = GetLogRepRestartLSN(wp);
if (startpos == InvalidXLogRecPtr)
return true; /* recovery not needed */
endpos = wp->propEpochStartLsn;
timeline = wp->greetRequest.timeline;
if (!neon_auth_token)
{
memcpy(conninfo, sk->conninfo, MAXCONNINFO);
}
else
{
int written = 0;
written = snprintf((char *) conninfo, MAXCONNINFO, "password=%s %s", neon_auth_token, sk->conninfo);
if (written > MAXCONNINFO || written < 0)
wpg_log(FATAL, "could not append password to the safekeeper connection string");
}
#if PG_MAJORVERSION_NUM < 16
wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err);
#else
wrconn = walrcv_connect(conninfo, false, false, "wal_proposer_recovery", &err);
#endif
if (!wrconn)
{
ereport(WARNING,
(errmsg("could not connect to WAL acceptor %s:%s: %s",
sk->host, sk->port,
err)));
return false;
}
wpg_log(LOG,
"start recovery for logical replication from %s:%s starting from %X/%08X till %X/%08X timeline "
"%d",
sk->host, sk->port, (uint32) (startpos >> 32),
(uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline);
options.logical = false;
options.startpoint = startpos;
options.slotname = NULL;
options.proto.physical.startpointTLI = timeline;
if (walrcv_startstreaming(wrconn, &options))
{
XLogRecPtr rec_start_lsn;
XLogRecPtr rec_end_lsn = 0;
int len;
char *buf;
pgsocket wait_fd = PGINVALID_SOCKET;
while ((len = walrcv_receive(wrconn, &buf, &wait_fd)) >= 0)
{
if (len == 0)
{
(void) WaitLatchOrSocket(
MyLatch, WL_EXIT_ON_PM_DEATH | WL_SOCKET_READABLE, wait_fd,
-1, WAIT_EVENT_WAL_RECEIVER_MAIN);
}
else
{
Assert(buf[0] == 'w' || buf[0] == 'k');
if (buf[0] == 'k')
continue; /* keepalive */
memcpy(&rec_start_lsn, &buf[XLOG_HDR_START_POS],
sizeof rec_start_lsn);
rec_start_lsn = pg_ntoh64(rec_start_lsn);
rec_end_lsn = rec_start_lsn + len - XLOG_HDR_SIZE;
/* write WAL to disk */
XLogWalPropWrite(sk->wp, &buf[XLOG_HDR_SIZE], len - XLOG_HDR_SIZE, rec_start_lsn);
ereport(DEBUG1,
(errmsg("Recover message %X/%X length %d",
LSN_FORMAT_ARGS(rec_start_lsn), len)));
if (rec_end_lsn >= endpos)
break;
}
}
ereport(LOG,
(errmsg("end of replication stream at %X/%X: %m",
LSN_FORMAT_ARGS(rec_end_lsn))));
walrcv_disconnect(wrconn);
/* failed to receive all WAL till endpos */
if (rec_end_lsn < endpos)
return false;
}
else
{
ereport(LOG,
(errmsg("primary server contains no more WAL on requested timeline %u LSN %X/%08X",
timeline, (uint32) (startpos >> 32), (uint32) startpos)));
return false;
}
return true;
}
@@ -1479,7 +1545,7 @@ walprop_pg_wal_reader_allocate(Safekeeper *sk)
snprintf(log_prefix, sizeof(log_prefix), WP_LOG_PREFIX "sk %s:%s nwr: ", sk->host, sk->port);
Assert(!sk->xlogreader);
sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, log_prefix);
sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, sk->wp, log_prefix);
if (sk->xlogreader == NULL)
wpg_log(FATAL, "failed to allocate xlog reader");
}
@@ -1894,8 +1960,8 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp)
static void
walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk)
{
HotStandbyFeedback hsFeedback;
bool needToAdvanceSlot = false;
HotStandbyFeedback hsFeedback;
bool needToAdvanceSlot = false;
if (wp->config->syncSafekeepers)
return;
@@ -2029,25 +2095,22 @@ GetLogRepRestartLSN(WalProposer *wp)
return lrRestartLsn;
}
void
SetNeonCurrentClusterSize(uint64 size)
void SetNeonCurrentClusterSize(uint64 size)
{
pg_atomic_write_u64(&walprop_shared->currentClusterSize, size);
}
uint64
GetNeonCurrentClusterSize(void)
uint64 GetNeonCurrentClusterSize(void)
{
return pg_atomic_read_u64(&walprop_shared->currentClusterSize);
}
uint64 GetNeonCurrentClusterSize(void);
uint64 GetNeonCurrentClusterSize(void);
static const walproposer_api walprop_pg = {
.get_shmem_state = walprop_pg_get_shmem_state,
.start_streaming = walprop_pg_start_streaming,
.get_flush_rec_ptr = walprop_pg_get_flush_rec_ptr,
.update_donor = walprop_pg_update_donor,
.get_current_timestamp = walprop_pg_get_current_timestamp,
.conn_error_message = walprop_error_message,
.conn_status = walprop_status,

View File

@@ -1,172 +0,0 @@
/*-------------------------------------------------------------------------
*
* walsender_hooks.c
*
* Implements XLogReaderRoutine in terms of NeonWALReader. Allows for
* fetching WAL from safekeepers, which normal xlogreader can't do.
*
*-------------------------------------------------------------------------
*/
#include "walsender_hooks.h"
#include "postgres.h"
#include "fmgr.h"
#include "access/xlogdefs.h"
#include "replication/walsender.h"
#include "access/xlog.h"
#include "access/xlog_internal.h"
#include "access/xlogreader.h"
#include "miscadmin.h"
#include "utils/wait_event.h"
#include "utils/guc.h"
#include "postmaster/interrupt.h"
#include "neon_walreader.h"
#include "walproposer.h"
static NeonWALReader *wal_reader = NULL;
extern XLogRecPtr WalSndWaitForWal(XLogRecPtr loc);
extern bool GetDonorShmem(XLogRecPtr *donor_lsn);
static XLogRecPtr
NeonWALReadWaitForWAL(XLogRecPtr loc)
{
while (!NeonWALReaderUpdateDonor(wal_reader))
{
pg_usleep(1000);
CHECK_FOR_INTERRUPTS();
}
return WalSndWaitForWal(loc);
}
static int
NeonWALPageRead(
XLogReaderState *xlogreader,
XLogRecPtr targetPagePtr,
int reqLen,
XLogRecPtr targetRecPtr,
char *readBuf)
{
XLogRecPtr rem_lsn;
/* Wait for flush pointer to advance past our request */
XLogRecPtr flushptr = NeonWALReadWaitForWAL(targetPagePtr + reqLen);
int count;
if (flushptr < targetPagePtr + reqLen)
return -1;
/* Read at most XLOG_BLCKSZ bytes */
if (targetPagePtr + XLOG_BLCKSZ <= flushptr)
count = XLOG_BLCKSZ;
else
count = flushptr - targetPagePtr;
/*
* Sometimes walsender requests non-monotonic sequences of WAL. If that's
* the case, we have to reset streaming from remote at the correct
* position. For example, walsender may try to verify the segment header
* when trying to read in the middle of it.
*/
rem_lsn = NeonWALReaderGetRemLsn(wal_reader);
if (rem_lsn != InvalidXLogRecPtr && targetPagePtr != rem_lsn)
{
NeonWALReaderResetRemote(wal_reader);
}
for (;;)
{
NeonWALReadResult res = NeonWALRead(
wal_reader,
readBuf,
targetPagePtr,
count,
walprop_pg_get_timeline_id());
if (res == NEON_WALREAD_SUCCESS)
{
/*
* Setting ws_tli is required by the XLogReaderRoutine, it is used
* for segment name generation in error reports.
*
* ReadPageInternal updates ws_segno after calling cb on its own
* and XLogReaderRoutine description doesn't require it, but
* WALRead sets, let's follow it.
*/
xlogreader->seg.ws_tli = NeonWALReaderGetSegment(wal_reader)->ws_tli;
xlogreader->seg.ws_segno = NeonWALReaderGetSegment(wal_reader)->ws_segno;
/*
* ws_file doesn't exist in case of remote read, and isn't used by
* xlogreader except by WALRead on which we don't rely anyway.
*/
return count;
}
if (res == NEON_WALREAD_ERROR)
{
elog(ERROR, "[walsender] Failed to read WAL (req_lsn=%X/%X, len=%d): %s",
LSN_FORMAT_ARGS(targetPagePtr),
reqLen,
NeonWALReaderErrMsg(wal_reader));
return -1;
}
/*
* Res is WOULDBLOCK, so we wait on the socket, recreating event set
* if necessary
*/
{
pgsocket sock = NeonWALReaderSocket(wal_reader);
uint32_t reader_events = NeonWALReaderEvents(wal_reader);
long timeout_ms = 1000;
ResetLatch(MyLatch);
CHECK_FOR_INTERRUPTS();
if (ConfigReloadPending)
{
ConfigReloadPending = false;
ProcessConfigFile(PGC_SIGHUP);
}
WaitLatchOrSocket(
MyLatch,
WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | reader_events,
sock,
timeout_ms,
WAIT_EVENT_WAL_SENDER_MAIN);
}
}
}
static void
NeonWALReadSegmentOpen(XLogReaderState *xlogreader, XLogSegNo nextSegNo, TimeLineID *tli_p)
{
neon_wal_segment_open(wal_reader, nextSegNo, tli_p);
xlogreader->seg.ws_file = NeonWALReaderGetSegment(wal_reader)->ws_file;
}
static void
NeonWALReadSegmentClose(XLogReaderState *xlogreader)
{
neon_wal_segment_close(wal_reader);
xlogreader->seg.ws_file = NeonWALReaderGetSegment(wal_reader)->ws_file;
}
void
NeonOnDemandXLogReaderRoutines(XLogReaderRoutine *xlr)
{
if (!wal_reader)
{
XLogRecPtr epochStartLsn = pg_atomic_read_u64(&GetWalpropShmemState()->propEpochStartLsn);
if (epochStartLsn == 0)
{
elog(ERROR, "Unable to start walsender when propEpochStartLsn is 0!");
}
wal_reader = NeonWALReaderAllocate(wal_segment_size, epochStartLsn, "[walsender] ");
}
xlr->page_read = NeonWALPageRead;
xlr->segment_open = NeonWALReadSegmentOpen;
xlr->segment_close = NeonWALReadSegmentClose;
}

View File

@@ -1,7 +0,0 @@
#ifndef __WALSENDER_HOOKS_H__
#define __WALSENDER_HOOKS_H__
struct XLogReaderRoutine;
void NeonOnDemandXLogReaderRoutines(struct XLogReaderRoutine *xlr);
#endif

2732
poetry.lock generated

File diff suppressed because one or more lines are too long

View File

@@ -40,7 +40,6 @@ hyper.workspace = true
hyper1 = { package = "hyper", version = "1.2", features = ["server"] }
hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] }
http-body-util = { version = "0.1" }
indexmap.workspace = true
ipnet.workspace = true
itertools.workspace = true
lasso = { workspace = true, features = ["multi-threaded"] }

View File

@@ -13,7 +13,7 @@ use tokio_postgres::config::AuthKeys;
use tracing::{info, warn};
use crate::auth::credentials::check_peer_addr_is_in_list;
use crate::auth::{validate_password_and_exchange, AuthError};
use crate::auth::validate_password_and_exchange;
use crate::cache::Cached;
use crate::console::errors::GetAuthInfoError;
use crate::console::provider::{CachedRoleSecret, ConsoleBackend};
@@ -23,7 +23,7 @@ use crate::intern::EndpointIdInt;
use crate::metrics::Metrics;
use crate::proxy::connect_compute::ComputeConnectBackend;
use crate::proxy::NeonOptions;
use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter, RateBucketInfo};
use crate::rate_limiter::{BucketRateLimiter, RateBucketInfo};
use crate::stream::Stream;
use crate::{
auth::{self, ComputeUserInfoMaybeEndpoint},
@@ -280,7 +280,6 @@ async fn auth_quirks(
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
allow_cleartext: bool,
config: &'static AuthenticationConfig,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
) -> auth::Result<ComputeCredentials> {
// If there's no project so far, that entails that client doesn't
// support SNI or other means of passing the endpoint (project) name.
@@ -306,10 +305,6 @@ async fn auth_quirks(
if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) {
return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr));
}
if !endpoint_rate_limiter.check(info.endpoint.clone().into(), 1) {
return Err(AuthError::too_many_connections());
}
let cached_secret = match maybe_secret {
Some(secret) => secret,
None => api.get_role_secret(ctx, &info).await?,
@@ -422,7 +417,6 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
allow_cleartext: bool,
config: &'static AuthenticationConfig,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
) -> auth::Result<BackendType<'a, ComputeCredentials, NodeInfo>> {
use BackendType::*;
@@ -434,16 +428,8 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
"performing authentication using the console"
);
let credentials = auth_quirks(
ctx,
&*api,
user_info,
client,
allow_cleartext,
config,
endpoint_rate_limiter,
)
.await?;
let credentials =
auth_quirks(ctx, &*api, user_info, client, allow_cleartext, config).await?;
BackendType::Console(api, credentials)
}
// NOTE: this auth backend doesn't use client credentials.
@@ -553,7 +539,7 @@ mod tests {
},
context::RequestMonitoring,
proxy::NeonOptions,
rate_limiter::{EndpointRateLimiter, RateBucketInfo},
rate_limiter::RateBucketInfo,
scram::ServerSecret,
stream::{PqStream, Stream},
};
@@ -713,20 +699,10 @@ mod tests {
_ => panic!("wrong message"),
}
});
let endpoint_rate_limiter =
Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET));
let _creds = auth_quirks(
&mut ctx,
&api,
user_info,
&mut stream,
false,
&CONFIG,
endpoint_rate_limiter,
)
.await
.unwrap();
let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, false, &CONFIG)
.await
.unwrap();
handle.await.unwrap();
}
@@ -763,20 +739,10 @@ mod tests {
frontend::password_message(b"my-secret-password", &mut write).unwrap();
client.write_all(&write).await.unwrap();
});
let endpoint_rate_limiter =
Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET));
let _creds = auth_quirks(
&mut ctx,
&api,
user_info,
&mut stream,
true,
&CONFIG,
endpoint_rate_limiter,
)
.await
.unwrap();
let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, &CONFIG)
.await
.unwrap();
handle.await.unwrap();
}
@@ -814,20 +780,9 @@ mod tests {
client.write_all(&write).await.unwrap();
});
let endpoint_rate_limiter =
Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET));
let creds = auth_quirks(
&mut ctx,
&api,
user_info,
&mut stream,
true,
&CONFIG,
endpoint_rate_limiter,
)
.await
.unwrap();
let creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, &CONFIG)
.await
.unwrap();
assert_eq!(creds.info.endpoint, "my-endpoint");

View File

@@ -27,7 +27,6 @@ use proxy::redis::cancellation_publisher::RedisPublisherClient;
use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
use proxy::redis::elasticache;
use proxy::redis::notifications;
use proxy::serverless::cancel_set::CancelSet;
use proxy::serverless::GlobalConnPoolOptions;
use proxy::usage_metrics;
@@ -144,9 +143,6 @@ struct ProxyCliArgs {
/// Can be given multiple times for different bucket sizes.
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
endpoint_rps_limit: Vec<RateBucketInfo>,
/// Wake compute rate limiter max number of requests per second.
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)]
wake_compute_limit: Vec<RateBucketInfo>,
/// Whether the auth rate limiter actually takes effect (for testing)
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
auth_rate_limit_enabled: bool,
@@ -157,7 +153,7 @@ struct ProxyCliArgs {
#[clap(long, default_value_t = 64)]
auth_rate_limit_ip_subnet: u8,
/// Redis rate limiter max number of requests per second.
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)]
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
redis_rps_limit: Vec<RateBucketInfo>,
/// cache for `allowed_ips` (use `size=0` to disable)
#[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
@@ -247,12 +243,6 @@ struct SqlOverHttpArgs {
/// increase memory used by the pool
#[clap(long, default_value_t = 128)]
sql_over_http_pool_shards: usize,
#[clap(long, default_value_t = 10000)]
sql_over_http_client_conn_threshold: u64,
#[clap(long, default_value_t = 64)]
sql_over_http_cancel_set_shards: usize,
}
#[tokio::main]
@@ -268,10 +258,6 @@ async fn main() -> anyhow::Result<()> {
build_tag: BUILD_TAG,
});
// add the current runtime to the collector
#[cfg(tokio_unstable)]
neon_metrics.tokio.add_current("proxy");
let jemalloc = match proxy::jemalloc::MetricRecorder::new() {
Ok(t) => Some(t),
Err(e) => {
@@ -372,10 +358,6 @@ async fn main() -> anyhow::Result<()> {
proxy::metrics::CancellationSource::FromClient,
));
let mut endpoint_rps_limit = args.endpoint_rps_limit.clone();
RateBucketInfo::validate(&mut endpoint_rps_limit)?;
let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(endpoint_rps_limit));
// client facing tasks. these will exit on error or on cancellation
// cancellation returns Ok(())
let mut client_tasks = JoinSet::new();
@@ -384,7 +366,6 @@ async fn main() -> anyhow::Result<()> {
proxy_listener,
cancellation_token.clone(),
cancellation_handler.clone(),
endpoint_rate_limiter.clone(),
));
// TODO: rename the argument to something like serverless.
@@ -399,7 +380,6 @@ async fn main() -> anyhow::Result<()> {
serverless_listener,
cancellation_token.clone(),
cancellation_handler.clone(),
endpoint_rate_limiter.clone(),
));
}
@@ -572,16 +552,11 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
let url = args.auth_endpoint.parse()?;
let endpoint = http::Endpoint::new(url, http::new_client());
let mut wake_compute_rps_limit = args.wake_compute_limit.clone();
RateBucketInfo::validate(&mut wake_compute_rps_limit)?;
let wake_compute_endpoint_rate_limiter =
Arc::new(EndpointRateLimiter::new(wake_compute_rps_limit));
let api = console::provider::neon::Api::new(
endpoint,
caches,
locks,
wake_compute_endpoint_rate_limiter,
);
let mut endpoint_rps_limit = args.endpoint_rps_limit.clone();
RateBucketInfo::validate(&mut endpoint_rps_limit)?;
let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(endpoint_rps_limit));
let api =
console::provider::neon::Api::new(endpoint, caches, locks, endpoint_rate_limiter);
let api = console::provider::ConsoleBackend::Console(api);
auth::BackendType::Console(MaybeOwned::Owned(api), ())
}
@@ -624,8 +599,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
opt_in: args.sql_over_http.sql_over_http_pool_opt_in,
max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns,
},
cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards),
client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold,
};
let authentication_config = AuthenticationConfig {
scram_protocol_timeout: args.scram_protocol_timeout,

View File

@@ -1,7 +1,7 @@
use crate::{
auth::parse_endpoint_param,
cancellation::CancelClosure,
console::{errors::WakeComputeError, messages::MetricsAuxInfo, provider::ApiLockError},
console::{errors::WakeComputeError, messages::MetricsAuxInfo},
context::RequestMonitoring,
error::{ReportableError, UserFacingError},
metrics::{Metrics, NumDbConnectionsGuard},
@@ -34,9 +34,6 @@ pub enum ConnectionError {
#[error("{COULD_NOT_CONNECT}: {0}")]
WakeComputeError(#[from] WakeComputeError),
#[error("error acquiring resource permit: {0}")]
TooManyConnectionAttempts(#[from] ApiLockError),
}
impl UserFacingError for ConnectionError {
@@ -60,9 +57,6 @@ impl UserFacingError for ConnectionError {
None => err.to_string(),
},
WakeComputeError(err) => err.to_string_client(),
TooManyConnectionAttempts(_) => {
"Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned()
}
_ => COULD_NOT_CONNECT.to_owned(),
}
}
@@ -78,7 +72,6 @@ impl ReportableError for ConnectionError {
ConnectionError::CouldNotConnect(_) => crate::error::ErrorKind::Compute,
ConnectionError::TlsError(_) => crate::error::ErrorKind::Compute,
ConnectionError::WakeComputeError(e) => e.get_error_kind(),
ConnectionError::TooManyConnectionAttempts(e) => e.get_error_kind(),
}
}
}

View File

@@ -2,7 +2,7 @@ use crate::{
auth::{self, backend::AuthRateLimiter},
console::locks::ApiLocks,
rate_limiter::RateBucketInfo,
serverless::{cancel_set::CancelSet, GlobalConnPoolOptions},
serverless::GlobalConnPoolOptions,
Host,
};
use anyhow::{bail, ensure, Context, Ok};
@@ -56,8 +56,6 @@ pub struct TlsConfig {
pub struct HttpConfig {
pub request_timeout: tokio::time::Duration,
pub pool_options: GlobalConnPoolOptions,
pub cancel_set: CancelSet,
pub client_conn_threshold: u64,
}
pub struct AuthenticationConfig {
@@ -538,9 +536,9 @@ pub struct RetryConfig {
impl RetryConfig {
/// Default options for RetryConfig.
/// Total delay for 5 retries with 200ms base delay and 2 backoff factor is about 6s.
/// Total delay for 8 retries with 100ms base delay and 1.6 backoff factor is about 7s.
pub const CONNECT_TO_COMPUTE_DEFAULT_VALUES: &'static str =
"num_retries=5,base_retry_wait_duration=200ms,retry_wait_exponent_base=2";
"num_retries=8,base_retry_wait_duration=100ms,retry_wait_exponent_base=1.6";
/// Total delay for 8 retries with 100ms base delay and 1.6 backoff factor is about 7s.
/// Cplane has timeout of 60s on each request. 8m7s in total.
pub const WAKE_COMPUTE_DEFAULT_VALUES: &'static str =
@@ -594,7 +592,7 @@ impl ConcurrencyLockOptions {
pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "permits=0";
/// Default options for [`crate::console::provider::ApiLocks`].
pub const DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK: &'static str =
"shards=64,permits=10,epoch=10m,timeout=10ms";
"shards=64,permits=50,epoch=10m,timeout=500ms";
// pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "shards=32,permits=4,epoch=10m,timeout=1s";

View File

@@ -75,7 +75,6 @@ pub type ComputeReady = DatabaseInfo;
// TODO: replace with an http-based protocol.
struct MgmtHandler;
#[async_trait::async_trait]
impl postgres_backend::Handler<tokio::net::TcpStream> for MgmtHandler {
async fn process_query(
&mut self,

View File

@@ -12,7 +12,6 @@ use crate::{
compute,
config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions},
context::RequestMonitoring,
error::ReportableError,
intern::ProjectIdInt,
metrics::ApiLockMetrics,
scram, EndpointCacheKey,
@@ -31,8 +30,6 @@ pub mod errors {
};
use thiserror::Error;
use super::ApiLockError;
/// A go-to error message which doesn't leak any detail.
const REQUEST_FAILED: &str = "Console request failed";
@@ -79,7 +76,7 @@ pub mod errors {
}
http::StatusCode::LOCKED | http::StatusCode::UNPROCESSABLE_ENTITY => {
// Status 423: project might be in maintenance mode (or bad state), or quotas exceeded.
format!("{REQUEST_FAILED}: endpoint is temporarily unavailable. Check your quotas and/or contact our support.")
format!("{REQUEST_FAILED}: endpoint is temporary unavailable. check your quotas and/or contact our support")
}
_ => REQUEST_FAILED.to_owned(),
},
@@ -214,8 +211,8 @@ pub mod errors {
#[error("Too many connections attempts")]
TooManyConnections,
#[error("error acquiring resource permit: {0}")]
TooManyConnectionAttempts(#[from] ApiLockError),
#[error("Timeout waiting to acquire wake compute lock")]
TimeoutError,
}
// This allows more useful interactions than `#[from]`.
@@ -225,6 +222,17 @@ pub mod errors {
}
}
impl From<tokio::sync::AcquireError> for WakeComputeError {
fn from(_: tokio::sync::AcquireError) -> Self {
WakeComputeError::TimeoutError
}
}
impl From<tokio::time::error::Elapsed> for WakeComputeError {
fn from(_: tokio::time::error::Elapsed) -> Self {
WakeComputeError::TimeoutError
}
}
impl UserFacingError for WakeComputeError {
fn to_string_client(&self) -> String {
use WakeComputeError::*;
@@ -237,9 +245,7 @@ pub mod errors {
TooManyConnections => self.to_string(),
TooManyConnectionAttempts(_) => {
"Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned()
}
TimeoutError => "timeout while acquiring the compute resource lock".to_owned(),
}
}
}
@@ -250,7 +256,7 @@ pub mod errors {
WakeComputeError::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane,
WakeComputeError::ApiError(e) => e.get_error_kind(),
WakeComputeError::TooManyConnections => crate::error::ErrorKind::RateLimit,
WakeComputeError::TooManyConnectionAttempts(e) => e.get_error_kind(),
WakeComputeError::TimeoutError => crate::error::ErrorKind::ServiceRateLimit,
}
}
}
@@ -450,23 +456,6 @@ pub struct ApiLocks<K> {
metrics: &'static ApiLockMetrics,
}
#[derive(Debug, thiserror::Error)]
pub enum ApiLockError {
#[error("lock was closed")]
AcquireError(#[from] tokio::sync::AcquireError),
#[error("permit could not be acquired")]
TimeoutError(#[from] tokio::time::error::Elapsed),
}
impl ReportableError for ApiLockError {
fn get_error_kind(&self) -> crate::error::ErrorKind {
match self {
ApiLockError::AcquireError(_) => crate::error::ErrorKind::Service,
ApiLockError::TimeoutError(_) => crate::error::ErrorKind::RateLimit,
}
}
}
impl<K: Hash + Eq + Clone> ApiLocks<K> {
pub fn new(
name: &'static str,
@@ -486,7 +475,7 @@ impl<K: Hash + Eq + Clone> ApiLocks<K> {
})
}
pub async fn get_permit(&self, key: &K) -> Result<WakeComputePermit, ApiLockError> {
pub async fn get_permit(&self, key: &K) -> Result<WakeComputePermit, errors::WakeComputeError> {
if self.permits == 0 {
return Ok(WakeComputePermit { permit: None });
}

View File

@@ -26,7 +26,7 @@ pub struct Api {
endpoint: http::Endpoint,
pub caches: &'static ApiCaches,
pub locks: &'static ApiLocks<EndpointCacheKey>,
pub wake_compute_endpoint_rate_limiter: Arc<EndpointRateLimiter>,
pub endpoint_rate_limiter: Arc<EndpointRateLimiter>,
jwt: String,
}
@@ -36,7 +36,7 @@ impl Api {
endpoint: http::Endpoint,
caches: &'static ApiCaches,
locks: &'static ApiLocks<EndpointCacheKey>,
wake_compute_endpoint_rate_limiter: Arc<EndpointRateLimiter>,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
) -> Self {
let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") {
Ok(v) => v,
@@ -46,7 +46,7 @@ impl Api {
endpoint,
caches,
locks,
wake_compute_endpoint_rate_limiter,
endpoint_rate_limiter,
jwt,
}
}
@@ -283,7 +283,7 @@ impl super::Api for Api {
// check rate limit
if !self
.wake_compute_endpoint_rate_limiter
.endpoint_rate_limiter
.check(user_info.endpoint.normalize().into(), 1)
{
return Err(WakeComputeError::TooManyConnections);

View File

@@ -19,7 +19,6 @@ use crate::{
metrics::{Metrics, NumClientConnectionsGuard},
protocol2::read_proxy_protocol,
proxy::handshake::{handshake, HandshakeData},
rate_limiter::EndpointRateLimiter,
stream::{PqStream, Stream},
EndpointCacheKey,
};
@@ -62,7 +61,6 @@ pub async fn task_main(
listener: tokio::net::TcpListener,
cancellation_token: CancellationToken,
cancellation_handler: Arc<CancellationHandlerMain>,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
) -> anyhow::Result<()> {
scopeguard::defer! {
info!("proxy has shut down");
@@ -88,7 +86,6 @@ pub async fn task_main(
let cancellation_handler = Arc::clone(&cancellation_handler);
tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection");
let endpoint_rate_limiter2 = endpoint_rate_limiter.clone();
connections.spawn(async move {
let (socket, peer_addr) = match read_proxy_protocol(socket).await{
@@ -126,7 +123,6 @@ pub async fn task_main(
cancellation_handler,
socket,
ClientMode::Tcp,
endpoint_rate_limiter2,
conn_gauge,
)
.instrument(span.clone())
@@ -238,7 +234,6 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
cancellation_handler: Arc<CancellationHandlerMain>,
stream: S,
mode: ClientMode,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
conn_gauge: NumClientConnectionsGuard<'static>,
) -> Result<Option<ProxyPassthrough<CancellationHandlerMainInternal, S>>, ClientRequestError> {
info!(
@@ -248,6 +243,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
let metrics = &Metrics::get().proxy;
let proto = ctx.protocol;
// let _client_gauge = metrics.client_connections.guard(proto);
let _request_gauge = metrics.connection_requests.guard(proto);
let tls = config.tls_config.as_ref();
@@ -290,7 +286,6 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
&mut stream,
mode.allow_cleartext(),
&config.authentication_config,
endpoint_rate_limiter,
)
.await
{

View File

@@ -86,8 +86,6 @@ impl ShouldRetry for compute::ConnectionError {
match self {
compute::ConnectionError::Postgres(err) => err.should_retry_database_address(),
compute::ConnectionError::CouldNotConnect(err) => err.should_retry_database_address(),
// the cache entry was not checked for validity
compute::ConnectionError::TooManyConnectionAttempts(_) => false,
_ => true,
}
}

View File

@@ -119,7 +119,7 @@ fn report_error(e: &WakeComputeError, retry: bool) {
WakeupFailureKind::ApiConsoleOtherError
}
WakeComputeError::TooManyConnections => WakeupFailureKind::ApiConsoleLocked,
WakeComputeError::TooManyConnectionAttempts(_) => WakeupFailureKind::TimeoutError,
WakeComputeError::TimeoutError => WakeupFailureKind::TimeoutError,
};
Metrics::get()
.proxy

View File

@@ -128,18 +128,12 @@ impl std::str::FromStr for RateBucketInfo {
}
impl RateBucketInfo {
pub const DEFAULT_SET: [Self; 3] = [
pub const DEFAULT_ENDPOINT_SET: [Self; 3] = [
Self::new(300, Duration::from_secs(1)),
Self::new(200, Duration::from_secs(60)),
Self::new(100, Duration::from_secs(600)),
];
pub const DEFAULT_ENDPOINT_SET: [Self; 3] = [
Self::new(500, Duration::from_secs(1)),
Self::new(300, Duration::from_secs(60)),
Self::new(200, Duration::from_secs(600)),
];
pub fn validate(info: &mut [Self]) -> anyhow::Result<()> {
info.sort_unstable_by_key(|info| info.interval);
let invalid = info
@@ -272,7 +266,7 @@ mod tests {
#[test]
fn default_rate_buckets() {
let mut defaults = RateBucketInfo::DEFAULT_SET;
let mut defaults = RateBucketInfo::DEFAULT_ENDPOINT_SET;
RateBucketInfo::validate(&mut defaults[..]).unwrap();
}
@@ -339,8 +333,11 @@ mod tests {
let rand = rand::rngs::StdRng::from_seed([1; 32]);
let hasher = BuildHasherDefault::<FxHasher>::default();
let limiter =
BucketRateLimiter::new_with_rand_and_hasher(&RateBucketInfo::DEFAULT_SET, rand, hasher);
let limiter = BucketRateLimiter::new_with_rand_and_hasher(
&RateBucketInfo::DEFAULT_ENDPOINT_SET,
rand,
hasher,
);
for i in 0..1_000_000 {
limiter.check(i, 1);
}

View File

@@ -3,7 +3,6 @@
//! Handles both SQL over HTTP and SQL over Websockets.
mod backend;
pub mod cancel_set;
mod conn_pool;
mod http_util;
mod json;
@@ -36,7 +35,6 @@ use crate::context::RequestMonitoring;
use crate::metrics::Metrics;
use crate::protocol2::read_proxy_protocol;
use crate::proxy::run_until_cancelled;
use crate::rate_limiter::EndpointRateLimiter;
use crate::serverless::backend::PoolingBackend;
use crate::serverless::http_util::{api_error_into_response, json_response};
@@ -55,7 +53,6 @@ pub async fn task_main(
ws_listener: TcpListener,
cancellation_token: CancellationToken,
cancellation_handler: Arc<CancellationHandlerMain>,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
) -> anyhow::Result<()> {
scopeguard::defer! {
info!("websocket server has shut down");
@@ -84,7 +81,6 @@ pub async fn task_main(
let backend = Arc::new(PoolingBackend {
pool: Arc::clone(&conn_pool),
config,
endpoint_rate_limiter: Arc::clone(&endpoint_rate_limiter),
});
let tls_config = match config.tls_config.as_ref() {
@@ -113,38 +109,20 @@ pub async fn task_main(
let conn_id = uuid::Uuid::new_v4();
let http_conn_span = tracing::info_span!("http_conn", ?conn_id);
let n_connections = Metrics::get()
.proxy
.client_connections
.sample(crate::metrics::Protocol::Http);
tracing::trace!(?n_connections, threshold = ?config.http_config.client_conn_threshold, "check");
if n_connections > config.http_config.client_conn_threshold {
tracing::trace!("attempting to cancel a random connection");
if let Some(token) = config.http_config.cancel_set.take() {
tracing::debug!("cancelling a random connection");
token.cancel()
}
}
let conn_token = cancellation_token.child_token();
let conn = connection_handler(
config,
backend.clone(),
connections.clone(),
cancellation_handler.clone(),
endpoint_rate_limiter.clone(),
conn_token.clone(),
server.clone(),
tls_acceptor.clone(),
conn,
peer_addr,
)
.instrument(http_conn_span);
connections.spawn(async move {
let _cancel_guard = config.http_config.cancel_set.insert(conn_id, conn_token);
conn.await
});
connections.spawn(
connection_handler(
config,
backend.clone(),
connections.clone(),
cancellation_handler.clone(),
cancellation_token.clone(),
server.clone(),
tls_acceptor.clone(),
conn,
peer_addr,
)
.instrument(http_conn_span),
);
}
connections.wait().await;
@@ -166,7 +144,6 @@ async fn connection_handler(
backend: Arc<PoolingBackend>,
connections: TaskTracker,
cancellation_handler: Arc<CancellationHandlerMain>,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
cancellation_token: CancellationToken,
server: Builder<TokioExecutor>,
tls_acceptor: TlsAcceptor,
@@ -250,7 +227,6 @@ async fn connection_handler(
session_id,
peer_addr,
http_request_token,
endpoint_rate_limiter.clone(),
)
.in_current_span()
.map_ok_or_else(api_error_into_response, |r| r),
@@ -267,7 +243,6 @@ async fn connection_handler(
// On cancellation, trigger the HTTP connection handler to shut down.
let res = match select(pin!(cancellation_token.cancelled()), pin!(conn)).await {
Either::Left((_cancelled, mut conn)) => {
tracing::debug!(%peer_addr, "cancelling connection");
conn.as_mut().graceful_shutdown();
conn.await
}
@@ -291,7 +266,6 @@ async fn request_handler(
peer_addr: IpAddr,
// used to cancel in-flight HTTP requests. not used to cancel websockets
http_cancellation_token: CancellationToken,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
) -> Result<Response<Full<Bytes>>, ApiError> {
let host = request
.headers()
@@ -317,15 +291,9 @@ async fn request_handler(
ws_connections.spawn(
async move {
if let Err(e) = websocket::serve_websocket(
config,
ctx,
websocket,
cancellation_handler,
endpoint_rate_limiter,
host,
)
.await
if let Err(e) =
websocket::serve_websocket(config, ctx, websocket, cancellation_handler, host)
.await
{
error!("error in websocket connection: {e:#}");
}

View File

@@ -10,13 +10,11 @@ use crate::{
console::{
errors::{GetAuthInfoError, WakeComputeError},
locks::ApiLocks,
provider::ApiLockError,
CachedNodeInfo,
},
context::RequestMonitoring,
error::{ErrorKind, ReportableError, UserFacingError},
proxy::{connect_compute::ConnectMechanism, retry::ShouldRetry},
rate_limiter::EndpointRateLimiter,
Host,
};
@@ -25,7 +23,6 @@ use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool};
pub struct PoolingBackend {
pub pool: Arc<GlobalConnPool<tokio_postgres::Client>>,
pub config: &'static ProxyConfig,
pub endpoint_rate_limiter: Arc<EndpointRateLimiter>,
}
impl PoolingBackend {
@@ -41,12 +38,6 @@ impl PoolingBackend {
if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) {
return Err(AuthError::ip_address_not_allowed(ctx.peer_addr));
}
if !self
.endpoint_rate_limiter
.check(conn_info.user_info.endpoint.clone().into(), 1)
{
return Err(AuthError::too_many_connections());
}
let cached_secret = match maybe_secret {
Some(secret) => secret,
None => backend.get_role_secret(ctx).await?,
@@ -140,8 +131,6 @@ pub enum HttpConnError {
AuthError(#[from] AuthError),
#[error("wake_compute returned error")]
WakeCompute(#[from] WakeComputeError),
#[error("error acquiring resource permit: {0}")]
TooManyConnectionAttempts(#[from] ApiLockError),
}
impl ReportableError for HttpConnError {
@@ -152,7 +141,6 @@ impl ReportableError for HttpConnError {
HttpConnError::GetAuthInfo(a) => a.get_error_kind(),
HttpConnError::AuthError(a) => a.get_error_kind(),
HttpConnError::WakeCompute(w) => w.get_error_kind(),
HttpConnError::TooManyConnectionAttempts(w) => w.get_error_kind(),
}
}
}
@@ -165,9 +153,6 @@ impl UserFacingError for HttpConnError {
HttpConnError::GetAuthInfo(c) => c.to_string_client(),
HttpConnError::AuthError(c) => c.to_string_client(),
HttpConnError::WakeCompute(c) => c.to_string_client(),
HttpConnError::TooManyConnectionAttempts(_) => {
"Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned()
}
}
}
}
@@ -180,15 +165,6 @@ impl ShouldRetry for HttpConnError {
HttpConnError::GetAuthInfo(_) => false,
HttpConnError::AuthError(_) => false,
HttpConnError::WakeCompute(_) => false,
HttpConnError::TooManyConnectionAttempts(_) => false,
}
}
fn should_retry_database_address(&self) -> bool {
match self {
HttpConnError::ConnectionError(e) => e.should_retry_database_address(),
// we never checked cache validity
HttpConnError::TooManyConnectionAttempts(_) => false,
_ => true,
}
}
}

View File

@@ -1,102 +0,0 @@
//! A set for cancelling random http connections
use std::{
hash::{BuildHasher, BuildHasherDefault},
num::NonZeroUsize,
time::Duration,
};
use indexmap::IndexMap;
use parking_lot::Mutex;
use rand::{thread_rng, Rng};
use rustc_hash::FxHasher;
use tokio::time::Instant;
use tokio_util::sync::CancellationToken;
use uuid::Uuid;
type Hasher = BuildHasherDefault<FxHasher>;
pub struct CancelSet {
shards: Box<[Mutex<CancelShard>]>,
// keyed by random uuid, fxhasher is fine
hasher: Hasher,
}
pub struct CancelShard {
tokens: IndexMap<uuid::Uuid, (Instant, CancellationToken), Hasher>,
}
impl CancelSet {
pub fn new(shards: usize) -> Self {
CancelSet {
shards: (0..shards)
.map(|_| {
Mutex::new(CancelShard {
tokens: IndexMap::with_hasher(Hasher::default()),
})
})
.collect(),
hasher: Hasher::default(),
}
}
pub fn take(&self) -> Option<CancellationToken> {
for _ in 0..4 {
if let Some(token) = self.take_raw(thread_rng().gen()) {
return Some(token);
}
tracing::trace!("failed to get cancel token");
}
None
}
pub fn take_raw(&self, rng: usize) -> Option<CancellationToken> {
NonZeroUsize::new(self.shards.len())
.and_then(|len| self.shards[rng % len].lock().take(rng / len))
}
pub fn insert(&self, id: uuid::Uuid, token: CancellationToken) -> CancelGuard<'_> {
let shard = NonZeroUsize::new(self.shards.len()).map(|len| {
let hash = self.hasher.hash_one(id) as usize;
let shard = &self.shards[hash % len];
shard.lock().insert(id, token);
shard
});
CancelGuard { shard, id }
}
}
impl CancelShard {
fn take(&mut self, rng: usize) -> Option<CancellationToken> {
NonZeroUsize::new(self.tokens.len()).and_then(|len| {
// 10 second grace period so we don't cancel new connections
if self.tokens.get_index(rng % len)?.1 .0.elapsed() < Duration::from_secs(10) {
return None;
}
let (_key, (_insert, token)) = self.tokens.swap_remove_index(rng % len)?;
Some(token)
})
}
fn remove(&mut self, id: uuid::Uuid) {
self.tokens.swap_remove(&id);
}
fn insert(&mut self, id: uuid::Uuid, token: CancellationToken) {
self.tokens.insert(id, (Instant::now(), token));
}
}
pub struct CancelGuard<'a> {
shard: Option<&'a Mutex<CancelShard>>,
id: Uuid,
}
impl Drop for CancelGuard<'_> {
fn drop(&mut self) {
if let Some(shard) = self.shard {
shard.lock().remove(self.id);
}
}
}

Some files were not shown because too many files have changed in this diff Show More