mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-05 07:10:38 +00:00
Compare commits
52 Commits
fcdm/test-
...
skyzh/reve
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d33feb0c8e | ||
|
|
674bdcb7c6 | ||
|
|
a3b6c4b0c0 | ||
|
|
00c981576a | ||
|
|
c3f2240fbd | ||
|
|
ed5724d79d | ||
|
|
ca5390a89d | ||
|
|
3727c6fbbe | ||
|
|
42229aacf6 | ||
|
|
b7beaa0fd7 | ||
|
|
16c91ff5d3 | ||
|
|
078f941dc8 | ||
|
|
68bcbf8227 | ||
|
|
a31c95cb40 | ||
|
|
dc7eb5ae5a | ||
|
|
44fedfd6c3 | ||
|
|
138f008bab | ||
|
|
6a6f30e378 | ||
|
|
8f3bc5ae35 | ||
|
|
e6e578821b | ||
|
|
c32807ac19 | ||
|
|
50daff9655 | ||
|
|
bd845c7587 | ||
|
|
f63c8e5a8c | ||
|
|
200fa56b04 | ||
|
|
0f3dac265b | ||
|
|
1dc496a2c9 | ||
|
|
6814bdd30b | ||
|
|
0a667bc8ef | ||
|
|
f3acfb2d80 | ||
|
|
8c828c586e | ||
|
|
2334fed762 | ||
|
|
c53799044d | ||
|
|
e7477855b7 | ||
|
|
f4a668a27d | ||
|
|
970f2923b2 | ||
|
|
1678dea20f | ||
|
|
163f2eaf79 | ||
|
|
980d506bda | ||
|
|
d6c79b77df | ||
|
|
3350daeb9a | ||
|
|
939d50a41c | ||
|
|
2f9ada13c4 | ||
|
|
ff51b565d3 | ||
|
|
5e0409de95 | ||
|
|
4e3b70e308 | ||
|
|
61a65f61f3 | ||
|
|
d21246c8bd | ||
|
|
4825b0fec3 | ||
|
|
a4df3c8488 | ||
|
|
d95b46f3f3 | ||
|
|
85bef9f05d |
2
.github/actionlint.yml
vendored
2
.github/actionlint.yml
vendored
@@ -8,6 +8,8 @@ self-hosted-runner:
|
||||
- small-arm64
|
||||
- us-east-2
|
||||
config-variables:
|
||||
- BENCHMARK_PROJECT_ID_PUB
|
||||
- BENCHMARK_PROJECT_ID_SUB
|
||||
- REMOTE_STORAGE_AZURE_CONTAINER
|
||||
- REMOTE_STORAGE_AZURE_REGION
|
||||
- SLACK_UPCOMING_RELEASE_CHANNEL_ID
|
||||
|
||||
@@ -19,6 +19,10 @@ on:
|
||||
description: 'debug or release'
|
||||
required: true
|
||||
type: string
|
||||
pg-versions:
|
||||
description: 'a json array of postgres versions to run regression tests on'
|
||||
required: true
|
||||
type: string
|
||||
|
||||
defaults:
|
||||
run:
|
||||
@@ -254,7 +258,7 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
pg_version: [ v14, v15, v16 ]
|
||||
pg_version: ${{ fromJson(inputs.pg-versions) }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
@@ -284,5 +288,5 @@ jobs:
|
||||
- name: Merge and upload coverage data
|
||||
if: |
|
||||
false &&
|
||||
inputs.build-type == 'debug' && matrix.pg_version == 'v14'
|
||||
inputs.build-type == 'debug' && matrix.pg_version == 'v16'
|
||||
uses: ./.github/actions/save-coverage-data
|
||||
|
||||
9
.github/workflows/benchmarking.yml
vendored
9
.github/workflows/benchmarking.yml
vendored
@@ -147,7 +147,7 @@ jobs:
|
||||
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
|
||||
env:
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
DEFAULT_PG_VERSION: 14
|
||||
DEFAULT_PG_VERSION: 16
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: remote
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
|
||||
@@ -168,7 +168,7 @@ jobs:
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
- name: Run benchmark
|
||||
- name: Run Logical Replication benchmarks
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
@@ -176,12 +176,15 @@ jobs:
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 5400
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
BENCHMARK_PROJECT_ID_PUB: ${{ vars.BENCHMARK_PROJECT_ID_PUB }}
|
||||
BENCHMARK_PROJECT_ID_SUB: ${{ vars.BENCHMARK_PROJECT_ID_SUB }}
|
||||
|
||||
- name: Run benchmark
|
||||
- name: Run Physical Replication benchmarks
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
|
||||
9
.github/workflows/build_and_test.yml
vendored
9
.github/workflows/build_and_test.yml
vendored
@@ -203,7 +203,8 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
arch: [ x64 ]
|
||||
build-type: [ debug, release ]
|
||||
# Do not build or run tests in debug for release branches
|
||||
build-type: ${{ fromJson((startsWith(github.ref_name, 'release' && github.event_name == 'push')) && '["release"]' || '["debug", "release"]') }}
|
||||
include:
|
||||
- build-type: release
|
||||
arch: arm64
|
||||
@@ -213,6 +214,8 @@ jobs:
|
||||
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
build-tag: ${{ needs.tag.outputs.build-tag }}
|
||||
build-type: ${{ matrix.build-type }}
|
||||
# Run tests on all Postgres versions in release builds and only on the latest version in debug builds
|
||||
pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16"]' || '["v16"]' }}
|
||||
secrets: inherit
|
||||
|
||||
# Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking
|
||||
@@ -306,7 +309,7 @@ jobs:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
create-test-report:
|
||||
needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image ]
|
||||
needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image, benchmarks ]
|
||||
if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
|
||||
outputs:
|
||||
report-url: ${{ steps.create-allure-report.outputs.report-url }}
|
||||
@@ -868,7 +871,7 @@ jobs:
|
||||
with:
|
||||
client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }}
|
||||
tenant-id: ${{ secrets.AZURE_TENANT_ID }}
|
||||
subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
|
||||
subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }}
|
||||
|
||||
- name: Login to ACR
|
||||
if: github.ref_name == 'main'
|
||||
|
||||
26
.github/workflows/pg-clients.yml
vendored
26
.github/workflows/pg-clients.yml
vendored
@@ -66,7 +66,31 @@ jobs:
|
||||
ports:
|
||||
- 9000:9000
|
||||
- 8123:8123
|
||||
|
||||
zookeeper:
|
||||
image: quay.io/debezium/zookeeper:2.7
|
||||
ports:
|
||||
- 2181:2181
|
||||
kafka:
|
||||
image: quay.io/debezium/kafka:2.7
|
||||
env:
|
||||
ZOOKEEPER_CONNECT: "zookeeper:2181"
|
||||
KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092
|
||||
KAFKA_BROKER_ID: 1
|
||||
KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
|
||||
KAFKA_JMX_PORT: 9991
|
||||
ports:
|
||||
- 9092:9092
|
||||
debezium:
|
||||
image: quay.io/debezium/connect:2.7
|
||||
env:
|
||||
BOOTSTRAP_SERVERS: kafka:9092
|
||||
GROUP_ID: 1
|
||||
CONFIG_STORAGE_TOPIC: debezium-config
|
||||
OFFSET_STORAGE_TOPIC: debezium-offset
|
||||
STATUS_STORAGE_TOPIC: debezium-status
|
||||
DEBEZIUM_CONFIG_CONNECTOR_CLASS: io.debezium.connector.postgresql.PostgresConnector
|
||||
ports:
|
||||
- 8083:8083
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
|
||||
38
.github/workflows/trigger-e2e-tests.yml
vendored
38
.github/workflows/trigger-e2e-tests.yml
vendored
@@ -13,8 +13,6 @@ defaults:
|
||||
env:
|
||||
# A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
|
||||
E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
jobs:
|
||||
cancel-previous-e2e-tests:
|
||||
@@ -64,19 +62,35 @@ jobs:
|
||||
needs: [ tag ]
|
||||
runs-on: ubuntu-22.04
|
||||
env:
|
||||
EVENT_ACTION: ${{ github.event.action }}
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
TAG: ${{ needs.tag.outputs.build-tag }}
|
||||
steps:
|
||||
- name: check if ecr image are present
|
||||
env:
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
- name: Wait for `promote-images` job to finish
|
||||
# It's important to have a timeout here, the script in the step can run infinitely
|
||||
timeout-minutes: 60
|
||||
run: |
|
||||
for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do
|
||||
OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text)
|
||||
if [ "$OUTPUT" == "" ]; then
|
||||
echo "$REPO with image tag $TAG not found" >> $GITHUB_OUTPUT
|
||||
exit 1
|
||||
fi
|
||||
if [ "${GITHUB_EVENT_NAME}" != "pull_request" ] || [ "${EVENT_ACTION}" != "ready_for_review" ]; then
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# For PRs we use the run id as the tag
|
||||
BUILD_AND_TEST_RUN_ID=${TAG}
|
||||
while true; do
|
||||
conclusion=$(gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '.jobs[] | select(.name == "promote-images") | .conclusion')
|
||||
case "$conclusion" in
|
||||
success)
|
||||
break
|
||||
;;
|
||||
failure | cancelled | skipped)
|
||||
echo "The 'promote-images' job didn't succeed: '${conclusion}'. Exiting..."
|
||||
exit 1
|
||||
;;
|
||||
*)
|
||||
echo "The 'promote-images' hasn't succeed yet. Waiting..."
|
||||
sleep 60
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
- name: Set e2e-platforms
|
||||
|
||||
186
Cargo.lock
generated
186
Cargo.lock
generated
@@ -1418,7 +1418,7 @@ dependencies = [
|
||||
"clap",
|
||||
"criterion-plot",
|
||||
"is-terminal",
|
||||
"itertools",
|
||||
"itertools 0.10.5",
|
||||
"num-traits",
|
||||
"once_cell",
|
||||
"oorandom",
|
||||
@@ -1439,7 +1439,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1"
|
||||
dependencies = [
|
||||
"cast",
|
||||
"itertools",
|
||||
"itertools 0.10.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2134,6 +2134,12 @@ dependencies = [
|
||||
"slab",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "gen_ops"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "304de19db7028420975a296ab0fcbbc8e69438c4ed254a1e41e2a7f37d5f0e0a"
|
||||
|
||||
[[package]]
|
||||
name = "generic-array"
|
||||
version = "0.14.7"
|
||||
@@ -2710,17 +2716,6 @@ version = "3.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02"
|
||||
|
||||
[[package]]
|
||||
name = "io-lifetimes"
|
||||
version = "1.0.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2"
|
||||
dependencies = [
|
||||
"hermit-abi",
|
||||
"libc",
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "io-uring"
|
||||
version = "0.6.2"
|
||||
@@ -2739,14 +2734,13 @@ checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3"
|
||||
|
||||
[[package]]
|
||||
name = "is-terminal"
|
||||
version = "0.4.7"
|
||||
version = "0.4.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f"
|
||||
checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b"
|
||||
dependencies = [
|
||||
"hermit-abi",
|
||||
"io-lifetimes",
|
||||
"rustix 0.37.25",
|
||||
"windows-sys 0.48.0",
|
||||
"libc",
|
||||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2758,6 +2752,15 @@ dependencies = [
|
||||
"either",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "itertools"
|
||||
version = "0.12.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
|
||||
dependencies = [
|
||||
"either",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "itoa"
|
||||
version = "1.0.6"
|
||||
@@ -2872,18 +2875,6 @@ version = "0.2.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
version = "0.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4"
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
version = "0.3.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
version = "0.4.13"
|
||||
@@ -3001,7 +2992,7 @@ checksum = "7c4b80445aeb08e832d87bf1830049a924cdc1d6b7ef40b6b9b365bff17bf8ec"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"measured",
|
||||
"procfs 0.16.0",
|
||||
"procfs",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3046,7 +3037,7 @@ dependencies = [
|
||||
"measured",
|
||||
"measured-process",
|
||||
"once_cell",
|
||||
"procfs 0.14.2",
|
||||
"procfs",
|
||||
"prometheus",
|
||||
"rand 0.8.5",
|
||||
"rand_distr",
|
||||
@@ -3575,7 +3566,7 @@ dependencies = [
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"hyper 0.14.26",
|
||||
"itertools",
|
||||
"itertools 0.10.5",
|
||||
"leaky-bucket",
|
||||
"md5",
|
||||
"metrics",
|
||||
@@ -3593,8 +3584,9 @@ dependencies = [
|
||||
"postgres_connection",
|
||||
"postgres_ffi",
|
||||
"pq_proto",
|
||||
"procfs 0.14.2",
|
||||
"procfs",
|
||||
"rand 0.8.5",
|
||||
"range-set-blaze",
|
||||
"regex",
|
||||
"remote_storage",
|
||||
"reqwest 0.12.4",
|
||||
@@ -3645,7 +3637,7 @@ dependencies = [
|
||||
"hex",
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"itertools",
|
||||
"itertools 0.10.5",
|
||||
"postgres_ffi",
|
||||
"rand 0.8.5",
|
||||
"serde",
|
||||
@@ -3703,7 +3695,7 @@ dependencies = [
|
||||
"hex-literal",
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"itertools",
|
||||
"itertools 0.10.5",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"pageserver_api",
|
||||
@@ -4035,7 +4027,7 @@ name = "postgres_connection"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"itertools",
|
||||
"itertools 0.10.5",
|
||||
"once_cell",
|
||||
"postgres",
|
||||
"tokio-postgres",
|
||||
@@ -4093,7 +4085,7 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"itertools",
|
||||
"itertools 0.10.5",
|
||||
"pin-project-lite",
|
||||
"postgres-protocol",
|
||||
"rand 0.8.5",
|
||||
@@ -4139,21 +4131,6 @@ dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "procfs"
|
||||
version = "0.14.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b1de8dacb0873f77e6aefc6d71e044761fcc68060290f5b1089fcdf84626bb69"
|
||||
dependencies = [
|
||||
"bitflags 1.3.2",
|
||||
"byteorder",
|
||||
"chrono",
|
||||
"flate2",
|
||||
"hex",
|
||||
"lazy_static",
|
||||
"rustix 0.36.16",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "procfs"
|
||||
version = "0.16.0"
|
||||
@@ -4161,10 +4138,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4"
|
||||
dependencies = [
|
||||
"bitflags 2.4.1",
|
||||
"chrono",
|
||||
"flate2",
|
||||
"hex",
|
||||
"lazy_static",
|
||||
"procfs-core",
|
||||
"rustix 0.38.28",
|
||||
"rustix",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4174,14 +4153,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29"
|
||||
dependencies = [
|
||||
"bitflags 2.4.1",
|
||||
"chrono",
|
||||
"hex",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "prometheus"
|
||||
version = "0.13.3"
|
||||
version = "0.13.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "449811d15fbdf5ceb5c1144416066429cf82316e2ec8ce0c1f6f8a02e7bbcf8c"
|
||||
checksum = "3d33c28a30771f7f96db69893f78b857f7450d7e0237e9c8fc6427a81bae7ed1"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"fnv",
|
||||
@@ -4189,7 +4169,7 @@ dependencies = [
|
||||
"libc",
|
||||
"memchr",
|
||||
"parking_lot 0.12.1",
|
||||
"procfs 0.14.2",
|
||||
"procfs",
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
@@ -4211,7 +4191,7 @@ checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"heck 0.4.1",
|
||||
"itertools",
|
||||
"itertools 0.10.5",
|
||||
"lazy_static",
|
||||
"log",
|
||||
"multimap",
|
||||
@@ -4232,7 +4212,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e5d2d8d10f3c6ded6da8b05b5fb3b8a5082514344d56c9f871412d29b4e075b4"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"itertools",
|
||||
"itertools 0.10.5",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 1.0.109",
|
||||
@@ -4289,7 +4269,7 @@ dependencies = [
|
||||
"hyper-util",
|
||||
"indexmap 2.0.1",
|
||||
"ipnet",
|
||||
"itertools",
|
||||
"itertools 0.10.5",
|
||||
"lasso",
|
||||
"md5",
|
||||
"measured",
|
||||
@@ -4465,6 +4445,18 @@ dependencies = [
|
||||
"rand_core 0.5.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "range-set-blaze"
|
||||
version = "0.1.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8421b5d459262eabbe49048d362897ff3e3830b44eac6cfe341d6acb2f0f13d2"
|
||||
dependencies = [
|
||||
"gen_ops",
|
||||
"itertools 0.12.1",
|
||||
"num-integer",
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rayon"
|
||||
version = "1.7.0"
|
||||
@@ -4633,7 +4625,7 @@ dependencies = [
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"hyper 0.14.26",
|
||||
"itertools",
|
||||
"itertools 0.10.5",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"pin-project-lite",
|
||||
@@ -4943,34 +4935,6 @@ dependencies = [
|
||||
"nom",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustix"
|
||||
version = "0.36.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6da3636faa25820d8648e0e31c5d519bbb01f72fdf57131f0f5f7da5fed36eab"
|
||||
dependencies = [
|
||||
"bitflags 1.3.2",
|
||||
"errno",
|
||||
"io-lifetimes",
|
||||
"libc",
|
||||
"linux-raw-sys 0.1.4",
|
||||
"windows-sys 0.45.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustix"
|
||||
version = "0.37.25"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d4eb579851244c2c03e7c24f501c3432bed80b8f720af1d6e5b0e0f01555a035"
|
||||
dependencies = [
|
||||
"bitflags 1.3.2",
|
||||
"errno",
|
||||
"io-lifetimes",
|
||||
"libc",
|
||||
"linux-raw-sys 0.3.8",
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustix"
|
||||
version = "0.38.28"
|
||||
@@ -5730,7 +5694,7 @@ dependencies = [
|
||||
"hex",
|
||||
"humantime",
|
||||
"hyper 0.14.26",
|
||||
"itertools",
|
||||
"itertools 0.10.5",
|
||||
"lasso",
|
||||
"measured",
|
||||
"metrics",
|
||||
@@ -5739,6 +5703,7 @@ dependencies = [
|
||||
"pageserver_client",
|
||||
"postgres_connection",
|
||||
"r2d2",
|
||||
"rand 0.8.5",
|
||||
"reqwest 0.12.4",
|
||||
"routerify",
|
||||
"scopeguard",
|
||||
@@ -5794,9 +5759,10 @@ dependencies = [
|
||||
"either",
|
||||
"futures",
|
||||
"futures-util",
|
||||
"git-version",
|
||||
"hex",
|
||||
"humantime",
|
||||
"itertools",
|
||||
"itertools 0.10.5",
|
||||
"once_cell",
|
||||
"pageserver",
|
||||
"pageserver_api",
|
||||
@@ -5973,15 +5939,15 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tempfile"
|
||||
version = "3.5.0"
|
||||
version = "3.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b9fbec84f381d5795b08656e4912bec604d162bff9291d6189a78f4c8ab87998"
|
||||
checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"fastrand 1.9.0",
|
||||
"redox_syscall 0.3.5",
|
||||
"rustix 0.37.25",
|
||||
"windows-sys 0.45.0",
|
||||
"fastrand 2.0.0",
|
||||
"redox_syscall 0.4.1",
|
||||
"rustix",
|
||||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -7178,15 +7144,6 @@ dependencies = [
|
||||
"windows_x86_64_msvc 0.42.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-sys"
|
||||
version = "0.45.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0"
|
||||
dependencies = [
|
||||
"windows-targets 0.42.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-sys"
|
||||
version = "0.48.0"
|
||||
@@ -7205,21 +7162,6 @@ dependencies = [
|
||||
"windows-targets 0.52.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-targets"
|
||||
version = "0.42.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071"
|
||||
dependencies = [
|
||||
"windows_aarch64_gnullvm 0.42.2",
|
||||
"windows_aarch64_msvc 0.42.2",
|
||||
"windows_i686_gnu 0.42.2",
|
||||
"windows_i686_msvc 0.42.2",
|
||||
"windows_x86_64_gnu 0.42.2",
|
||||
"windows_x86_64_gnullvm 0.42.2",
|
||||
"windows_x86_64_msvc 0.42.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-targets"
|
||||
version = "0.48.0"
|
||||
@@ -7449,7 +7391,7 @@ dependencies = [
|
||||
"hmac",
|
||||
"hyper 0.14.26",
|
||||
"indexmap 1.9.3",
|
||||
"itertools",
|
||||
"itertools 0.10.5",
|
||||
"libc",
|
||||
"log",
|
||||
"memchr",
|
||||
|
||||
@@ -126,7 +126,7 @@ parquet = { version = "51.0.0", default-features = false, features = ["zstd"] }
|
||||
parquet_derive = "51.0.0"
|
||||
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
|
||||
pin-project-lite = "0.2"
|
||||
procfs = "0.14"
|
||||
procfs = "0.16"
|
||||
prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
|
||||
prost = "0.11"
|
||||
rand = "0.8"
|
||||
|
||||
@@ -933,7 +933,8 @@ COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src
|
||||
#COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src
|
||||
COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src
|
||||
COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src
|
||||
#COPY --from=rum-pg-build /rum.tar.gz /ext-src
|
||||
COPY --from=rum-pg-build /rum.tar.gz /ext-src
|
||||
COPY patches/rum.patch /ext-src
|
||||
#COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src
|
||||
COPY --from=ip4r-pg-build /ip4r.tar.gz /ext-src
|
||||
COPY --from=prefix-pg-build /prefix.tar.gz /ext-src
|
||||
@@ -945,7 +946,7 @@ COPY patches/pg_hintplan.patch /ext-src
|
||||
COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
|
||||
COPY patches/pg_cron.patch /ext-src
|
||||
#COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
|
||||
COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src
|
||||
#COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src
|
||||
COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src
|
||||
COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src
|
||||
COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src
|
||||
@@ -960,6 +961,7 @@ RUN cd /ext-src/ && for f in *.tar.gz; \
|
||||
rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \
|
||||
|| exit 1; rm -f $f; done
|
||||
RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
|
||||
RUN cd /ext-src/rum-src && patch -p1 <../rum.patch
|
||||
# cmake is required for the h3 test
|
||||
RUN apt-get update && apt-get install -y cmake
|
||||
RUN patch -p1 < /ext-src/pg_hintplan.patch
|
||||
|
||||
@@ -4,6 +4,11 @@ version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[features]
|
||||
default = []
|
||||
# Enables test specific features.
|
||||
testing = []
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
async-compression.workspace = true
|
||||
|
||||
@@ -400,7 +400,15 @@ impl ComputeNode {
|
||||
pub fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
|
||||
let mut retry_period_ms = 500.0;
|
||||
let mut attempts = 0;
|
||||
let max_attempts = 10;
|
||||
const DEFAULT_ATTEMPTS: u16 = 10;
|
||||
#[cfg(feature = "testing")]
|
||||
let max_attempts = if let Ok(v) = env::var("NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES") {
|
||||
u16::from_str(&v).unwrap()
|
||||
} else {
|
||||
DEFAULT_ATTEMPTS
|
||||
};
|
||||
#[cfg(not(feature = "testing"))]
|
||||
let max_attempts = DEFAULT_ATTEMPTS;
|
||||
loop {
|
||||
let result = self.try_get_basebackup(compute_state, lsn);
|
||||
match result {
|
||||
|
||||
@@ -289,7 +289,7 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {
|
||||
|
||||
fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command {
|
||||
for (var, val) in std::env::vars() {
|
||||
if var.starts_with("NEON_PAGESERVER_") {
|
||||
if var.starts_with("NEON_") {
|
||||
cmd = cmd.env(var, val);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -78,7 +78,7 @@ for pg_version in 14 15 16; do
|
||||
docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/
|
||||
rm -rf $TMPDIR
|
||||
# We are running tests now
|
||||
if docker exec -e SKIP=rum-src,timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \
|
||||
if docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \
|
||||
$TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt
|
||||
then
|
||||
cleanup
|
||||
|
||||
@@ -1,15 +1,15 @@
|
||||
#!/bin/bash
|
||||
set -x
|
||||
|
||||
cd /ext-src
|
||||
cd /ext-src || exit 2
|
||||
FAILED=
|
||||
LIST=$((echo ${SKIP} | sed 's/,/\n/g'; ls -d *-src) | sort | uniq -u)
|
||||
LIST=$( (echo "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u)
|
||||
for d in ${LIST}
|
||||
do
|
||||
[ -d ${d} ] || continue
|
||||
[ -d "${d}" ] || continue
|
||||
psql -c "select 1" >/dev/null || break
|
||||
make -C ${d} installcheck || FAILED="${d} ${FAILED}"
|
||||
USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}"
|
||||
done
|
||||
[ -z "${FAILED}" ] && exit 0
|
||||
echo ${FAILED}
|
||||
echo "${FAILED}"
|
||||
exit 1
|
||||
@@ -1,3 +1,4 @@
|
||||
use std::collections::HashSet;
|
||||
use std::str::FromStr;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
@@ -304,8 +305,8 @@ pub struct MetadataHealthRecord {
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct MetadataHealthUpdateRequest {
|
||||
pub healthy_tenant_shards: Vec<TenantShardId>,
|
||||
pub unhealthy_tenant_shards: Vec<TenantShardId>,
|
||||
pub healthy_tenant_shards: HashSet<TenantShardId>,
|
||||
pub unhealthy_tenant_shards: HashSet<TenantShardId>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
|
||||
@@ -637,6 +637,13 @@ pub struct TenantInfo {
|
||||
pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
|
||||
pub attachment_status: TenantAttachmentStatus,
|
||||
pub generation: u32,
|
||||
|
||||
/// Opaque explanation if gc is being blocked.
|
||||
///
|
||||
/// Only looked up for the individual tenant detail, not the listing. This is purely for
|
||||
/// debugging, not included in openapi.
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub gc_blocking: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
@@ -1427,6 +1434,7 @@ mod tests {
|
||||
current_physical_size: Some(42),
|
||||
attachment_status: TenantAttachmentStatus::Attached,
|
||||
generation: 1,
|
||||
gc_blocking: None,
|
||||
};
|
||||
let expected_active = json!({
|
||||
"id": original_active.id.to_string(),
|
||||
@@ -1449,6 +1457,7 @@ mod tests {
|
||||
current_physical_size: Some(42),
|
||||
attachment_status: TenantAttachmentStatus::Attached,
|
||||
generation: 1,
|
||||
gc_blocking: None,
|
||||
};
|
||||
let expected_broken = json!({
|
||||
"id": original_broken.id.to_string(),
|
||||
|
||||
@@ -144,6 +144,7 @@ impl RemotePath {
|
||||
///
|
||||
/// The WithDelimiter mode will populate `prefixes` and `keys` in the result. The
|
||||
/// NoDelimiter mode will only populate `keys`.
|
||||
#[derive(Copy, Clone)]
|
||||
pub enum ListingMode {
|
||||
WithDelimiter,
|
||||
NoDelimiter,
|
||||
|
||||
@@ -49,6 +49,7 @@ postgres_backend.workspace = true
|
||||
postgres-protocol.workspace = true
|
||||
postgres-types.workspace = true
|
||||
rand.workspace = true
|
||||
range-set-blaze = { version = "0.1.16", features = ["alloc"] }
|
||||
regex.workspace = true
|
||||
scopeguard.workspace = true
|
||||
serde.workspace = true
|
||||
@@ -107,3 +108,7 @@ harness = false
|
||||
[[bench]]
|
||||
name = "bench_walredo"
|
||||
harness = false
|
||||
|
||||
[[bench]]
|
||||
name = "bench_ingest"
|
||||
harness = false
|
||||
|
||||
235
pageserver/benches/bench_ingest.rs
Normal file
235
pageserver/benches/bench_ingest.rs
Normal file
@@ -0,0 +1,235 @@
|
||||
use std::{env, num::NonZeroUsize};
|
||||
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8PathBuf;
|
||||
use criterion::{criterion_group, criterion_main, Criterion};
|
||||
use pageserver::{
|
||||
config::PageServerConf,
|
||||
context::{DownloadBehavior, RequestContext},
|
||||
l0_flush::{L0FlushConfig, L0FlushGlobalState},
|
||||
page_cache,
|
||||
repository::Value,
|
||||
task_mgr::TaskKind,
|
||||
tenant::storage_layer::InMemoryLayer,
|
||||
virtual_file::{self, api::IoEngineKind},
|
||||
};
|
||||
use pageserver_api::{key::Key, shard::TenantShardId};
|
||||
use utils::{
|
||||
bin_ser::BeSer,
|
||||
id::{TenantId, TimelineId},
|
||||
};
|
||||
|
||||
// A very cheap hash for generating non-sequential keys.
|
||||
fn murmurhash32(mut h: u32) -> u32 {
|
||||
h ^= h >> 16;
|
||||
h = h.wrapping_mul(0x85ebca6b);
|
||||
h ^= h >> 13;
|
||||
h = h.wrapping_mul(0xc2b2ae35);
|
||||
h ^= h >> 16;
|
||||
h
|
||||
}
|
||||
|
||||
enum KeyLayout {
|
||||
/// Sequential unique keys
|
||||
Sequential,
|
||||
/// Random unique keys
|
||||
Random,
|
||||
/// Random keys, but only use the bits from the mask of them
|
||||
RandomReuse(u32),
|
||||
}
|
||||
|
||||
enum WriteDelta {
|
||||
Yes,
|
||||
No,
|
||||
}
|
||||
|
||||
async fn ingest(
|
||||
conf: &'static PageServerConf,
|
||||
put_size: usize,
|
||||
put_count: usize,
|
||||
key_layout: KeyLayout,
|
||||
write_delta: WriteDelta,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut lsn = utils::lsn::Lsn(1000);
|
||||
let mut key = Key::from_i128(0x0);
|
||||
|
||||
let timeline_id = TimelineId::generate();
|
||||
let tenant_id = TenantId::generate();
|
||||
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
|
||||
|
||||
tokio::fs::create_dir_all(conf.timeline_path(&tenant_shard_id, &timeline_id)).await?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||
|
||||
let layer = InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, &ctx).await?;
|
||||
|
||||
let data = Value::Image(Bytes::from(vec![0u8; put_size])).ser()?;
|
||||
let ctx = RequestContext::new(
|
||||
pageserver::task_mgr::TaskKind::WalReceiverConnectionHandler,
|
||||
pageserver::context::DownloadBehavior::Download,
|
||||
);
|
||||
|
||||
for i in 0..put_count {
|
||||
lsn += put_size as u64;
|
||||
|
||||
// Generate lots of keys within a single relation, which simulates the typical bulk ingest case: people
|
||||
// usually care the most about write performance when they're blasting a huge batch of data into a huge table.
|
||||
match key_layout {
|
||||
KeyLayout::Sequential => {
|
||||
// Use sequential order to illustrate the experience a user is likely to have
|
||||
// when ingesting bulk data.
|
||||
key.field6 = i as u32;
|
||||
}
|
||||
KeyLayout::Random => {
|
||||
// Use random-order keys to avoid giving a false advantage to data structures that are
|
||||
// faster when inserting on the end.
|
||||
key.field6 = murmurhash32(i as u32);
|
||||
}
|
||||
KeyLayout::RandomReuse(mask) => {
|
||||
// Use low bits only, to limit cardinality
|
||||
key.field6 = murmurhash32(i as u32) & mask;
|
||||
}
|
||||
}
|
||||
|
||||
layer.put_value(key, lsn, &data, &ctx).await?;
|
||||
}
|
||||
layer.freeze(lsn + 1).await;
|
||||
|
||||
if matches!(write_delta, WriteDelta::Yes) {
|
||||
let l0_flush_state = L0FlushGlobalState::new(L0FlushConfig::Direct {
|
||||
max_concurrency: NonZeroUsize::new(1).unwrap(),
|
||||
});
|
||||
let (_desc, path) = layer
|
||||
.write_to_disk(&ctx, None, l0_flush_state.inner())
|
||||
.await?
|
||||
.unwrap();
|
||||
tokio::fs::remove_file(path).await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Wrapper to instantiate a tokio runtime
|
||||
fn ingest_main(
|
||||
conf: &'static PageServerConf,
|
||||
put_size: usize,
|
||||
put_count: usize,
|
||||
key_layout: KeyLayout,
|
||||
write_delta: WriteDelta,
|
||||
) {
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
runtime.block_on(async move {
|
||||
let r = ingest(conf, put_size, put_count, key_layout, write_delta).await;
|
||||
if let Err(e) = r {
|
||||
panic!("{e:?}");
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/// Declare a series of benchmarks for the Pageserver's ingest write path.
|
||||
///
|
||||
/// This benchmark does not include WAL decode: it starts at InMemoryLayer::put_value, and ends either
|
||||
/// at freezing the ephemeral layer, or writing the ephemeral layer out to an L0 (depending on whether WriteDelta is set).
|
||||
///
|
||||
/// Genuine disk I/O is used, so expect results to differ depending on storage. However, when running on
|
||||
/// a fast disk, CPU is the bottleneck at time of writing.
|
||||
fn criterion_benchmark(c: &mut Criterion) {
|
||||
let temp_dir_parent: Utf8PathBuf = env::current_dir().unwrap().try_into().unwrap();
|
||||
let temp_dir = camino_tempfile::tempdir_in(temp_dir_parent).unwrap();
|
||||
eprintln!("Data directory: {}", temp_dir.path());
|
||||
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(
|
||||
pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()),
|
||||
));
|
||||
virtual_file::init(16384, IoEngineKind::TokioEpollUring);
|
||||
page_cache::init(conf.page_cache_size);
|
||||
|
||||
{
|
||||
let mut group = c.benchmark_group("ingest-small-values");
|
||||
let put_size = 100usize;
|
||||
let put_count = 128 * 1024 * 1024 / put_size;
|
||||
group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64));
|
||||
group.sample_size(10);
|
||||
group.bench_function("ingest 128MB/100b seq", |b| {
|
||||
b.iter(|| {
|
||||
ingest_main(
|
||||
conf,
|
||||
put_size,
|
||||
put_count,
|
||||
KeyLayout::Sequential,
|
||||
WriteDelta::Yes,
|
||||
)
|
||||
})
|
||||
});
|
||||
group.bench_function("ingest 128MB/100b rand", |b| {
|
||||
b.iter(|| {
|
||||
ingest_main(
|
||||
conf,
|
||||
put_size,
|
||||
put_count,
|
||||
KeyLayout::Random,
|
||||
WriteDelta::Yes,
|
||||
)
|
||||
})
|
||||
});
|
||||
group.bench_function("ingest 128MB/100b rand-1024keys", |b| {
|
||||
b.iter(|| {
|
||||
ingest_main(
|
||||
conf,
|
||||
put_size,
|
||||
put_count,
|
||||
KeyLayout::RandomReuse(0x3ff),
|
||||
WriteDelta::Yes,
|
||||
)
|
||||
})
|
||||
});
|
||||
group.bench_function("ingest 128MB/100b seq, no delta", |b| {
|
||||
b.iter(|| {
|
||||
ingest_main(
|
||||
conf,
|
||||
put_size,
|
||||
put_count,
|
||||
KeyLayout::Sequential,
|
||||
WriteDelta::No,
|
||||
)
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
{
|
||||
let mut group = c.benchmark_group("ingest-big-values");
|
||||
let put_size = 8192usize;
|
||||
let put_count = 128 * 1024 * 1024 / put_size;
|
||||
group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64));
|
||||
group.sample_size(10);
|
||||
group.bench_function("ingest 128MB/8k seq", |b| {
|
||||
b.iter(|| {
|
||||
ingest_main(
|
||||
conf,
|
||||
put_size,
|
||||
put_count,
|
||||
KeyLayout::Sequential,
|
||||
WriteDelta::Yes,
|
||||
)
|
||||
})
|
||||
});
|
||||
group.bench_function("ingest 128MB/8k seq, no delta", |b| {
|
||||
b.iter(|| {
|
||||
ingest_main(
|
||||
conf,
|
||||
put_size,
|
||||
put_count,
|
||||
KeyLayout::Sequential,
|
||||
WriteDelta::No,
|
||||
)
|
||||
})
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
criterion_group!(benches, criterion_benchmark);
|
||||
criterion_main!(benches);
|
||||
@@ -1,3 +1,4 @@
|
||||
use criterion::measurement::WallTime;
|
||||
use pageserver::keyspace::{KeyPartitioning, KeySpace};
|
||||
use pageserver::repository::Key;
|
||||
use pageserver::tenant::layer_map::LayerMap;
|
||||
@@ -15,7 +16,11 @@ use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use criterion::{black_box, criterion_group, criterion_main, Criterion};
|
||||
use criterion::{black_box, criterion_group, criterion_main, BenchmarkGroup, Criterion};
|
||||
|
||||
fn fixture_path(relative: &str) -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative)
|
||||
}
|
||||
|
||||
fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
|
||||
let mut layer_map = LayerMap::default();
|
||||
@@ -109,7 +114,7 @@ fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning
|
||||
// between each test run.
|
||||
fn bench_from_captest_env(c: &mut Criterion) {
|
||||
// TODO consider compressing this file
|
||||
let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt"));
|
||||
let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
|
||||
let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map);
|
||||
|
||||
// Test with uniform query pattern
|
||||
@@ -139,7 +144,7 @@ fn bench_from_captest_env(c: &mut Criterion) {
|
||||
fn bench_from_real_project(c: &mut Criterion) {
|
||||
// Init layer map
|
||||
let now = Instant::now();
|
||||
let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt"));
|
||||
let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
|
||||
println!("Finished layer map init in {:?}", now.elapsed());
|
||||
|
||||
// Choose uniformly distributed queries
|
||||
@@ -242,7 +247,72 @@ fn bench_sequential(c: &mut Criterion) {
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn bench_visibility_with_map(
|
||||
group: &mut BenchmarkGroup<WallTime>,
|
||||
layer_map: LayerMap,
|
||||
read_points: Vec<Lsn>,
|
||||
bench_name: &str,
|
||||
) {
|
||||
group.bench_function(bench_name, |b| {
|
||||
b.iter(|| black_box(layer_map.get_visibility(read_points.clone())));
|
||||
});
|
||||
}
|
||||
|
||||
// Benchmark using synthetic data. Arrange image layers on stacked diagonal lines.
|
||||
fn bench_visibility(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("visibility");
|
||||
{
|
||||
// Init layer map. Create 100_000 layers arranged in 1000 diagonal lines.
|
||||
let now = Instant::now();
|
||||
let mut layer_map = LayerMap::default();
|
||||
let mut updates = layer_map.batch_update();
|
||||
for i in 0..100_000 {
|
||||
let i32 = (i as u32) % 100;
|
||||
let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
|
||||
let layer = PersistentLayerDesc::new_img(
|
||||
TenantShardId::unsharded(TenantId::generate()),
|
||||
TimelineId::generate(),
|
||||
zero.add(10 * i32)..zero.add(10 * i32 + 1),
|
||||
Lsn(i),
|
||||
0,
|
||||
);
|
||||
updates.insert_historic(layer);
|
||||
}
|
||||
updates.flush();
|
||||
println!("Finished layer map init in {:?}", now.elapsed());
|
||||
|
||||
let mut read_points = Vec::new();
|
||||
for i in (0..100_000).step_by(1000) {
|
||||
read_points.push(Lsn(i));
|
||||
}
|
||||
|
||||
bench_visibility_with_map(&mut group, layer_map, read_points, "sequential");
|
||||
}
|
||||
|
||||
{
|
||||
let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
|
||||
let read_points = vec![Lsn(0x1C760FA190)];
|
||||
bench_visibility_with_map(&mut group, layer_map, read_points, "real_map");
|
||||
|
||||
let layer_map = build_layer_map(fixture_path("benches/odd-brook-layernames.txt"));
|
||||
let read_points = vec![
|
||||
Lsn(0x1C760FA190),
|
||||
Lsn(0x000000931BEAD539),
|
||||
Lsn(0x000000931BF63011),
|
||||
Lsn(0x000000931B33AE68),
|
||||
Lsn(0x00000038E67ABFA0),
|
||||
Lsn(0x000000931B33AE68),
|
||||
Lsn(0x000000914E3F38F0),
|
||||
Lsn(0x000000931B33AE68),
|
||||
];
|
||||
bench_visibility_with_map(&mut group, layer_map, read_points, "real_map_many_branches");
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
criterion_group!(group_1, bench_from_captest_env);
|
||||
criterion_group!(group_2, bench_from_real_project);
|
||||
criterion_group!(group_3, bench_sequential);
|
||||
criterion_main!(group_1, group_2, group_3);
|
||||
criterion_group!(group_4, bench_visibility);
|
||||
criterion_main!(group_1, group_2, group_3, group_4);
|
||||
|
||||
@@ -17,11 +17,9 @@ use pageserver::config::PageserverIdentity;
|
||||
use pageserver::control_plane_client::ControlPlaneClient;
|
||||
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
|
||||
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
|
||||
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
|
||||
use pageserver::task_mgr::{COMPUTE_REQUEST_RUNTIME, WALRECEIVER_RUNTIME};
|
||||
use pageserver::tenant::{secondary, TenantSharedResources};
|
||||
use pageserver::{
|
||||
CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, LibpqEndpointListener,
|
||||
};
|
||||
use pageserver::{CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use tokio::signal::unix::SignalKind;
|
||||
use tokio::time::Instant;
|
||||
@@ -31,11 +29,9 @@ use tracing::*;
|
||||
use metrics::set_build_info_metric;
|
||||
use pageserver::{
|
||||
config::PageServerConf,
|
||||
context::{DownloadBehavior, RequestContext},
|
||||
deletion_queue::DeletionQueue,
|
||||
http, page_cache, page_service, task_mgr,
|
||||
task_mgr::TaskKind,
|
||||
task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME},
|
||||
task_mgr::{BACKGROUND_RUNTIME, MGMT_REQUEST_RUNTIME},
|
||||
tenant::mgr,
|
||||
virtual_file,
|
||||
};
|
||||
@@ -593,30 +589,13 @@ fn start_pageserver(
|
||||
|
||||
// Spawn a task to listen for libpq connections. It will spawn further tasks
|
||||
// for each connection. We created the listener earlier already.
|
||||
let libpq_listener = {
|
||||
let cancel = CancellationToken::new();
|
||||
let libpq_ctx = RequestContext::todo_child(
|
||||
TaskKind::LibpqEndpointListener,
|
||||
// listener task shouldn't need to download anything. (We will
|
||||
// create a separate sub-contexts for each connection, with their
|
||||
// own download behavior. This context is used only to listen and
|
||||
// accept connections.)
|
||||
DownloadBehavior::Error,
|
||||
);
|
||||
|
||||
let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
|
||||
"libpq listener",
|
||||
page_service::libpq_listener_main(
|
||||
tenant_manager.clone(),
|
||||
pg_auth,
|
||||
pageserver_listener,
|
||||
conf.pg_auth_type,
|
||||
libpq_ctx,
|
||||
cancel.clone(),
|
||||
),
|
||||
));
|
||||
LibpqEndpointListener(CancellableTask { task, cancel })
|
||||
};
|
||||
let page_service = page_service::spawn(conf, tenant_manager.clone(), pg_auth, {
|
||||
let _entered = COMPUTE_REQUEST_RUNTIME.enter(); // TcpListener::from_std requires it
|
||||
pageserver_listener
|
||||
.set_nonblocking(true)
|
||||
.context("set listener to nonblocking")?;
|
||||
tokio::net::TcpListener::from_std(pageserver_listener).context("create tokio listener")?
|
||||
});
|
||||
|
||||
let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
|
||||
|
||||
@@ -644,7 +623,7 @@ fn start_pageserver(
|
||||
shutdown_pageserver.take();
|
||||
pageserver::shutdown_pageserver(
|
||||
http_endpoint_listener,
|
||||
libpq_listener,
|
||||
page_service,
|
||||
consumption_metrics_tasks,
|
||||
disk_usage_eviction_task,
|
||||
&tenant_manager,
|
||||
|
||||
@@ -308,6 +308,45 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
type: string
|
||||
|
||||
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/block_gc:
|
||||
parameters:
|
||||
- name: tenant_shard_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: timeline_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
post:
|
||||
description: Persistently add a gc blocking at the tenant level because of this timeline
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
|
||||
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/unblock_gc:
|
||||
parameters:
|
||||
- name: tenant_shard_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: timeline_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
post:
|
||||
description: Persistently remove a tenant level gc blocking for this timeline
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
|
||||
/v1/tenant/{tenant_shard_id}/location_config:
|
||||
parameters:
|
||||
- name: tenant_shard_id
|
||||
|
||||
@@ -296,6 +296,11 @@ impl From<GetActiveTenantError> for ApiError {
|
||||
GetActiveTenantError::WaitForActiveTimeout { .. } => {
|
||||
ApiError::ResourceUnavailable(format!("{}", e).into())
|
||||
}
|
||||
GetActiveTenantError::SwitchedTenant => {
|
||||
// in our HTTP handlers, this error doesn't happen
|
||||
// TODO: separate error types
|
||||
ApiError::ResourceUnavailable("switched tenant".into())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -930,6 +935,7 @@ async fn tenant_list_handler(
|
||||
generation: (*gen)
|
||||
.into()
|
||||
.expect("Tenants are always attached with a generation"),
|
||||
gc_blocking: None,
|
||||
})
|
||||
.collect::<Vec<TenantInfo>>();
|
||||
|
||||
@@ -981,6 +987,7 @@ async fn tenant_status(
|
||||
.generation()
|
||||
.into()
|
||||
.expect("Tenants are always attached with a generation"),
|
||||
gc_blocking: tenant.gc_block.summary().map(|x| format!("{x:?}")),
|
||||
},
|
||||
walredo: tenant.wal_redo_manager_status(),
|
||||
timelines: tenant.list_timeline_ids(),
|
||||
@@ -1221,6 +1228,72 @@ async fn evict_timeline_layer_handler(
|
||||
}
|
||||
}
|
||||
|
||||
async fn timeline_gc_blocking_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
block_or_unblock_gc(request, true).await
|
||||
}
|
||||
|
||||
async fn timeline_gc_unblocking_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
block_or_unblock_gc(request, false).await
|
||||
}
|
||||
|
||||
/// Adding a block is `POST ../block_gc`, removing a block is `POST ../unblock_gc`.
|
||||
///
|
||||
/// Both are technically unsafe because they might fire off index uploads, thus they are POST.
|
||||
async fn block_or_unblock_gc(
|
||||
request: Request<Body>,
|
||||
block: bool,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
use crate::tenant::{
|
||||
remote_timeline_client::WaitCompletionError, upload_queue::NotInitialized,
|
||||
};
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let state = get_state(&request);
|
||||
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
|
||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||
|
||||
let timeline = tenant.get_timeline(timeline_id, true)?;
|
||||
|
||||
let fut = async {
|
||||
if block {
|
||||
timeline.block_gc(&tenant).await.map(|_| ())
|
||||
} else {
|
||||
timeline.unblock_gc(&tenant).await
|
||||
}
|
||||
};
|
||||
|
||||
let span = tracing::info_span!(
|
||||
"block_or_unblock_gc",
|
||||
tenant_id = %tenant_shard_id.tenant_id,
|
||||
shard_id = %tenant_shard_id.shard_slug(),
|
||||
timeline_id = %timeline_id,
|
||||
block = block,
|
||||
);
|
||||
|
||||
let res = fut.instrument(span).await;
|
||||
|
||||
res.map_err(|e| {
|
||||
if e.is::<NotInitialized>() || e.is::<WaitCompletionError>() {
|
||||
ApiError::ShuttingDown
|
||||
} else {
|
||||
ApiError::InternalServerError(e)
|
||||
}
|
||||
})?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
/// Get tenant_size SVG graph along with the JSON data.
|
||||
fn synthetic_size_html_response(
|
||||
inputs: ModelInputs,
|
||||
@@ -2899,6 +2972,14 @@ pub fn make_router(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
|
||||
|r| api_handler(r, evict_timeline_layer_handler),
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/block_gc",
|
||||
|r| api_handler(r, timeline_gc_blocking_handler),
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/unblock_gc",
|
||||
|r| api_handler(r, timeline_gc_unblocking_handler),
|
||||
)
|
||||
.post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| {
|
||||
api_handler(r, secondary_upload_handler)
|
||||
})
|
||||
|
||||
@@ -24,7 +24,7 @@ impl Default for L0FlushConfig {
|
||||
#[derive(Clone)]
|
||||
pub struct L0FlushGlobalState(Arc<Inner>);
|
||||
|
||||
pub(crate) enum Inner {
|
||||
pub enum Inner {
|
||||
PageCached,
|
||||
Direct { semaphore: tokio::sync::Semaphore },
|
||||
}
|
||||
@@ -40,7 +40,7 @@ impl L0FlushGlobalState {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn inner(&self) -> &Arc<Inner> {
|
||||
pub fn inner(&self) -> &Arc<Inner> {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,6 +12,8 @@ pub mod disk_usage_eviction_task;
|
||||
pub mod http;
|
||||
pub mod import_datadir;
|
||||
pub mod l0_flush;
|
||||
|
||||
use futures::{stream::FuturesUnordered, StreamExt};
|
||||
pub use pageserver_api::keyspace;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
pub mod aux_file;
|
||||
@@ -30,14 +32,13 @@ pub mod walingest;
|
||||
pub mod walrecord;
|
||||
pub mod walredo;
|
||||
|
||||
use crate::task_mgr::TaskKind;
|
||||
use camino::Utf8Path;
|
||||
use deletion_queue::DeletionQueue;
|
||||
use tenant::{
|
||||
mgr::{BackgroundPurges, TenantManager},
|
||||
secondary,
|
||||
};
|
||||
use tracing::info;
|
||||
use tracing::{info, info_span};
|
||||
|
||||
/// Current storage format version
|
||||
///
|
||||
@@ -63,7 +64,6 @@ pub struct CancellableTask {
|
||||
pub cancel: CancellationToken,
|
||||
}
|
||||
pub struct HttpEndpointListener(pub CancellableTask);
|
||||
pub struct LibpqEndpointListener(pub CancellableTask);
|
||||
pub struct ConsumptionMetricsTasks(pub CancellableTask);
|
||||
pub struct DiskUsageEvictionTask(pub CancellableTask);
|
||||
impl CancellableTask {
|
||||
@@ -77,7 +77,7 @@ impl CancellableTask {
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn shutdown_pageserver(
|
||||
http_listener: HttpEndpointListener,
|
||||
libpq_listener: LibpqEndpointListener,
|
||||
page_service: page_service::Listener,
|
||||
consumption_metrics_worker: ConsumptionMetricsTasks,
|
||||
disk_usage_eviction_task: Option<DiskUsageEvictionTask>,
|
||||
tenant_manager: &TenantManager,
|
||||
@@ -87,10 +87,83 @@ pub async fn shutdown_pageserver(
|
||||
exit_code: i32,
|
||||
) {
|
||||
use std::time::Duration;
|
||||
|
||||
// If the orderly shutdown below takes too long, we still want to make
|
||||
// sure that all walredo processes are killed and wait()ed on by us, not systemd.
|
||||
//
|
||||
// (Leftover walredo processes are the hypothesized trigger for the systemd freezes
|
||||
// that we keep seeing in prod => https://github.com/neondatabase/cloud/issues/11387.
|
||||
//
|
||||
// We use a thread instead of a tokio task because the background runtime is likely busy
|
||||
// with the final flushing / uploads. This activity here has priority, and due to lack
|
||||
// of scheduling priority feature sin the tokio scheduler, using a separate thread is
|
||||
// an effective priority booster.
|
||||
let walredo_extraordinary_shutdown_thread_span = {
|
||||
let span = info_span!(parent: None, "walredo_extraordinary_shutdown_thread");
|
||||
span.follows_from(tracing::Span::current());
|
||||
span
|
||||
};
|
||||
let walredo_extraordinary_shutdown_thread_cancel = CancellationToken::new();
|
||||
let walredo_extraordinary_shutdown_thread = std::thread::spawn({
|
||||
let walredo_extraordinary_shutdown_thread_cancel =
|
||||
walredo_extraordinary_shutdown_thread_cancel.clone();
|
||||
move || {
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
let _entered = rt.enter();
|
||||
let _entered = walredo_extraordinary_shutdown_thread_span.enter();
|
||||
if let Ok(()) = rt.block_on(tokio::time::timeout(
|
||||
Duration::from_secs(8),
|
||||
walredo_extraordinary_shutdown_thread_cancel.cancelled(),
|
||||
)) {
|
||||
info!("cancellation requested");
|
||||
return;
|
||||
}
|
||||
let managers = tenant::WALREDO_MANAGERS
|
||||
.lock()
|
||||
.unwrap()
|
||||
// prevents new walredo managers from being inserted
|
||||
.take()
|
||||
.expect("only we take()");
|
||||
// Use FuturesUnordered to get in queue early for each manager's
|
||||
// heavier_once_cell semaphore wait list.
|
||||
// Also, for idle tenants that for some reason haven't
|
||||
// shut down yet, it's quite likely that we're not going
|
||||
// to get Poll::Pending once.
|
||||
let mut futs: FuturesUnordered<_> = managers
|
||||
.into_iter()
|
||||
.filter_map(|(_, mgr)| mgr.upgrade())
|
||||
.map(|mgr| async move { tokio::task::unconstrained(mgr.shutdown()).await })
|
||||
.collect();
|
||||
info!(count=%futs.len(), "built FuturesUnordered");
|
||||
let mut last_log_at = std::time::Instant::now();
|
||||
#[derive(Debug, Default)]
|
||||
struct Results {
|
||||
initiated: u64,
|
||||
already: u64,
|
||||
}
|
||||
let mut results = Results::default();
|
||||
while let Some(we_initiated) = rt.block_on(futs.next()) {
|
||||
if we_initiated {
|
||||
results.initiated += 1;
|
||||
} else {
|
||||
results.already += 1;
|
||||
}
|
||||
if last_log_at.elapsed() > Duration::from_millis(100) {
|
||||
info!(remaining=%futs.len(), ?results, "progress");
|
||||
last_log_at = std::time::Instant::now();
|
||||
}
|
||||
}
|
||||
info!(?results, "done");
|
||||
}
|
||||
});
|
||||
|
||||
// Shut down the libpq endpoint task. This prevents new connections from
|
||||
// being accepted.
|
||||
timed(
|
||||
libpq_listener.0.shutdown(),
|
||||
let remaining_connections = timed(
|
||||
page_service.stop_accepting(),
|
||||
"shutdown LibpqEndpointListener",
|
||||
Duration::from_secs(1),
|
||||
)
|
||||
@@ -108,7 +181,7 @@ pub async fn shutdown_pageserver(
|
||||
// Shut down any page service tasks: any in-progress work for particular timelines or tenants
|
||||
// should already have been canclled via mgr::shutdown_all_tenants
|
||||
timed(
|
||||
task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
|
||||
remaining_connections.shutdown(),
|
||||
"shutdown PageRequestHandlers",
|
||||
Duration::from_secs(1),
|
||||
)
|
||||
@@ -162,6 +235,12 @@ pub async fn shutdown_pageserver(
|
||||
Duration::from_secs(1),
|
||||
)
|
||||
.await;
|
||||
|
||||
info!("cancel & join walredo_extraordinary_shutdown_thread");
|
||||
walredo_extraordinary_shutdown_thread_cancel.cancel();
|
||||
walredo_extraordinary_shutdown_thread.join().unwrap();
|
||||
info!("walredo_extraordinary_shutdown_thread done");
|
||||
|
||||
info!("Shut down successfully completed");
|
||||
std::process::exit(exit_code);
|
||||
}
|
||||
|
||||
@@ -525,6 +525,15 @@ static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static VISIBLE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_visible_physical_size",
|
||||
"The size of the layer files present in the pageserver's filesystem.",
|
||||
&["tenant_id", "shard_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
|
||||
register_uint_gauge!(
|
||||
"pageserver_resident_physical_size_global",
|
||||
@@ -2204,6 +2213,7 @@ pub(crate) struct TimelineMetrics {
|
||||
pub(crate) layer_count_delta: UIntGauge,
|
||||
pub standby_horizon_gauge: IntGauge,
|
||||
pub resident_physical_size_gauge: UIntGauge,
|
||||
pub visible_physical_size_gauge: UIntGauge,
|
||||
/// copy of LayeredTimeline.current_logical_size
|
||||
pub current_logical_size_gauge: UIntGauge,
|
||||
pub aux_file_size_gauge: IntGauge,
|
||||
@@ -2326,6 +2336,9 @@ impl TimelineMetrics {
|
||||
let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
let visible_physical_size_gauge = VISIBLE_PHYSICAL_SIZE
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
// TODO: we shouldn't expose this metric
|
||||
let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
@@ -2380,6 +2393,7 @@ impl TimelineMetrics {
|
||||
layer_count_delta,
|
||||
standby_horizon_gauge,
|
||||
resident_physical_size_gauge,
|
||||
visible_physical_size_gauge,
|
||||
current_logical_size_gauge,
|
||||
aux_file_size_gauge,
|
||||
directory_entries_count_gauge,
|
||||
@@ -2431,6 +2445,7 @@ impl TimelineMetrics {
|
||||
RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
|
||||
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
}
|
||||
let _ = VISIBLE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
|
||||
let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -33,6 +33,7 @@ use remote_storage::GenericRemoteStorage;
|
||||
use remote_storage::TimeoutOrCancel;
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt;
|
||||
use std::sync::Weak;
|
||||
use std::time::SystemTime;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use tokio::io::BufReader;
|
||||
@@ -147,6 +148,7 @@ pub(crate) mod timeline;
|
||||
|
||||
pub mod size;
|
||||
|
||||
mod gc_block;
|
||||
pub(crate) mod throttle;
|
||||
|
||||
pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
@@ -302,6 +304,12 @@ pub struct Tenant {
|
||||
/// An ongoing timeline detach must be checked during attempts to GC or compact a timeline.
|
||||
ongoing_timeline_detach: std::sync::Mutex<Option<(TimelineId, utils::completion::Barrier)>>,
|
||||
|
||||
/// `index_part.json` based gc blocking reason tracking.
|
||||
///
|
||||
/// New gc iterations must start a new iteration by acquiring `GcBlock::start` before
|
||||
/// proceeding.
|
||||
pub(crate) gc_block: gc_block::GcBlock,
|
||||
|
||||
l0_flush_global_state: L0FlushGlobalState,
|
||||
}
|
||||
|
||||
@@ -312,14 +320,66 @@ impl std::fmt::Debug for Tenant {
|
||||
}
|
||||
|
||||
pub(crate) enum WalRedoManager {
|
||||
Prod(PostgresRedoManager),
|
||||
Prod(WalredoManagerId, PostgresRedoManager),
|
||||
#[cfg(test)]
|
||||
Test(harness::TestRedoManager),
|
||||
}
|
||||
|
||||
impl From<PostgresRedoManager> for WalRedoManager {
|
||||
fn from(mgr: PostgresRedoManager) -> Self {
|
||||
Self::Prod(mgr)
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
#[error("pageserver is shutting down")]
|
||||
pub(crate) struct GlobalShutDown;
|
||||
|
||||
impl WalRedoManager {
|
||||
pub(crate) fn new(mgr: PostgresRedoManager) -> Result<Arc<Self>, GlobalShutDown> {
|
||||
let id = WalredoManagerId::next();
|
||||
let arc = Arc::new(Self::Prod(id, mgr));
|
||||
let mut guard = WALREDO_MANAGERS.lock().unwrap();
|
||||
match &mut *guard {
|
||||
Some(map) => {
|
||||
map.insert(id, Arc::downgrade(&arc));
|
||||
Ok(arc)
|
||||
}
|
||||
None => Err(GlobalShutDown),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for WalRedoManager {
|
||||
fn drop(&mut self) {
|
||||
match self {
|
||||
Self::Prod(id, _) => {
|
||||
let mut guard = WALREDO_MANAGERS.lock().unwrap();
|
||||
if let Some(map) = &mut *guard {
|
||||
map.remove(id).expect("new() registers, drop() unregisters");
|
||||
}
|
||||
}
|
||||
#[cfg(test)]
|
||||
Self::Test(_) => {
|
||||
// Not applicable to test redo manager
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Global registry of all walredo managers so that [`crate::shutdown_pageserver`] can shut down
|
||||
/// the walredo processes outside of the regular order.
|
||||
///
|
||||
/// This is necessary to work around a systemd bug where it freezes if there are
|
||||
/// walredo processes left => <https://github.com/neondatabase/cloud/issues/11387>
|
||||
#[allow(clippy::type_complexity)]
|
||||
pub(crate) static WALREDO_MANAGERS: once_cell::sync::Lazy<
|
||||
Mutex<Option<HashMap<WalredoManagerId, Weak<WalRedoManager>>>>,
|
||||
> = once_cell::sync::Lazy::new(|| Mutex::new(Some(HashMap::new())));
|
||||
#[derive(PartialEq, Eq, Hash, Clone, Copy, Debug)]
|
||||
pub(crate) struct WalredoManagerId(u64);
|
||||
impl WalredoManagerId {
|
||||
pub fn next() -> Self {
|
||||
static NEXT: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1);
|
||||
let id = NEXT.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
|
||||
if id == 0 {
|
||||
panic!("WalredoManagerId::new() returned 0, indicating wraparound, risking it's no longer unique");
|
||||
}
|
||||
Self(id)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -331,19 +391,20 @@ impl From<harness::TestRedoManager> for WalRedoManager {
|
||||
}
|
||||
|
||||
impl WalRedoManager {
|
||||
pub(crate) async fn shutdown(&self) {
|
||||
pub(crate) async fn shutdown(&self) -> bool {
|
||||
match self {
|
||||
Self::Prod(mgr) => mgr.shutdown().await,
|
||||
Self::Prod(_, mgr) => mgr.shutdown().await,
|
||||
#[cfg(test)]
|
||||
Self::Test(_) => {
|
||||
// Not applicable to test redo manager
|
||||
true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) {
|
||||
match self {
|
||||
Self::Prod(mgr) => mgr.maybe_quiesce(idle_timeout),
|
||||
Self::Prod(_, mgr) => mgr.maybe_quiesce(idle_timeout),
|
||||
#[cfg(test)]
|
||||
Self::Test(_) => {
|
||||
// Not applicable to test redo manager
|
||||
@@ -363,7 +424,7 @@ impl WalRedoManager {
|
||||
pg_version: u32,
|
||||
) -> Result<bytes::Bytes, walredo::Error> {
|
||||
match self {
|
||||
Self::Prod(mgr) => {
|
||||
Self::Prod(_, mgr) => {
|
||||
mgr.request_redo(key, lsn, base_img, records, pg_version)
|
||||
.await
|
||||
}
|
||||
@@ -377,7 +438,7 @@ impl WalRedoManager {
|
||||
|
||||
pub(crate) fn status(&self) -> Option<WalRedoManagerStatus> {
|
||||
match self {
|
||||
WalRedoManager::Prod(m) => Some(m.status()),
|
||||
WalRedoManager::Prod(_, m) => Some(m.status()),
|
||||
#[cfg(test)]
|
||||
WalRedoManager::Test(_) => None,
|
||||
}
|
||||
@@ -386,6 +447,8 @@ impl WalRedoManager {
|
||||
|
||||
#[derive(Debug, thiserror::Error, PartialEq, Eq)]
|
||||
pub enum GetTimelineError {
|
||||
#[error("Timeline is shutting down")]
|
||||
ShuttingDown,
|
||||
#[error("Timeline {tenant_id}/{timeline_id} is not active, state: {state:?}")]
|
||||
NotActive {
|
||||
tenant_id: TenantShardId,
|
||||
@@ -675,11 +738,9 @@ impl Tenant {
|
||||
init_order: Option<InitializationOrder>,
|
||||
mode: SpawnMode,
|
||||
ctx: &RequestContext,
|
||||
) -> Arc<Tenant> {
|
||||
let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
)));
|
||||
) -> Result<Arc<Tenant>, GlobalShutDown> {
|
||||
let wal_redo_manager =
|
||||
WalRedoManager::new(PostgresRedoManager::new(conf, tenant_shard_id))?;
|
||||
|
||||
let TenantSharedResources {
|
||||
broker_client,
|
||||
@@ -878,7 +939,7 @@ impl Tenant {
|
||||
}
|
||||
.instrument(tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation)),
|
||||
);
|
||||
tenant
|
||||
Ok(tenant)
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
@@ -982,6 +1043,8 @@ impl Tenant {
|
||||
}
|
||||
}
|
||||
|
||||
let mut gc_blocks = HashMap::new();
|
||||
|
||||
// For every timeline, download the metadata file, scan the local directory,
|
||||
// and build a layer map that contains an entry for each remote and local
|
||||
// layer file.
|
||||
@@ -991,6 +1054,16 @@ impl Tenant {
|
||||
.remove(&timeline_id)
|
||||
.expect("just put it in above");
|
||||
|
||||
if let Some(blocking) = index_part.gc_blocking.as_ref() {
|
||||
// could just filter these away, but it helps while testing
|
||||
anyhow::ensure!(
|
||||
!blocking.reasons.is_empty(),
|
||||
"index_part for {timeline_id} is malformed: it should not have gc blocking with zero reasons"
|
||||
);
|
||||
let prev = gc_blocks.insert(timeline_id, blocking.reasons);
|
||||
assert!(prev.is_none());
|
||||
}
|
||||
|
||||
// TODO again handle early failure
|
||||
self.load_remote_timeline(
|
||||
timeline_id,
|
||||
@@ -1035,6 +1108,8 @@ impl Tenant {
|
||||
// IndexPart is the source of truth.
|
||||
self.clean_up_timelines(&existent_timelines)?;
|
||||
|
||||
self.gc_block.set_scanned(gc_blocks);
|
||||
|
||||
fail::fail_point!("attach-before-activate", |_| {
|
||||
anyhow::bail!("attach-before-activate");
|
||||
});
|
||||
@@ -1580,7 +1655,7 @@ impl Tenant {
|
||||
self: Arc<Self>,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<(), DeleteTimelineError> {
|
||||
DeleteTimelineFlow::run(&self, timeline_id, false).await?;
|
||||
DeleteTimelineFlow::run(&self, timeline_id).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1625,6 +1700,14 @@ impl Tenant {
|
||||
}
|
||||
}
|
||||
|
||||
let _guard = match self.gc_block.start().await {
|
||||
Ok(guard) => guard,
|
||||
Err(reasons) => {
|
||||
info!("Skipping GC: {reasons}");
|
||||
return Ok(GcResult::default());
|
||||
}
|
||||
};
|
||||
|
||||
self.gc_iteration_internal(target_timeline_id, horizon, pitr, cancel, ctx)
|
||||
.await
|
||||
}
|
||||
@@ -2637,6 +2720,7 @@ impl Tenant {
|
||||
)),
|
||||
tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
|
||||
ongoing_timeline_detach: std::sync::Mutex::default(),
|
||||
gc_block: Default::default(),
|
||||
l0_flush_global_state,
|
||||
}
|
||||
}
|
||||
@@ -4038,7 +4122,7 @@ pub(crate) mod harness {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::{BTreeMap, BTreeSet};
|
||||
|
||||
use super::*;
|
||||
use crate::keyspace::KeySpaceAccum;
|
||||
@@ -4713,7 +4797,7 @@ mod tests {
|
||||
lsn: Lsn,
|
||||
repeat: usize,
|
||||
key_count: usize,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> anyhow::Result<HashMap<Key, BTreeSet<Lsn>>> {
|
||||
let compact = true;
|
||||
bulk_insert_maybe_compact_gc(tenant, timeline, ctx, lsn, repeat, key_count, compact).await
|
||||
}
|
||||
@@ -4726,7 +4810,9 @@ mod tests {
|
||||
repeat: usize,
|
||||
key_count: usize,
|
||||
compact: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> anyhow::Result<HashMap<Key, BTreeSet<Lsn>>> {
|
||||
let mut inserted: HashMap<Key, BTreeSet<Lsn>> = Default::default();
|
||||
|
||||
let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
|
||||
let mut blknum = 0;
|
||||
|
||||
@@ -4747,6 +4833,7 @@ mod tests {
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
inserted.entry(test_key).or_default().insert(lsn);
|
||||
writer.finish_write(lsn);
|
||||
drop(writer);
|
||||
|
||||
@@ -4771,7 +4858,7 @@ mod tests {
|
||||
assert_eq!(res.layers_removed, 0, "this never removes anything");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
Ok(inserted)
|
||||
}
|
||||
|
||||
//
|
||||
@@ -4818,7 +4905,7 @@ mod tests {
|
||||
.await?;
|
||||
|
||||
let lsn = Lsn(0x10);
|
||||
bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?;
|
||||
let inserted = bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?;
|
||||
|
||||
let guard = tline.layers.read().await;
|
||||
guard.layer_map().dump(true, &ctx).await?;
|
||||
@@ -4879,9 +4966,39 @@ mod tests {
|
||||
&ctx,
|
||||
)
|
||||
.await;
|
||||
tline
|
||||
.validate_get_vectored_impl(&vectored_res, read, reads_lsn, &ctx)
|
||||
.await;
|
||||
|
||||
let mut expected_lsns: HashMap<Key, Lsn> = Default::default();
|
||||
let mut expect_missing = false;
|
||||
let mut key = read.start().unwrap();
|
||||
while key != read.end().unwrap() {
|
||||
if let Some(lsns) = inserted.get(&key) {
|
||||
let expected_lsn = lsns.iter().rfind(|lsn| **lsn <= reads_lsn);
|
||||
match expected_lsn {
|
||||
Some(lsn) => {
|
||||
expected_lsns.insert(key, *lsn);
|
||||
}
|
||||
None => {
|
||||
expect_missing = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
expect_missing = true;
|
||||
break;
|
||||
}
|
||||
|
||||
key = key.next();
|
||||
}
|
||||
|
||||
if expect_missing {
|
||||
assert!(matches!(vectored_res, Err(GetVectoredError::MissingKey(_))));
|
||||
} else {
|
||||
for (key, image) in vectored_res? {
|
||||
let expected_lsn = expected_lsns.get(&key).expect("determined above");
|
||||
let expected_image = test_img(&format!("{} at {}", key.field6, expected_lsn));
|
||||
assert_eq!(image?, expected_image);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -4931,10 +5048,6 @@ mod tests {
|
||||
)
|
||||
.await;
|
||||
|
||||
child_timeline
|
||||
.validate_get_vectored_impl(&vectored_res, aux_keyspace, read_lsn, &ctx)
|
||||
.await;
|
||||
|
||||
let images = vectored_res?;
|
||||
assert!(images.is_empty());
|
||||
Ok(())
|
||||
@@ -6845,7 +6958,10 @@ mod tests {
|
||||
}
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
|
||||
tline
|
||||
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
for (idx, expected) in expected_result.iter().enumerate() {
|
||||
assert_eq!(
|
||||
@@ -6909,7 +7025,11 @@ mod tests {
|
||||
vec![
|
||||
// Image layer at GC horizon
|
||||
PersistentLayerKey {
|
||||
key_range: Key::MIN..Key::MAX,
|
||||
key_range: {
|
||||
let mut key = Key::MAX;
|
||||
key.field6 -= 1;
|
||||
Key::MIN..key
|
||||
},
|
||||
lsn_range: Lsn(0x30)..Lsn(0x31),
|
||||
is_delta: false
|
||||
},
|
||||
@@ -6928,6 +7048,18 @@ mod tests {
|
||||
]
|
||||
);
|
||||
|
||||
// increase GC horizon and compact again
|
||||
{
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
guard.cutoffs.time = Lsn(0x40);
|
||||
guard.cutoffs.space = Lsn(0x40);
|
||||
}
|
||||
tline
|
||||
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -7260,7 +7392,10 @@ mod tests {
|
||||
}
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
|
||||
tline
|
||||
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
for idx in 0..10 {
|
||||
assert_eq!(
|
||||
@@ -7279,6 +7414,18 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
// increase GC horizon and compact again
|
||||
{
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
guard.cutoffs.time = Lsn(0x40);
|
||||
guard.cutoffs.space = Lsn(0x40);
|
||||
}
|
||||
tline
|
||||
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -7347,6 +7494,7 @@ mod tests {
|
||||
Lsn(0x60),
|
||||
&[Lsn(0x20), Lsn(0x40), Lsn(0x50)],
|
||||
3,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -7471,7 +7619,7 @@ mod tests {
|
||||
),
|
||||
];
|
||||
let res = tline
|
||||
.generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3)
|
||||
.generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3, None)
|
||||
.await
|
||||
.unwrap();
|
||||
let expected_res = KeyHistoryRetention {
|
||||
@@ -7517,6 +7665,114 @@ mod tests {
|
||||
};
|
||||
assert_eq!(res, expected_res);
|
||||
|
||||
// In case of branch compaction, the branch itself does not have the full history, and we need to provide
|
||||
// the ancestor image in the test case.
|
||||
|
||||
let history = vec![
|
||||
(
|
||||
key,
|
||||
Lsn(0x20),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
|
||||
),
|
||||
(
|
||||
key,
|
||||
Lsn(0x30),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(";0x30")),
|
||||
),
|
||||
(
|
||||
key,
|
||||
Lsn(0x40),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(";0x40")),
|
||||
),
|
||||
(
|
||||
key,
|
||||
Lsn(0x70),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
|
||||
),
|
||||
];
|
||||
let res = tline
|
||||
.generate_key_retention(
|
||||
key,
|
||||
&history,
|
||||
Lsn(0x60),
|
||||
&[],
|
||||
3,
|
||||
Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let expected_res = KeyHistoryRetention {
|
||||
below_horizon: vec![(
|
||||
Lsn(0x60),
|
||||
KeyLogAtLsn(vec![(
|
||||
Lsn(0x60),
|
||||
Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40")), // use the ancestor image to reconstruct the page
|
||||
)]),
|
||||
)],
|
||||
above_horizon: KeyLogAtLsn(vec![(
|
||||
Lsn(0x70),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
|
||||
)]),
|
||||
};
|
||||
assert_eq!(res, expected_res);
|
||||
|
||||
let history = vec![
|
||||
(
|
||||
key,
|
||||
Lsn(0x20),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
|
||||
),
|
||||
(
|
||||
key,
|
||||
Lsn(0x40),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(";0x40")),
|
||||
),
|
||||
(
|
||||
key,
|
||||
Lsn(0x60),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(";0x60")),
|
||||
),
|
||||
(
|
||||
key,
|
||||
Lsn(0x70),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
|
||||
),
|
||||
];
|
||||
let res = tline
|
||||
.generate_key_retention(
|
||||
key,
|
||||
&history,
|
||||
Lsn(0x60),
|
||||
&[Lsn(0x30)],
|
||||
3,
|
||||
Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let expected_res = KeyHistoryRetention {
|
||||
below_horizon: vec![
|
||||
(
|
||||
Lsn(0x30),
|
||||
KeyLogAtLsn(vec![(
|
||||
Lsn(0x20),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
|
||||
)]),
|
||||
),
|
||||
(
|
||||
Lsn(0x60),
|
||||
KeyLogAtLsn(vec![(
|
||||
Lsn(0x60),
|
||||
Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x40;0x60")),
|
||||
)]),
|
||||
),
|
||||
],
|
||||
above_horizon: KeyLogAtLsn(vec![(
|
||||
Lsn(0x70),
|
||||
Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
|
||||
)]),
|
||||
};
|
||||
assert_eq!(res, expected_res);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -7674,6 +7930,10 @@ mod tests {
|
||||
];
|
||||
|
||||
let verify_result = || async {
|
||||
let gc_horizon = {
|
||||
let gc_info = tline.gc_info.read().unwrap();
|
||||
gc_info.cutoffs.time
|
||||
};
|
||||
for idx in 0..10 {
|
||||
assert_eq!(
|
||||
tline
|
||||
@@ -7684,7 +7944,7 @@ mod tests {
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get(get_key(idx as u32), Lsn(0x30), &ctx)
|
||||
.get(get_key(idx as u32), gc_horizon, &ctx)
|
||||
.await
|
||||
.unwrap(),
|
||||
&expected_result_at_gc_horizon[idx]
|
||||
@@ -7709,7 +7969,232 @@ mod tests {
|
||||
verify_result().await;
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
tline.compact_with_gc(&cancel, &ctx).await.unwrap();
|
||||
let mut dryrun_flags = EnumSet::new();
|
||||
dryrun_flags.insert(CompactFlags::DryRun);
|
||||
|
||||
tline
|
||||
.compact_with_gc(&cancel, dryrun_flags, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
// We expect layer map to be the same b/c the dry run flag, but we don't know whether there will be other background jobs
|
||||
// cleaning things up, and therefore, we don't do sanity checks on the layer map during unit tests.
|
||||
verify_result().await;
|
||||
|
||||
tline
|
||||
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
verify_result().await;
|
||||
|
||||
// compact again
|
||||
tline
|
||||
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
verify_result().await;
|
||||
|
||||
// increase GC horizon and compact again
|
||||
{
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
guard.cutoffs.time = Lsn(0x38);
|
||||
guard.cutoffs.space = Lsn(0x38);
|
||||
}
|
||||
tline
|
||||
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
verify_result().await; // no wals between 0x30 and 0x38, so we should obtain the same result
|
||||
|
||||
// not increasing the GC horizon and compact again
|
||||
tline
|
||||
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
verify_result().await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
fn get_key(id: u32) -> Key {
|
||||
let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
|
||||
key.field6 = id;
|
||||
key
|
||||
}
|
||||
|
||||
let img_layer = (0..10)
|
||||
.map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
|
||||
.collect_vec();
|
||||
|
||||
let delta1 = vec![
|
||||
(
|
||||
get_key(1),
|
||||
Lsn(0x20),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
|
||||
),
|
||||
(
|
||||
get_key(2),
|
||||
Lsn(0x30),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
|
||||
),
|
||||
(
|
||||
get_key(3),
|
||||
Lsn(0x28),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x28")),
|
||||
),
|
||||
(
|
||||
get_key(3),
|
||||
Lsn(0x30),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
|
||||
),
|
||||
(
|
||||
get_key(3),
|
||||
Lsn(0x40),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x40")),
|
||||
),
|
||||
];
|
||||
let delta2 = vec![
|
||||
(
|
||||
get_key(5),
|
||||
Lsn(0x20),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
|
||||
),
|
||||
(
|
||||
get_key(6),
|
||||
Lsn(0x20),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
|
||||
),
|
||||
];
|
||||
let delta3 = vec![
|
||||
(
|
||||
get_key(8),
|
||||
Lsn(0x48),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
|
||||
),
|
||||
(
|
||||
get_key(9),
|
||||
Lsn(0x48),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
|
||||
),
|
||||
];
|
||||
|
||||
let parent_tline = tenant
|
||||
.create_test_timeline_with_layers(
|
||||
TIMELINE_ID,
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
vec![], // delta layers
|
||||
vec![(Lsn(0x18), img_layer)], // image layers
|
||||
Lsn(0x18),
|
||||
)
|
||||
.await?;
|
||||
|
||||
parent_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10)));
|
||||
|
||||
let branch_tline = tenant
|
||||
.branch_timeline_test_with_layers(
|
||||
&parent_tline,
|
||||
NEW_TIMELINE_ID,
|
||||
Some(Lsn(0x18)),
|
||||
&ctx,
|
||||
vec![
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
|
||||
], // delta layers
|
||||
vec![], // image layers
|
||||
Lsn(0x50),
|
||||
)
|
||||
.await?;
|
||||
|
||||
branch_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10)));
|
||||
|
||||
{
|
||||
// Update GC info
|
||||
let mut guard = parent_tline.gc_info.write().unwrap();
|
||||
*guard = GcInfo {
|
||||
retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id)],
|
||||
cutoffs: GcCutoffs {
|
||||
time: Lsn(0x10),
|
||||
space: Lsn(0x10),
|
||||
},
|
||||
leases: Default::default(),
|
||||
within_ancestor_pitr: false,
|
||||
};
|
||||
}
|
||||
|
||||
{
|
||||
// Update GC info
|
||||
let mut guard = branch_tline.gc_info.write().unwrap();
|
||||
*guard = GcInfo {
|
||||
retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id)],
|
||||
cutoffs: GcCutoffs {
|
||||
time: Lsn(0x50),
|
||||
space: Lsn(0x50),
|
||||
},
|
||||
leases: Default::default(),
|
||||
within_ancestor_pitr: false,
|
||||
};
|
||||
}
|
||||
|
||||
let expected_result_at_gc_horizon = [
|
||||
Bytes::from_static(b"value 0@0x10"),
|
||||
Bytes::from_static(b"value 1@0x10@0x20"),
|
||||
Bytes::from_static(b"value 2@0x10@0x30"),
|
||||
Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"),
|
||||
Bytes::from_static(b"value 4@0x10"),
|
||||
Bytes::from_static(b"value 5@0x10@0x20"),
|
||||
Bytes::from_static(b"value 6@0x10@0x20"),
|
||||
Bytes::from_static(b"value 7@0x10"),
|
||||
Bytes::from_static(b"value 8@0x10@0x48"),
|
||||
Bytes::from_static(b"value 9@0x10@0x48"),
|
||||
];
|
||||
|
||||
let expected_result_at_lsn_40 = [
|
||||
Bytes::from_static(b"value 0@0x10"),
|
||||
Bytes::from_static(b"value 1@0x10@0x20"),
|
||||
Bytes::from_static(b"value 2@0x10@0x30"),
|
||||
Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"),
|
||||
Bytes::from_static(b"value 4@0x10"),
|
||||
Bytes::from_static(b"value 5@0x10@0x20"),
|
||||
Bytes::from_static(b"value 6@0x10@0x20"),
|
||||
Bytes::from_static(b"value 7@0x10"),
|
||||
Bytes::from_static(b"value 8@0x10"),
|
||||
Bytes::from_static(b"value 9@0x10"),
|
||||
];
|
||||
|
||||
let verify_result = || async {
|
||||
for idx in 0..10 {
|
||||
assert_eq!(
|
||||
branch_tline
|
||||
.get(get_key(idx as u32), Lsn(0x50), &ctx)
|
||||
.await
|
||||
.unwrap(),
|
||||
&expected_result_at_gc_horizon[idx]
|
||||
);
|
||||
assert_eq!(
|
||||
branch_tline
|
||||
.get(get_key(idx as u32), Lsn(0x40), &ctx)
|
||||
.await
|
||||
.unwrap(),
|
||||
&expected_result_at_lsn_40[idx]
|
||||
);
|
||||
}
|
||||
};
|
||||
|
||||
verify_result().await;
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
branch_tline
|
||||
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
verify_result().await;
|
||||
|
||||
|
||||
213
pageserver/src/tenant/gc_block.rs
Normal file
213
pageserver/src/tenant/gc_block.rs
Normal file
@@ -0,0 +1,213 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use utils::id::TimelineId;
|
||||
|
||||
use super::remote_timeline_client::index::GcBlockingReason;
|
||||
|
||||
type Storage = HashMap<TimelineId, enumset::EnumSet<GcBlockingReason>>;
|
||||
|
||||
#[derive(Default)]
|
||||
pub(crate) struct GcBlock {
|
||||
/// The timelines which have current reasons to block gc.
|
||||
///
|
||||
/// LOCK ORDER: this is held locked while scheduling the next index_part update. This is done
|
||||
/// to keep the this field up to date with RemoteTimelineClient `upload_queue.dirty`.
|
||||
reasons: std::sync::Mutex<Storage>,
|
||||
blocking: tokio::sync::Mutex<()>,
|
||||
}
|
||||
|
||||
impl GcBlock {
|
||||
/// Start another gc iteration.
|
||||
///
|
||||
/// Returns a guard to be held for the duration of gc iteration to allow synchronizing with
|
||||
/// it's ending, or if not currently possible, a value describing the reasons why not.
|
||||
///
|
||||
/// Cancellation safe.
|
||||
pub(super) async fn start(&self) -> Result<Guard<'_>, BlockingReasons> {
|
||||
let reasons = {
|
||||
let g = self.reasons.lock().unwrap();
|
||||
|
||||
// TODO: the assumption is that this method gets called periodically. in prod, we use 1h, in
|
||||
// tests, we use everything. we should warn if the gc has been consecutively blocked
|
||||
// for more than 1h (within single tenant session?).
|
||||
BlockingReasons::clean_and_summarize(g)
|
||||
};
|
||||
|
||||
if let Some(reasons) = reasons {
|
||||
Err(reasons)
|
||||
} else {
|
||||
Ok(Guard {
|
||||
_inner: self.blocking.lock().await,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn summary(&self) -> Option<BlockingReasons> {
|
||||
let g = self.reasons.lock().unwrap();
|
||||
|
||||
BlockingReasons::summarize(&g)
|
||||
}
|
||||
|
||||
/// Start blocking gc for this one timeline for the given reason.
|
||||
///
|
||||
/// This is not a guard based API but instead it mimics set API. The returned future will not
|
||||
/// resolve until an existing gc round has completed.
|
||||
///
|
||||
/// Returns true if this block was new, false if gc was already blocked for this reason.
|
||||
///
|
||||
/// Cancellation safe: cancelling after first poll will keep the reason to block gc, but will
|
||||
/// keep the gc blocking reason.
|
||||
pub(crate) async fn insert(
|
||||
&self,
|
||||
timeline: &super::Timeline,
|
||||
reason: GcBlockingReason,
|
||||
) -> anyhow::Result<bool> {
|
||||
let (added, uploaded) = {
|
||||
let mut g = self.reasons.lock().unwrap();
|
||||
let set = g.entry(timeline.timeline_id).or_default();
|
||||
let added = set.insert(reason);
|
||||
|
||||
// LOCK ORDER: intentionally hold the lock, see self.reasons.
|
||||
let uploaded = timeline
|
||||
.remote_client
|
||||
.schedule_insert_gc_block_reason(reason)?;
|
||||
|
||||
(added, uploaded)
|
||||
};
|
||||
|
||||
uploaded.await?;
|
||||
|
||||
// ensure that any ongoing gc iteration has completed
|
||||
drop(self.blocking.lock().await);
|
||||
|
||||
Ok(added)
|
||||
}
|
||||
|
||||
/// Remove blocking gc for this one timeline and the given reason.
|
||||
pub(crate) async fn remove(
|
||||
&self,
|
||||
timeline: &super::Timeline,
|
||||
reason: GcBlockingReason,
|
||||
) -> anyhow::Result<()> {
|
||||
use std::collections::hash_map::Entry;
|
||||
|
||||
super::span::debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
let (remaining_blocks, uploaded) = {
|
||||
let mut g = self.reasons.lock().unwrap();
|
||||
match g.entry(timeline.timeline_id) {
|
||||
Entry::Occupied(mut oe) => {
|
||||
let set = oe.get_mut();
|
||||
set.remove(reason);
|
||||
if set.is_empty() {
|
||||
oe.remove();
|
||||
}
|
||||
}
|
||||
Entry::Vacant(_) => {
|
||||
// we must still do the index_part.json update regardless, in case we had earlier
|
||||
// been cancelled
|
||||
}
|
||||
}
|
||||
|
||||
let remaining_blocks = g.len();
|
||||
|
||||
// LOCK ORDER: intentionally hold the lock while scheduling; see self.reasons
|
||||
let uploaded = timeline
|
||||
.remote_client
|
||||
.schedule_remove_gc_block_reason(reason)?;
|
||||
|
||||
(remaining_blocks, uploaded)
|
||||
};
|
||||
uploaded.await?;
|
||||
|
||||
// no need to synchronize with gc iteration again
|
||||
|
||||
if remaining_blocks > 0 {
|
||||
tracing::info!(remaining_blocks, removed=?reason, "gc blocking removed, but gc remains blocked");
|
||||
} else {
|
||||
tracing::info!("gc is now unblocked for the tenant");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn before_delete(&self, timeline: &super::Timeline) {
|
||||
let unblocked = {
|
||||
let mut g = self.reasons.lock().unwrap();
|
||||
if g.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
g.remove(&timeline.timeline_id);
|
||||
|
||||
BlockingReasons::clean_and_summarize(g).is_none()
|
||||
};
|
||||
|
||||
if unblocked {
|
||||
tracing::info!("gc is now unblocked following deletion");
|
||||
}
|
||||
}
|
||||
|
||||
/// Initialize with the non-deleted timelines of this tenant.
|
||||
pub(crate) fn set_scanned(&self, scanned: Storage) {
|
||||
let mut g = self.reasons.lock().unwrap();
|
||||
assert!(g.is_empty());
|
||||
g.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty()));
|
||||
|
||||
if let Some(reasons) = BlockingReasons::clean_and_summarize(g) {
|
||||
tracing::info!(summary=?reasons, "initialized with gc blocked");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) struct Guard<'a> {
|
||||
_inner: tokio::sync::MutexGuard<'a, ()>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct BlockingReasons {
|
||||
timelines: usize,
|
||||
reasons: enumset::EnumSet<GcBlockingReason>,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for BlockingReasons {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"{} timelines block for {:?}",
|
||||
self.timelines, self.reasons
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl BlockingReasons {
|
||||
fn clean_and_summarize(mut g: std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
|
||||
let mut reasons = enumset::EnumSet::empty();
|
||||
g.retain(|_key, value| {
|
||||
reasons = reasons.union(*value);
|
||||
!value.is_empty()
|
||||
});
|
||||
if !g.is_empty() {
|
||||
Some(BlockingReasons {
|
||||
timelines: g.len(),
|
||||
reasons,
|
||||
})
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
fn summarize(g: &std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
|
||||
if g.is_empty() {
|
||||
None
|
||||
} else {
|
||||
let reasons = g
|
||||
.values()
|
||||
.fold(enumset::EnumSet::empty(), |acc, next| acc.union(*next));
|
||||
Some(BlockingReasons {
|
||||
timelines: g.len(),
|
||||
reasons,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -51,7 +51,8 @@ use crate::keyspace::KeyPartitioning;
|
||||
use crate::repository::Key;
|
||||
use crate::tenant::storage_layer::InMemoryLayer;
|
||||
use anyhow::Result;
|
||||
use pageserver_api::keyspace::KeySpaceAccum;
|
||||
use pageserver_api::keyspace::{KeySpace, KeySpaceAccum};
|
||||
use range_set_blaze::{CheckSortedDisjoint, RangeSetBlaze};
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::iter::Peekable;
|
||||
use std::ops::Range;
|
||||
@@ -61,7 +62,7 @@ use utils::lsn::Lsn;
|
||||
use historic_layer_coverage::BufferedHistoricLayerCoverage;
|
||||
pub use historic_layer_coverage::LayerKey;
|
||||
|
||||
use super::storage_layer::PersistentLayerDesc;
|
||||
use super::storage_layer::{LayerVisibilityHint, PersistentLayerDesc};
|
||||
|
||||
///
|
||||
/// LayerMap tracks what layers exist on a timeline.
|
||||
@@ -871,11 +872,183 @@ impl LayerMap {
|
||||
println!("End dump LayerMap");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// `read_points` represent the tip of a timeline and any branch points, i.e. the places
|
||||
/// where we expect to serve reads.
|
||||
///
|
||||
/// This function is O(N) and should be called infrequently. The caller is responsible for
|
||||
/// looking up and updating the Layer objects for these layer descriptors.
|
||||
pub fn get_visibility(
|
||||
&self,
|
||||
mut read_points: Vec<Lsn>,
|
||||
) -> (
|
||||
Vec<(Arc<PersistentLayerDesc>, LayerVisibilityHint)>,
|
||||
KeySpace,
|
||||
) {
|
||||
// This is like a KeySpace, but this type is intended for efficient unions with image layer ranges, whereas
|
||||
// KeySpace is intended to be composed statically and iterated over.
|
||||
struct KeyShadow {
|
||||
// Map of range start to range end
|
||||
inner: RangeSetBlaze<i128>,
|
||||
}
|
||||
|
||||
impl KeyShadow {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
inner: Default::default(),
|
||||
}
|
||||
}
|
||||
|
||||
fn contains(&self, range: Range<Key>) -> bool {
|
||||
let range_incl = range.start.to_i128()..=range.end.to_i128() - 1;
|
||||
self.inner.is_superset(&RangeSetBlaze::from_sorted_disjoint(
|
||||
CheckSortedDisjoint::from([range_incl]),
|
||||
))
|
||||
}
|
||||
|
||||
/// Add the input range to the keys covered by self.
|
||||
///
|
||||
/// Return true if inserting this range covered some keys that were previously not covered
|
||||
fn cover(&mut self, insert: Range<Key>) -> bool {
|
||||
let range_incl = insert.start.to_i128()..=insert.end.to_i128() - 1;
|
||||
self.inner.ranges_insert(range_incl)
|
||||
}
|
||||
|
||||
fn reset(&mut self) {
|
||||
self.inner = Default::default();
|
||||
}
|
||||
|
||||
fn to_keyspace(&self) -> KeySpace {
|
||||
let mut accum = KeySpaceAccum::new();
|
||||
for range_incl in self.inner.ranges() {
|
||||
let range = Range {
|
||||
start: Key::from_i128(*range_incl.start()),
|
||||
end: Key::from_i128(range_incl.end() + 1),
|
||||
};
|
||||
accum.add_range(range)
|
||||
}
|
||||
|
||||
accum.to_keyspace()
|
||||
}
|
||||
}
|
||||
|
||||
// The 'shadow' will be updated as we sweep through the layers: an image layer subtracts from the shadow,
|
||||
// and a ReadPoint
|
||||
read_points.sort_by_key(|rp| rp.0);
|
||||
let mut shadow = KeyShadow::new();
|
||||
|
||||
// We will interleave all our read points and layers into a sorted collection
|
||||
enum Item {
|
||||
ReadPoint { lsn: Lsn },
|
||||
Layer(Arc<PersistentLayerDesc>),
|
||||
}
|
||||
|
||||
let mut items = Vec::with_capacity(self.historic.len() + read_points.len());
|
||||
items.extend(self.iter_historic_layers().map(Item::Layer));
|
||||
items.extend(
|
||||
read_points
|
||||
.into_iter()
|
||||
.map(|rp| Item::ReadPoint { lsn: rp }),
|
||||
);
|
||||
|
||||
// Ordering: we want to iterate like this:
|
||||
// 1. Highest LSNs first
|
||||
// 2. Consider images before deltas if they end at the same LSNs (images cover deltas)
|
||||
// 3. Consider ReadPoints before image layers if they're at the same LSN (readpoints make that image visible)
|
||||
items.sort_by_key(|item| {
|
||||
std::cmp::Reverse(match item {
|
||||
Item::Layer(layer) => {
|
||||
if layer.is_delta() {
|
||||
(Lsn(layer.get_lsn_range().end.0 - 1), 0)
|
||||
} else {
|
||||
(layer.image_layer_lsn(), 1)
|
||||
}
|
||||
}
|
||||
Item::ReadPoint { lsn } => (*lsn, 2),
|
||||
})
|
||||
});
|
||||
|
||||
let mut results = Vec::with_capacity(self.historic.len());
|
||||
|
||||
let mut maybe_covered_deltas: Vec<Arc<PersistentLayerDesc>> = Vec::new();
|
||||
|
||||
for item in items {
|
||||
let (reached_lsn, is_readpoint) = match &item {
|
||||
Item::ReadPoint { lsn } => (lsn, true),
|
||||
Item::Layer(layer) => (&layer.lsn_range.start, false),
|
||||
};
|
||||
maybe_covered_deltas.retain(|d| {
|
||||
if *reached_lsn >= d.lsn_range.start && is_readpoint {
|
||||
// We encountered a readpoint within the delta layer: it is visible
|
||||
|
||||
results.push((d.clone(), LayerVisibilityHint::Visible));
|
||||
false
|
||||
} else if *reached_lsn < d.lsn_range.start {
|
||||
// We passed the layer's range without encountering a read point: it is not visible
|
||||
results.push((d.clone(), LayerVisibilityHint::Covered));
|
||||
false
|
||||
} else {
|
||||
// We're still in the delta layer: continue iterating
|
||||
true
|
||||
}
|
||||
});
|
||||
|
||||
match item {
|
||||
Item::ReadPoint { lsn: _lsn } => {
|
||||
// TODO: propagate the child timeline's shadow from their own run of this function, so that we don't have
|
||||
// to assume that the whole key range is visible at the branch point.
|
||||
shadow.reset();
|
||||
}
|
||||
Item::Layer(layer) => {
|
||||
let visibility = if layer.is_delta() {
|
||||
if shadow.contains(layer.get_key_range()) {
|
||||
// If a layer isn't visible based on current state, we must defer deciding whether
|
||||
// it is truly not visible until we have advanced past the delta's range: we might
|
||||
// encounter another branch point within this delta layer's LSN range.
|
||||
maybe_covered_deltas.push(layer);
|
||||
continue;
|
||||
} else {
|
||||
LayerVisibilityHint::Visible
|
||||
}
|
||||
} else {
|
||||
let modified = shadow.cover(layer.get_key_range());
|
||||
if modified {
|
||||
// An image layer in a region which wasn't fully covered yet: this layer is visible, but layers below it will be covered
|
||||
LayerVisibilityHint::Visible
|
||||
} else {
|
||||
// An image layer in a region that was already covered
|
||||
LayerVisibilityHint::Covered
|
||||
}
|
||||
};
|
||||
|
||||
results.push((layer, visibility));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Drain any remaining maybe_covered deltas
|
||||
results.extend(
|
||||
maybe_covered_deltas
|
||||
.into_iter()
|
||||
.map(|d| (d, LayerVisibilityHint::Covered)),
|
||||
);
|
||||
|
||||
(results, shadow.to_keyspace())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use crate::tenant::{storage_layer::LayerName, IndexPart};
|
||||
use pageserver_api::{
|
||||
key::DBDIR_KEY,
|
||||
keyspace::{KeySpace, KeySpaceRandomAccum},
|
||||
};
|
||||
use std::{collections::HashMap, path::PathBuf};
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
shard::TenantShardId,
|
||||
};
|
||||
|
||||
use super::*;
|
||||
|
||||
@@ -1002,4 +1175,299 @@ mod tests {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn layer_visibility_basic() {
|
||||
// A simple synthetic input, as a smoke test.
|
||||
let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
|
||||
let timeline_id = TimelineId::generate();
|
||||
let mut layer_map = LayerMap::default();
|
||||
let mut updates = layer_map.batch_update();
|
||||
|
||||
const FAKE_LAYER_SIZE: u64 = 1024;
|
||||
|
||||
let inject_delta = |updates: &mut BatchedUpdates,
|
||||
key_start: i128,
|
||||
key_end: i128,
|
||||
lsn_start: u64,
|
||||
lsn_end: u64| {
|
||||
let desc = PersistentLayerDesc::new_delta(
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
Range {
|
||||
start: Key::from_i128(key_start),
|
||||
end: Key::from_i128(key_end),
|
||||
},
|
||||
Range {
|
||||
start: Lsn(lsn_start),
|
||||
end: Lsn(lsn_end),
|
||||
},
|
||||
1024,
|
||||
);
|
||||
updates.insert_historic(desc.clone());
|
||||
desc
|
||||
};
|
||||
|
||||
let inject_image =
|
||||
|updates: &mut BatchedUpdates, key_start: i128, key_end: i128, lsn: u64| {
|
||||
let desc = PersistentLayerDesc::new_img(
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
Range {
|
||||
start: Key::from_i128(key_start),
|
||||
end: Key::from_i128(key_end),
|
||||
},
|
||||
Lsn(lsn),
|
||||
FAKE_LAYER_SIZE,
|
||||
);
|
||||
updates.insert_historic(desc.clone());
|
||||
desc
|
||||
};
|
||||
|
||||
//
|
||||
// Construct our scenario: the following lines go in backward-LSN order, constructing the various scenarios
|
||||
// we expect to handle. You can follow these examples through in the same order as they would be processed
|
||||
// by the function under test.
|
||||
//
|
||||
|
||||
let mut read_points = vec![Lsn(1000)];
|
||||
|
||||
// A delta ahead of any image layer
|
||||
let ahead_layer = inject_delta(&mut updates, 10, 20, 101, 110);
|
||||
|
||||
// An image layer is visible and covers some layers beneath itself
|
||||
let visible_covering_img = inject_image(&mut updates, 5, 25, 99);
|
||||
|
||||
// A delta layer covered by the image layer: should be covered
|
||||
let covered_delta = inject_delta(&mut updates, 10, 20, 90, 100);
|
||||
|
||||
// A delta layer partially covered by an image layer: should be visible
|
||||
let partially_covered_delta = inject_delta(&mut updates, 1, 7, 90, 100);
|
||||
|
||||
// A delta layer not covered by an image layer: should be visible
|
||||
let not_covered_delta = inject_delta(&mut updates, 1, 4, 90, 100);
|
||||
|
||||
// An image layer covered by the image layer above: should be covered
|
||||
let covered_image = inject_image(&mut updates, 10, 20, 89);
|
||||
|
||||
// An image layer partially covered by an image layer: should be visible
|
||||
let partially_covered_image = inject_image(&mut updates, 1, 7, 89);
|
||||
|
||||
// An image layer not covered by an image layer: should be visible
|
||||
let not_covered_image = inject_image(&mut updates, 1, 4, 89);
|
||||
|
||||
// A read point: this will make subsequent layers below here visible, even if there are
|
||||
// more recent layers covering them.
|
||||
read_points.push(Lsn(80));
|
||||
|
||||
// A delta layer covered by an earlier image layer, but visible to a readpoint below that covering layer
|
||||
let covered_delta_below_read_point = inject_delta(&mut updates, 10, 20, 70, 79);
|
||||
|
||||
// A delta layer whose end LSN is covered, but where a read point is present partway through its LSN range:
|
||||
// the read point should make it visible, even though its end LSN is covered
|
||||
let covering_img_between_read_points = inject_image(&mut updates, 10, 20, 69);
|
||||
let covered_delta_between_read_points = inject_delta(&mut updates, 10, 15, 67, 69);
|
||||
read_points.push(Lsn(65));
|
||||
let covered_delta_intersects_read_point = inject_delta(&mut updates, 15, 20, 60, 69);
|
||||
|
||||
let visible_img_after_last_read_point = inject_image(&mut updates, 10, 20, 65);
|
||||
|
||||
updates.flush();
|
||||
|
||||
let (layer_visibilities, shadow) = layer_map.get_visibility(read_points);
|
||||
let layer_visibilities = layer_visibilities.into_iter().collect::<HashMap<_, _>>();
|
||||
|
||||
assert_eq!(
|
||||
layer_visibilities.get(&ahead_layer),
|
||||
Some(&LayerVisibilityHint::Visible)
|
||||
);
|
||||
assert_eq!(
|
||||
layer_visibilities.get(&visible_covering_img),
|
||||
Some(&LayerVisibilityHint::Visible)
|
||||
);
|
||||
assert_eq!(
|
||||
layer_visibilities.get(&covered_delta),
|
||||
Some(&LayerVisibilityHint::Covered)
|
||||
);
|
||||
assert_eq!(
|
||||
layer_visibilities.get(&partially_covered_delta),
|
||||
Some(&LayerVisibilityHint::Visible)
|
||||
);
|
||||
assert_eq!(
|
||||
layer_visibilities.get(¬_covered_delta),
|
||||
Some(&LayerVisibilityHint::Visible)
|
||||
);
|
||||
assert_eq!(
|
||||
layer_visibilities.get(&covered_image),
|
||||
Some(&LayerVisibilityHint::Covered)
|
||||
);
|
||||
assert_eq!(
|
||||
layer_visibilities.get(&partially_covered_image),
|
||||
Some(&LayerVisibilityHint::Visible)
|
||||
);
|
||||
assert_eq!(
|
||||
layer_visibilities.get(¬_covered_image),
|
||||
Some(&LayerVisibilityHint::Visible)
|
||||
);
|
||||
assert_eq!(
|
||||
layer_visibilities.get(&covered_delta_below_read_point),
|
||||
Some(&LayerVisibilityHint::Visible)
|
||||
);
|
||||
assert_eq!(
|
||||
layer_visibilities.get(&covering_img_between_read_points),
|
||||
Some(&LayerVisibilityHint::Visible)
|
||||
);
|
||||
assert_eq!(
|
||||
layer_visibilities.get(&covered_delta_between_read_points),
|
||||
Some(&LayerVisibilityHint::Covered)
|
||||
);
|
||||
assert_eq!(
|
||||
layer_visibilities.get(&covered_delta_intersects_read_point),
|
||||
Some(&LayerVisibilityHint::Visible)
|
||||
);
|
||||
assert_eq!(
|
||||
layer_visibilities.get(&visible_img_after_last_read_point),
|
||||
Some(&LayerVisibilityHint::Visible)
|
||||
);
|
||||
|
||||
// Shadow should include all the images below the last read point
|
||||
let expected_shadow = KeySpace {
|
||||
ranges: vec![Key::from_i128(10)..Key::from_i128(20)],
|
||||
};
|
||||
assert_eq!(shadow, expected_shadow);
|
||||
}
|
||||
|
||||
fn fixture_path(relative: &str) -> PathBuf {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(relative)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn layer_visibility_realistic() {
|
||||
// Load a large example layermap
|
||||
let index_raw = std::fs::read_to_string(fixture_path(
|
||||
"test_data/indices/mixed_workload/index_part.json",
|
||||
))
|
||||
.unwrap();
|
||||
let index: IndexPart = serde_json::from_str::<IndexPart>(&index_raw).unwrap();
|
||||
|
||||
let tenant_id = TenantId::generate();
|
||||
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
|
||||
let timeline_id = TimelineId::generate();
|
||||
|
||||
let mut layer_map = LayerMap::default();
|
||||
let mut updates = layer_map.batch_update();
|
||||
for (layer_name, layer_metadata) in index.layer_metadata {
|
||||
let layer_desc = match layer_name {
|
||||
LayerName::Image(layer_name) => PersistentLayerDesc {
|
||||
key_range: layer_name.key_range.clone(),
|
||||
lsn_range: layer_name.lsn_as_range(),
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
is_delta: false,
|
||||
file_size: layer_metadata.file_size,
|
||||
},
|
||||
LayerName::Delta(layer_name) => PersistentLayerDesc {
|
||||
key_range: layer_name.key_range,
|
||||
lsn_range: layer_name.lsn_range,
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
is_delta: true,
|
||||
file_size: layer_metadata.file_size,
|
||||
},
|
||||
};
|
||||
updates.insert_historic(layer_desc);
|
||||
}
|
||||
updates.flush();
|
||||
|
||||
let read_points = vec![index.metadata.disk_consistent_lsn()];
|
||||
let (layer_visibilities, shadow) = layer_map.get_visibility(read_points);
|
||||
for (layer_desc, visibility) in &layer_visibilities {
|
||||
tracing::info!("{layer_desc:?}: {visibility:?}");
|
||||
eprintln!("{layer_desc:?}: {visibility:?}");
|
||||
}
|
||||
|
||||
// The shadow should be non-empty, since there were some image layers
|
||||
assert!(!shadow.ranges.is_empty());
|
||||
|
||||
// At least some layers should be marked covered
|
||||
assert!(layer_visibilities
|
||||
.iter()
|
||||
.any(|i| matches!(i.1, LayerVisibilityHint::Covered)));
|
||||
|
||||
let layer_visibilities = layer_visibilities.into_iter().collect::<HashMap<_, _>>();
|
||||
|
||||
// Brute force validation: a layer should be marked covered if and only if there are image layers above it in LSN order which cover it
|
||||
for (layer_desc, visible) in &layer_visibilities {
|
||||
let mut coverage = KeySpaceRandomAccum::new();
|
||||
let mut covered_by = Vec::new();
|
||||
|
||||
for other_layer in layer_map.iter_historic_layers() {
|
||||
if &other_layer == layer_desc {
|
||||
continue;
|
||||
}
|
||||
if !other_layer.is_delta()
|
||||
&& other_layer.image_layer_lsn() >= Lsn(layer_desc.get_lsn_range().end.0 - 1)
|
||||
&& other_layer.key_range.start <= layer_desc.key_range.end
|
||||
&& layer_desc.key_range.start <= other_layer.key_range.end
|
||||
{
|
||||
coverage.add_range(other_layer.get_key_range());
|
||||
covered_by.push((*other_layer).clone());
|
||||
}
|
||||
}
|
||||
let coverage = coverage.to_keyspace();
|
||||
|
||||
let expect_visible = if coverage.ranges.len() == 1
|
||||
&& coverage.contains(&layer_desc.key_range.start)
|
||||
&& coverage.contains(&Key::from_i128(layer_desc.key_range.end.to_i128() - 1))
|
||||
{
|
||||
LayerVisibilityHint::Covered
|
||||
} else {
|
||||
LayerVisibilityHint::Visible
|
||||
};
|
||||
|
||||
if expect_visible != *visible {
|
||||
eprintln!(
|
||||
"Layer {}..{} @ {}..{} (delta={}) is {visible:?}, should be {expect_visible:?}",
|
||||
layer_desc.key_range.start,
|
||||
layer_desc.key_range.end,
|
||||
layer_desc.lsn_range.start,
|
||||
layer_desc.lsn_range.end,
|
||||
layer_desc.is_delta()
|
||||
);
|
||||
if expect_visible == LayerVisibilityHint::Covered {
|
||||
eprintln!("Covered by:");
|
||||
for other in covered_by {
|
||||
eprintln!(
|
||||
" {}..{} @ {}",
|
||||
other.get_key_range().start,
|
||||
other.get_key_range().end,
|
||||
other.image_layer_lsn()
|
||||
);
|
||||
}
|
||||
if let Some(range) = coverage.ranges.first() {
|
||||
eprintln!(
|
||||
"Total coverage from contributing layers: {}..{}",
|
||||
range.start, range.end
|
||||
);
|
||||
} else {
|
||||
eprintln!(
|
||||
"Total coverage from contributing layers: {:?}",
|
||||
coverage.ranges
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
assert_eq!(expect_visible, *visible);
|
||||
}
|
||||
|
||||
// Sanity: the layer that holds latest data for the DBDIR key should always be visible
|
||||
// (just using this key as a key that will always exist for any layermap fixture)
|
||||
let dbdir_layer = layer_map
|
||||
.search(DBDIR_KEY, index.metadata.disk_consistent_lsn())
|
||||
.unwrap();
|
||||
assert!(matches!(
|
||||
layer_visibilities.get(&dbdir_layer.layer).unwrap(),
|
||||
LayerVisibilityHint::Visible
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -521,6 +521,10 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
|
||||
|
||||
Ok(&self.historic_coverage)
|
||||
}
|
||||
|
||||
pub(crate) fn len(&self) -> usize {
|
||||
self.layers.len()
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -55,7 +55,7 @@ use utils::id::{TenantId, TimelineId};
|
||||
use super::remote_timeline_client::remote_tenant_path;
|
||||
use super::secondary::SecondaryTenant;
|
||||
use super::timeline::detach_ancestor::PreparedTimelineDetach;
|
||||
use super::TenantSharedResources;
|
||||
use super::{GlobalShutDown, TenantSharedResources};
|
||||
|
||||
/// For a tenant that appears in TenantsMap, it may either be
|
||||
/// - `Attached`: has a full Tenant object, is elegible to service
|
||||
@@ -116,8 +116,6 @@ pub(crate) enum ShardSelector {
|
||||
/// Only return the 0th shard, if it is present. If a non-0th shard is present,
|
||||
/// ignore it.
|
||||
Zero,
|
||||
/// Pick the first shard we find for the TenantId
|
||||
First,
|
||||
/// Pick the shard that holds this key
|
||||
Page(Key),
|
||||
/// The shard ID is known: pick the given shard
|
||||
@@ -667,17 +665,20 @@ pub async fn init_tenant_mgr(
|
||||
let tenant_dir_path = conf.tenant_path(&tenant_shard_id);
|
||||
let shard_identity = location_conf.shard;
|
||||
let slot = match location_conf.mode {
|
||||
LocationMode::Attached(attached_conf) => TenantSlot::Attached(tenant_spawn(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
&tenant_dir_path,
|
||||
resources.clone(),
|
||||
AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
|
||||
shard_identity,
|
||||
Some(init_order.clone()),
|
||||
SpawnMode::Lazy,
|
||||
&ctx,
|
||||
)),
|
||||
LocationMode::Attached(attached_conf) => TenantSlot::Attached(
|
||||
tenant_spawn(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
&tenant_dir_path,
|
||||
resources.clone(),
|
||||
AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
|
||||
shard_identity,
|
||||
Some(init_order.clone()),
|
||||
SpawnMode::Lazy,
|
||||
&ctx,
|
||||
)
|
||||
.expect("global shutdown during init_tenant_mgr cannot happen"),
|
||||
),
|
||||
LocationMode::Secondary(secondary_conf) => {
|
||||
info!(
|
||||
tenant_id = %tenant_shard_id.tenant_id,
|
||||
@@ -725,7 +726,7 @@ fn tenant_spawn(
|
||||
init_order: Option<InitializationOrder>,
|
||||
mode: SpawnMode,
|
||||
ctx: &RequestContext,
|
||||
) -> Arc<Tenant> {
|
||||
) -> Result<Arc<Tenant>, GlobalShutDown> {
|
||||
// All these conditions should have been satisfied by our caller: the tenant dir exists, is a well formed
|
||||
// path, and contains a configuration file. Assertions that do synchronous I/O are limited to debug mode
|
||||
// to avoid impacting prod runtime performance.
|
||||
@@ -1192,7 +1193,10 @@ impl TenantManager {
|
||||
None,
|
||||
spawn_mode,
|
||||
ctx,
|
||||
);
|
||||
)
|
||||
.map_err(|_: GlobalShutDown| {
|
||||
UpsertLocationError::Unavailable(TenantMapError::ShuttingDown)
|
||||
})?;
|
||||
|
||||
TenantSlot::Attached(tenant)
|
||||
}
|
||||
@@ -1313,7 +1317,7 @@ impl TenantManager {
|
||||
None,
|
||||
SpawnMode::Eager,
|
||||
ctx,
|
||||
);
|
||||
)?;
|
||||
|
||||
slot_guard.upsert(TenantSlot::Attached(tenant))?;
|
||||
|
||||
@@ -2047,7 +2051,7 @@ impl TenantManager {
|
||||
None,
|
||||
SpawnMode::Eager,
|
||||
ctx,
|
||||
);
|
||||
)?;
|
||||
|
||||
slot_guard.upsert(TenantSlot::Attached(tenant))?;
|
||||
|
||||
@@ -2088,7 +2092,6 @@ impl TenantManager {
|
||||
};
|
||||
|
||||
match selector {
|
||||
ShardSelector::First => return ShardResolveResult::Found(tenant.clone()),
|
||||
ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
|
||||
return ShardResolveResult::Found(tenant.clone())
|
||||
}
|
||||
@@ -2170,6 +2173,9 @@ pub(crate) enum GetActiveTenantError {
|
||||
/// never happen.
|
||||
#[error("Tenant is broken: {0}")]
|
||||
Broken(String),
|
||||
|
||||
#[error("reconnect to switch tenant id")]
|
||||
SwitchedTenant,
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
|
||||
@@ -800,6 +800,123 @@ impl RemoteTimelineClient {
|
||||
.context("wait completion")
|
||||
}
|
||||
|
||||
/// Adds a gc blocking reason for this timeline if one does not exist already.
|
||||
///
|
||||
/// A retryable step of timeline detach ancestor.
|
||||
///
|
||||
/// Returns a future which waits until the completion of the upload.
|
||||
pub(crate) fn schedule_insert_gc_block_reason(
|
||||
self: &Arc<Self>,
|
||||
reason: index::GcBlockingReason,
|
||||
) -> Result<impl std::future::Future<Output = Result<(), WaitCompletionError>>, NotInitialized>
|
||||
{
|
||||
let maybe_barrier = {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
|
||||
if let index::GcBlockingReason::DetachAncestor = reason {
|
||||
if upload_queue.dirty.metadata.ancestor_timeline().is_none() {
|
||||
drop(guard);
|
||||
panic!("cannot start detach ancestor if there is nothing to detach from");
|
||||
}
|
||||
}
|
||||
|
||||
let wanted = |x: Option<&index::GcBlocking>| x.is_some_and(|x| x.blocked_by(reason));
|
||||
|
||||
let current = upload_queue.dirty.gc_blocking.as_ref();
|
||||
let uploaded = upload_queue.clean.0.gc_blocking.as_ref();
|
||||
|
||||
match (current, uploaded) {
|
||||
(x, y) if wanted(x) && wanted(y) => None,
|
||||
(x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)),
|
||||
// Usual case: !wanted(x) && !wanted(y)
|
||||
//
|
||||
// Unusual: !wanted(x) && wanted(y) which means we have two processes waiting to
|
||||
// turn on and off some reason.
|
||||
(x, y) => {
|
||||
if !wanted(x) && wanted(y) {
|
||||
// this could be avoided by having external in-memory synchronization, like
|
||||
// timeline detach ancestor
|
||||
warn!(?reason, op="insert", "unexpected: two racing processes to enable and disable a gc blocking reason");
|
||||
}
|
||||
|
||||
// at this point, the metadata must always show that there is a parent
|
||||
upload_queue.dirty.gc_blocking = current
|
||||
.map(|x| x.with_reason(reason))
|
||||
.or_else(|| Some(index::GcBlocking::started_now_for(reason)));
|
||||
self.schedule_index_upload(upload_queue)?;
|
||||
Some(self.schedule_barrier0(upload_queue))
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
Ok(async move {
|
||||
if let Some(barrier) = maybe_barrier {
|
||||
Self::wait_completion0(barrier).await?;
|
||||
}
|
||||
Ok(())
|
||||
})
|
||||
}
|
||||
|
||||
/// Removes a gc blocking reason for this timeline if one exists.
|
||||
///
|
||||
/// A retryable step of timeline detach ancestor.
|
||||
///
|
||||
/// Returns a future which waits until the completion of the upload.
|
||||
pub(crate) fn schedule_remove_gc_block_reason(
|
||||
self: &Arc<Self>,
|
||||
reason: index::GcBlockingReason,
|
||||
) -> Result<impl std::future::Future<Output = Result<(), WaitCompletionError>>, NotInitialized>
|
||||
{
|
||||
let maybe_barrier = {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
|
||||
if let index::GcBlockingReason::DetachAncestor = reason {
|
||||
if !upload_queue
|
||||
.clean
|
||||
.0
|
||||
.lineage
|
||||
.is_detached_from_original_ancestor()
|
||||
{
|
||||
drop(guard);
|
||||
panic!("cannot complete timeline_ancestor_detach while not detached");
|
||||
}
|
||||
}
|
||||
|
||||
let wanted = |x: Option<&index::GcBlocking>| {
|
||||
x.is_none() || x.is_some_and(|b| !b.blocked_by(reason))
|
||||
};
|
||||
|
||||
let current = upload_queue.dirty.gc_blocking.as_ref();
|
||||
let uploaded = upload_queue.clean.0.gc_blocking.as_ref();
|
||||
|
||||
match (current, uploaded) {
|
||||
(x, y) if wanted(x) && wanted(y) => None,
|
||||
(x, y) if wanted(x) && !wanted(y) => Some(self.schedule_barrier0(upload_queue)),
|
||||
(x, y) => {
|
||||
if !wanted(x) && wanted(y) {
|
||||
warn!(?reason, op="remove", "unexpected: two racing processes to enable and disable a gc blocking reason (remove)");
|
||||
}
|
||||
|
||||
upload_queue.dirty.gc_blocking =
|
||||
current.as_ref().and_then(|x| x.without_reason(reason));
|
||||
assert!(wanted(upload_queue.dirty.gc_blocking.as_ref()));
|
||||
// FIXME: bogus ?
|
||||
self.schedule_index_upload(upload_queue)?;
|
||||
Some(self.schedule_barrier0(upload_queue))
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
Ok(async move {
|
||||
if let Some(barrier) = maybe_barrier {
|
||||
Self::wait_completion0(barrier).await?;
|
||||
}
|
||||
Ok(())
|
||||
})
|
||||
}
|
||||
|
||||
/// Launch an upload operation in the background; the file is added to be included in next
|
||||
/// `index_part.json` upload.
|
||||
pub(crate) fn schedule_layer_file_upload(
|
||||
@@ -1378,6 +1495,18 @@ impl RemoteTimelineClient {
|
||||
.dirty
|
||||
.layer_metadata
|
||||
.drain()
|
||||
.filter(|(_file_name, meta)| {
|
||||
// Filter out layers that belonged to an ancestor shard. Since we are deleting the whole timeline from
|
||||
// all shards anyway, we _could_ delete these, but
|
||||
// - it creates a potential race if other shards are still
|
||||
// using the layers while this shard deletes them.
|
||||
// - it means that if we rolled back the shard split, the ancestor shards would be in a state where
|
||||
// these timelines are present but corrupt (their index exists but some layers don't)
|
||||
//
|
||||
// These layers will eventually be cleaned up by the scrubber when it does physical GC.
|
||||
meta.shard.shard_number == self.tenant_shard_id.shard_number
|
||||
&& meta.shard.shard_count == self.tenant_shard_id.shard_count
|
||||
})
|
||||
.map(|(file_name, meta)| {
|
||||
remote_layer_path(
|
||||
&self.tenant_shard_id.tenant_id,
|
||||
|
||||
@@ -60,6 +60,9 @@ pub struct IndexPart {
|
||||
#[serde(default)]
|
||||
pub(crate) lineage: Lineage,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none", default)]
|
||||
pub(crate) gc_blocking: Option<GcBlocking>,
|
||||
|
||||
/// Describes the kind of aux files stored in the timeline.
|
||||
///
|
||||
/// The value is modified during file ingestion when the latest wanted value communicated via tenant config is applied if it is acceptable.
|
||||
@@ -85,10 +88,11 @@ impl IndexPart {
|
||||
/// - 6: last_aux_file_policy is added.
|
||||
/// - 7: metadata_bytes is no longer written, but still read
|
||||
/// - 8: added `archived_at`
|
||||
const LATEST_VERSION: usize = 8;
|
||||
/// - 9: +gc_blocking
|
||||
const LATEST_VERSION: usize = 9;
|
||||
|
||||
// Versions we may see when reading from a bucket.
|
||||
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8];
|
||||
pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7, 8, 9];
|
||||
|
||||
pub const FILE_NAME: &'static str = "index_part.json";
|
||||
|
||||
@@ -101,6 +105,7 @@ impl IndexPart {
|
||||
deleted_at: None,
|
||||
archived_at: None,
|
||||
lineage: Default::default(),
|
||||
gc_blocking: None,
|
||||
last_aux_file_policy: None,
|
||||
}
|
||||
}
|
||||
@@ -251,6 +256,64 @@ impl Lineage {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub(crate) struct GcBlocking {
|
||||
pub(crate) started_at: NaiveDateTime,
|
||||
pub(crate) reasons: enumset::EnumSet<GcBlockingReason>,
|
||||
}
|
||||
|
||||
#[derive(Debug, enumset::EnumSetType, serde::Serialize, serde::Deserialize)]
|
||||
#[enumset(serialize_repr = "list")]
|
||||
pub(crate) enum GcBlockingReason {
|
||||
Manual,
|
||||
DetachAncestor,
|
||||
}
|
||||
|
||||
impl GcBlocking {
|
||||
pub(super) fn started_now_for(reason: GcBlockingReason) -> Self {
|
||||
GcBlocking {
|
||||
started_at: chrono::Utc::now().naive_utc(),
|
||||
reasons: enumset::EnumSet::only(reason),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true if the given reason is one of the reasons why the gc is blocked.
|
||||
pub(crate) fn blocked_by(&self, reason: GcBlockingReason) -> bool {
|
||||
self.reasons.contains(reason)
|
||||
}
|
||||
|
||||
/// Returns a version of self with the given reason.
|
||||
pub(super) fn with_reason(&self, reason: GcBlockingReason) -> Self {
|
||||
assert!(!self.blocked_by(reason));
|
||||
let mut reasons = self.reasons;
|
||||
reasons.insert(reason);
|
||||
|
||||
Self {
|
||||
started_at: self.started_at,
|
||||
reasons,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a version of self without the given reason. Assumption is that if
|
||||
/// there are no more reasons, we can unblock the gc by returning `None`.
|
||||
pub(super) fn without_reason(&self, reason: GcBlockingReason) -> Option<Self> {
|
||||
assert!(self.blocked_by(reason));
|
||||
|
||||
if self.reasons.len() == 1 {
|
||||
None
|
||||
} else {
|
||||
let mut reasons = self.reasons;
|
||||
assert!(reasons.remove(reason));
|
||||
assert!(!reasons.is_empty());
|
||||
|
||||
Some(Self {
|
||||
started_at: self.started_at,
|
||||
reasons,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -292,6 +355,7 @@ mod tests {
|
||||
deleted_at: None,
|
||||
archived_at: None,
|
||||
lineage: Lineage::default(),
|
||||
gc_blocking: None,
|
||||
last_aux_file_policy: None,
|
||||
};
|
||||
|
||||
@@ -335,6 +399,7 @@ mod tests {
|
||||
deleted_at: None,
|
||||
archived_at: None,
|
||||
lineage: Lineage::default(),
|
||||
gc_blocking: None,
|
||||
last_aux_file_policy: None,
|
||||
};
|
||||
|
||||
@@ -379,6 +444,7 @@ mod tests {
|
||||
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
|
||||
archived_at: None,
|
||||
lineage: Lineage::default(),
|
||||
gc_blocking: None,
|
||||
last_aux_file_policy: None,
|
||||
};
|
||||
|
||||
@@ -426,6 +492,7 @@ mod tests {
|
||||
deleted_at: None,
|
||||
archived_at: None,
|
||||
lineage: Lineage::default(),
|
||||
gc_blocking: None,
|
||||
last_aux_file_policy: None,
|
||||
};
|
||||
|
||||
@@ -468,6 +535,7 @@ mod tests {
|
||||
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
|
||||
archived_at: None,
|
||||
lineage: Lineage::default(),
|
||||
gc_blocking: None,
|
||||
last_aux_file_policy: None,
|
||||
};
|
||||
|
||||
@@ -513,6 +581,7 @@ mod tests {
|
||||
reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
|
||||
original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
|
||||
},
|
||||
gc_blocking: None,
|
||||
last_aux_file_policy: None,
|
||||
};
|
||||
|
||||
@@ -563,6 +632,7 @@ mod tests {
|
||||
reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
|
||||
original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
|
||||
},
|
||||
gc_blocking: None,
|
||||
last_aux_file_policy: Some(AuxFilePolicy::V2),
|
||||
};
|
||||
|
||||
@@ -618,6 +688,7 @@ mod tests {
|
||||
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
|
||||
archived_at: None,
|
||||
lineage: Default::default(),
|
||||
gc_blocking: None,
|
||||
last_aux_file_policy: Default::default(),
|
||||
};
|
||||
|
||||
@@ -674,6 +745,7 @@ mod tests {
|
||||
deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
|
||||
archived_at: Some(parse_naive_datetime("2023-04-29T09:00:00.123000000")),
|
||||
lineage: Default::default(),
|
||||
gc_blocking: None,
|
||||
last_aux_file_policy: Default::default(),
|
||||
};
|
||||
|
||||
@@ -681,6 +753,68 @@ mod tests {
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn v9_indexpart_is_parsed() {
|
||||
let example = r#"{
|
||||
"version": 9,
|
||||
"layer_metadata":{
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
|
||||
},
|
||||
"disk_consistent_lsn":"0/16960E8",
|
||||
"metadata": {
|
||||
"disk_consistent_lsn": "0/16960E8",
|
||||
"prev_record_lsn": "0/1696070",
|
||||
"ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e",
|
||||
"ancestor_lsn": "0/0",
|
||||
"latest_gc_cutoff_lsn": "0/1696070",
|
||||
"initdb_lsn": "0/1696070",
|
||||
"pg_version": 14
|
||||
},
|
||||
"gc_blocking": {
|
||||
"started_at": "2024-07-19T09:00:00.123",
|
||||
"reasons": ["DetachAncestor"]
|
||||
}
|
||||
}"#;
|
||||
|
||||
let expected = IndexPart {
|
||||
version: 9,
|
||||
layer_metadata: HashMap::from([
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 25600000,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded()
|
||||
}),
|
||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
|
||||
file_size: 9007199254741001,
|
||||
generation: Generation::none(),
|
||||
shard: ShardIndex::unsharded()
|
||||
})
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata: TimelineMetadata::new(
|
||||
Lsn::from_str("0/16960E8").unwrap(),
|
||||
Some(Lsn::from_str("0/1696070").unwrap()),
|
||||
Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()),
|
||||
Lsn::INVALID,
|
||||
Lsn::from_str("0/1696070").unwrap(),
|
||||
Lsn::from_str("0/1696070").unwrap(),
|
||||
14,
|
||||
).with_recalculated_checksum().unwrap(),
|
||||
deleted_at: None,
|
||||
lineage: Default::default(),
|
||||
gc_blocking: Some(GcBlocking {
|
||||
started_at: parse_naive_datetime("2024-07-19T09:00:00.123000000"),
|
||||
reasons: enumset::EnumSet::from_iter([GcBlockingReason::DetachAncestor]),
|
||||
}),
|
||||
last_aux_file_policy: Default::default(),
|
||||
archived_at: None,
|
||||
};
|
||||
|
||||
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
|
||||
fn parse_naive_datetime(s: &str) -> NaiveDateTime {
|
||||
chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f").unwrap()
|
||||
}
|
||||
|
||||
@@ -8,6 +8,9 @@ mod layer_desc;
|
||||
mod layer_name;
|
||||
pub mod merge_iterator;
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod split_writer;
|
||||
|
||||
use crate::context::{AccessStatsBehavior, RequestContext};
|
||||
use crate::repository::Value;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
@@ -432,39 +435,18 @@ impl ReadableLayer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Return value from [`Layer::get_value_reconstruct_data`]
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub enum ValueReconstructResult {
|
||||
/// Got all the data needed to reconstruct the requested page
|
||||
Complete,
|
||||
/// This layer didn't contain all the required data, the caller should look up
|
||||
/// the predecessor layer at the returned LSN and collect more data from there.
|
||||
Continue,
|
||||
|
||||
/// This layer didn't contain data needed to reconstruct the page version at
|
||||
/// the returned LSN. This is usually considered an error, but might be OK
|
||||
/// in some circumstances.
|
||||
Missing,
|
||||
}
|
||||
|
||||
/// Layers contain a hint indicating whether they are likely to be used for reads. This is a hint rather
|
||||
/// than an authoritative value, so that we do not have to update it synchronously when changing the visibility
|
||||
/// of layers (for example when creating a branch that makes some previously covered layers visible). It should
|
||||
/// be used for cache management but not for correctness-critical checks.
|
||||
#[derive(Default, Debug, Clone, PartialEq, Eq)]
|
||||
pub(crate) enum LayerVisibilityHint {
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub enum LayerVisibilityHint {
|
||||
/// A Visible layer might be read while serving a read, because there is not an image layer between it
|
||||
/// and a readable LSN (the tip of the branch or a child's branch point)
|
||||
Visible,
|
||||
/// A Covered layer probably won't be read right now, but _can_ be read in future if someone creates
|
||||
/// a branch or ephemeral endpoint at an LSN below the layer that covers this.
|
||||
#[allow(unused)]
|
||||
Covered,
|
||||
/// Calculating layer visibilty requires I/O, so until this has happened layers are loaded
|
||||
/// in this state. Note that newly written layers may be called Visible immediately, this uninitialized
|
||||
/// state is for when existing layers are constructed while loading a timeline.
|
||||
#[default]
|
||||
Uninitialized,
|
||||
}
|
||||
|
||||
pub(crate) struct LayerAccessStats(std::sync::atomic::AtomicU64);
|
||||
@@ -557,19 +539,25 @@ impl LayerAccessStats {
|
||||
self.record_residence_event_at(SystemTime::now())
|
||||
}
|
||||
|
||||
pub(crate) fn record_access_at(&self, now: SystemTime) {
|
||||
fn record_access_at(&self, now: SystemTime) -> bool {
|
||||
let (mut mask, mut value) = Self::to_low_res_timestamp(Self::ATIME_SHIFT, now);
|
||||
|
||||
// A layer which is accessed must be visible.
|
||||
mask |= 0x1 << Self::VISIBILITY_SHIFT;
|
||||
value |= 0x1 << Self::VISIBILITY_SHIFT;
|
||||
|
||||
self.write_bits(mask, value);
|
||||
let old_bits = self.write_bits(mask, value);
|
||||
!matches!(
|
||||
self.decode_visibility(old_bits),
|
||||
LayerVisibilityHint::Visible
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) fn record_access(&self, ctx: &RequestContext) {
|
||||
/// Returns true if we modified the layer's visibility to set it to Visible implicitly
|
||||
/// as a result of this access
|
||||
pub(crate) fn record_access(&self, ctx: &RequestContext) -> bool {
|
||||
if ctx.access_stats_behavior() == AccessStatsBehavior::Skip {
|
||||
return;
|
||||
return false;
|
||||
}
|
||||
|
||||
self.record_access_at(SystemTime::now())
|
||||
@@ -626,23 +614,30 @@ impl LayerAccessStats {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) {
|
||||
let value = match visibility {
|
||||
LayerVisibilityHint::Visible => 0x1 << Self::VISIBILITY_SHIFT,
|
||||
LayerVisibilityHint::Covered | LayerVisibilityHint::Uninitialized => 0x0,
|
||||
};
|
||||
|
||||
self.write_bits(0x1 << Self::VISIBILITY_SHIFT, value);
|
||||
}
|
||||
|
||||
pub(crate) fn visibility(&self) -> LayerVisibilityHint {
|
||||
let read = self.0.load(std::sync::atomic::Ordering::Relaxed);
|
||||
match (read >> Self::VISIBILITY_SHIFT) & 0x1 {
|
||||
/// Helper for extracting the visibility hint from the literal value of our inner u64
|
||||
fn decode_visibility(&self, bits: u64) -> LayerVisibilityHint {
|
||||
match (bits >> Self::VISIBILITY_SHIFT) & 0x1 {
|
||||
1 => LayerVisibilityHint::Visible,
|
||||
0 => LayerVisibilityHint::Covered,
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the old value which has been replaced
|
||||
pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) -> LayerVisibilityHint {
|
||||
let value = match visibility {
|
||||
LayerVisibilityHint::Visible => 0x1 << Self::VISIBILITY_SHIFT,
|
||||
LayerVisibilityHint::Covered => 0x0,
|
||||
};
|
||||
|
||||
let old_bits = self.write_bits(0x1 << Self::VISIBILITY_SHIFT, value);
|
||||
self.decode_visibility(old_bits)
|
||||
}
|
||||
|
||||
pub(crate) fn visibility(&self) -> LayerVisibilityHint {
|
||||
let read = self.0.load(std::sync::atomic::Ordering::Relaxed);
|
||||
self.decode_visibility(read)
|
||||
}
|
||||
}
|
||||
|
||||
/// Get a layer descriptor from a layer.
|
||||
|
||||
@@ -36,13 +36,12 @@ use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, Fi
|
||||
use crate::tenant::disk_btree::{
|
||||
DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
|
||||
};
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::vectored_blob_io::{
|
||||
BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
|
||||
VectoredReadPlanner,
|
||||
};
|
||||
use crate::tenant::{PageReconstructError, Timeline};
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
use crate::{walrecord, TEMP_FILE_SUFFIX};
|
||||
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||
@@ -72,10 +71,7 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use super::{
|
||||
AsLayerDesc, LayerAccessStats, LayerName, PersistentLayerDesc, ResidentLayer,
|
||||
ValuesReconstructState,
|
||||
};
|
||||
use super::{AsLayerDesc, LayerName, PersistentLayerDesc, ValuesReconstructState};
|
||||
|
||||
///
|
||||
/// Header stored in the beginning of the file
|
||||
@@ -200,7 +196,6 @@ impl DeltaKey {
|
||||
pub struct DeltaLayer {
|
||||
path: Utf8PathBuf,
|
||||
pub desc: PersistentLayerDesc,
|
||||
access_stats: LayerAccessStats,
|
||||
inner: OnceCell<Arc<DeltaLayerInner>>,
|
||||
}
|
||||
|
||||
@@ -299,7 +294,6 @@ impl DeltaLayer {
|
||||
/// not loaded already.
|
||||
///
|
||||
async fn load(&self, ctx: &RequestContext) -> Result<&Arc<DeltaLayerInner>> {
|
||||
self.access_stats.record_access(ctx);
|
||||
// Quick exit if already loaded
|
||||
self.inner
|
||||
.get_or_try_init(|| self.load_inner(ctx))
|
||||
@@ -350,7 +344,6 @@ impl DeltaLayer {
|
||||
summary.lsn_range,
|
||||
metadata.len(),
|
||||
),
|
||||
access_stats: Default::default(),
|
||||
inner: OnceCell::new(),
|
||||
})
|
||||
}
|
||||
@@ -373,7 +366,6 @@ impl DeltaLayer {
|
||||
/// 3. Call `finish`.
|
||||
///
|
||||
struct DeltaLayerWriterInner {
|
||||
conf: &'static PageServerConf,
|
||||
pub path: Utf8PathBuf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
@@ -384,6 +376,9 @@ struct DeltaLayerWriterInner {
|
||||
tree: DiskBtreeBuilder<BlockBuf, DELTA_KEY_SIZE>,
|
||||
|
||||
blob_writer: BlobWriter<true>,
|
||||
|
||||
// Number of key-lsns in the layer.
|
||||
num_keys: usize,
|
||||
}
|
||||
|
||||
impl DeltaLayerWriterInner {
|
||||
@@ -417,7 +412,6 @@ impl DeltaLayerWriterInner {
|
||||
let tree_builder = DiskBtreeBuilder::new(block_buf);
|
||||
|
||||
Ok(Self {
|
||||
conf,
|
||||
path,
|
||||
timeline_id,
|
||||
tenant_shard_id,
|
||||
@@ -425,6 +419,7 @@ impl DeltaLayerWriterInner {
|
||||
lsn_range,
|
||||
tree: tree_builder,
|
||||
blob_writer,
|
||||
num_keys: 0,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -475,6 +470,9 @@ impl DeltaLayerWriterInner {
|
||||
|
||||
let delta_key = DeltaKey::from_key_lsn(&key, lsn);
|
||||
let res = self.tree.append(&delta_key.0, blob_ref.0);
|
||||
|
||||
self.num_keys += 1;
|
||||
|
||||
(val, res.map_err(|e| anyhow::anyhow!(e)))
|
||||
}
|
||||
|
||||
@@ -488,11 +486,10 @@ impl DeltaLayerWriterInner {
|
||||
async fn finish(
|
||||
self,
|
||||
key_end: Key,
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ResidentLayer> {
|
||||
) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
|
||||
let temp_path = self.path.clone();
|
||||
let result = self.finish0(key_end, timeline, ctx).await;
|
||||
let result = self.finish0(key_end, ctx).await;
|
||||
if result.is_err() {
|
||||
tracing::info!(%temp_path, "cleaning up temporary file after error during writing");
|
||||
if let Err(e) = std::fs::remove_file(&temp_path) {
|
||||
@@ -505,9 +502,8 @@ impl DeltaLayerWriterInner {
|
||||
async fn finish0(
|
||||
self,
|
||||
key_end: Key,
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ResidentLayer> {
|
||||
) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
|
||||
let index_start_blk =
|
||||
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
|
||||
|
||||
@@ -572,11 +568,9 @@ impl DeltaLayerWriterInner {
|
||||
// fsync the file
|
||||
file.sync_all().await?;
|
||||
|
||||
let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
|
||||
trace!("created delta layer {}", self.path);
|
||||
|
||||
trace!("created delta layer {}", layer.local_path());
|
||||
|
||||
Ok(layer)
|
||||
Ok((desc, self.path))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -677,14 +671,20 @@ impl DeltaLayerWriter {
|
||||
pub(crate) async fn finish(
|
||||
mut self,
|
||||
key_end: Key,
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ResidentLayer> {
|
||||
self.inner
|
||||
.take()
|
||||
.unwrap()
|
||||
.finish(key_end, timeline, ctx)
|
||||
.await
|
||||
) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
|
||||
self.inner.take().unwrap().finish(key_end, ctx).await
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn num_keys(&self) -> usize {
|
||||
self.inner.as_ref().unwrap().num_keys
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn estimated_size(&self) -> u64 {
|
||||
let inner = self.inner.as_ref().unwrap();
|
||||
inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64
|
||||
}
|
||||
}
|
||||
|
||||
@@ -808,95 +808,6 @@ impl DeltaLayerInner {
|
||||
})
|
||||
}
|
||||
|
||||
pub(super) async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
let mut need_image = true;
|
||||
// Scan the page versions backwards, starting from `lsn`.
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
self.index_start_blk,
|
||||
self.index_root_blk,
|
||||
&block_reader,
|
||||
);
|
||||
let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1));
|
||||
|
||||
let mut offsets: Vec<(Lsn, u64)> = Vec::new();
|
||||
|
||||
tree_reader
|
||||
.visit(
|
||||
&search_key.0,
|
||||
VisitDirection::Backwards,
|
||||
|key, value| {
|
||||
let blob_ref = BlobRef(value);
|
||||
if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
|
||||
return false;
|
||||
}
|
||||
let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
|
||||
if entry_lsn < lsn_range.start {
|
||||
return false;
|
||||
}
|
||||
offsets.push((entry_lsn, blob_ref.pos()));
|
||||
|
||||
!blob_ref.will_init()
|
||||
},
|
||||
&RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::DeltaLayerBtreeNode)
|
||||
.build(),
|
||||
)
|
||||
.await?;
|
||||
|
||||
let ctx = &RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::DeltaLayerValue)
|
||||
.build();
|
||||
|
||||
// Ok, 'offsets' now contains the offsets of all the entries we need to read
|
||||
let cursor = block_reader.block_cursor();
|
||||
let mut buf = Vec::new();
|
||||
for (entry_lsn, pos) in offsets {
|
||||
cursor
|
||||
.read_blob_into_buf(pos, &mut buf, ctx)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to read blob from virtual file {}", self.file.path)
|
||||
})?;
|
||||
let val = Value::des(&buf).with_context(|| {
|
||||
format!(
|
||||
"Failed to deserialize file blob from virtual file {}",
|
||||
self.file.path
|
||||
)
|
||||
})?;
|
||||
match val {
|
||||
Value::Image(img) => {
|
||||
reconstruct_state.img = Some((entry_lsn, img));
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let will_init = rec.will_init();
|
||||
reconstruct_state.records.push((entry_lsn, rec));
|
||||
if will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If an older page image is needed to reconstruct the page, let the
|
||||
// caller know.
|
||||
if need_image {
|
||||
Ok(ValueReconstructResult::Continue)
|
||||
} else {
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
}
|
||||
}
|
||||
|
||||
// Look up the keys in the provided keyspace and update
|
||||
// the reconstruct state with whatever is found.
|
||||
//
|
||||
@@ -1669,8 +1580,9 @@ pub(crate) mod test {
|
||||
use super::*;
|
||||
use crate::repository::Value;
|
||||
use crate::tenant::harness::TIMELINE_ID;
|
||||
use crate::tenant::storage_layer::{Layer, ResidentLayer};
|
||||
use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
|
||||
use crate::tenant::Tenant;
|
||||
use crate::tenant::{Tenant, Timeline};
|
||||
use crate::{
|
||||
context::DownloadBehavior,
|
||||
task_mgr::TaskKind,
|
||||
@@ -1964,9 +1876,8 @@ pub(crate) mod test {
|
||||
res?;
|
||||
}
|
||||
|
||||
let resident = writer
|
||||
.finish(entries_meta.key_range.end, &timeline, &ctx)
|
||||
.await?;
|
||||
let (desc, path) = writer.finish(entries_meta.key_range.end, &ctx).await?;
|
||||
let resident = Layer::finish_creating(harness.conf, &timeline, desc, &path)?;
|
||||
|
||||
let inner = resident.get_as_delta(&ctx).await?;
|
||||
|
||||
@@ -2155,7 +2066,8 @@ pub(crate) mod test {
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let copied_layer = writer.finish(Key::MAX, &branch, ctx).await.unwrap();
|
||||
let (desc, path) = writer.finish(Key::MAX, ctx).await.unwrap();
|
||||
let copied_layer = Layer::finish_creating(tenant.conf, &branch, desc, &path).unwrap();
|
||||
|
||||
copied_layer.get_as_delta(ctx).await.unwrap();
|
||||
|
||||
@@ -2283,7 +2195,9 @@ pub(crate) mod test {
|
||||
for (key, lsn, value) in deltas {
|
||||
writer.put_value(key, lsn, value, ctx).await?;
|
||||
}
|
||||
let delta_layer = writer.finish(key_end, tline, ctx).await?;
|
||||
|
||||
let (desc, path) = writer.finish(key_end, ctx).await?;
|
||||
let delta_layer = Layer::finish_creating(tenant.conf, tline, desc, &path)?;
|
||||
|
||||
Ok::<_, anyhow::Error>(delta_layer)
|
||||
}
|
||||
|
||||
@@ -32,9 +32,6 @@ use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{
|
||||
DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
|
||||
};
|
||||
use crate::tenant::storage_layer::{
|
||||
LayerAccessStats, ValueReconstructResult, ValueReconstructState,
|
||||
};
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::vectored_blob_io::{
|
||||
BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
|
||||
@@ -137,7 +134,6 @@ pub struct ImageLayer {
|
||||
pub desc: PersistentLayerDesc,
|
||||
// This entry contains an image of all pages as of this LSN, should be the same as desc.lsn
|
||||
pub lsn: Lsn,
|
||||
access_stats: LayerAccessStats,
|
||||
inner: OnceCell<ImageLayerInner>,
|
||||
}
|
||||
|
||||
@@ -255,7 +251,6 @@ impl ImageLayer {
|
||||
/// not loaded already.
|
||||
///
|
||||
async fn load(&self, ctx: &RequestContext) -> Result<&ImageLayerInner> {
|
||||
self.access_stats.record_access(ctx);
|
||||
self.inner
|
||||
.get_or_try_init(|| self.load_inner(ctx))
|
||||
.await
|
||||
@@ -306,7 +301,6 @@ impl ImageLayer {
|
||||
metadata.len(),
|
||||
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
|
||||
lsn: summary.lsn,
|
||||
access_stats: Default::default(),
|
||||
inner: OnceCell::new(),
|
||||
})
|
||||
}
|
||||
@@ -429,46 +423,6 @@ impl ImageLayerInner {
|
||||
})
|
||||
}
|
||||
|
||||
pub(super) async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let tree_reader =
|
||||
DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);
|
||||
|
||||
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
|
||||
key.write_to_byte_slice(&mut keybuf);
|
||||
if let Some(offset) = tree_reader
|
||||
.get(
|
||||
&keybuf,
|
||||
&RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::ImageLayerBtreeNode)
|
||||
.build(),
|
||||
)
|
||||
.await?
|
||||
{
|
||||
let blob = block_reader
|
||||
.block_cursor()
|
||||
.read_blob(
|
||||
offset,
|
||||
&RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::ImageLayerValue)
|
||||
.build(),
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("failed to read value from offset {}", offset))?;
|
||||
let value = Bytes::from(blob);
|
||||
|
||||
reconstruct_state.img = Some((self.lsn, value));
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
} else {
|
||||
Ok(ValueReconstructResult::Missing)
|
||||
}
|
||||
}
|
||||
|
||||
// Look up the keys in the provided keyspace and update
|
||||
// the reconstruct state with whatever is found.
|
||||
pub(super) async fn get_values_reconstruct_data(
|
||||
@@ -742,11 +696,21 @@ struct ImageLayerWriterInner {
|
||||
// where we have chosen their compressed form
|
||||
uncompressed_bytes_chosen: u64,
|
||||
|
||||
// Number of keys in the layer.
|
||||
num_keys: usize,
|
||||
|
||||
blob_writer: BlobWriter<false>,
|
||||
tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
|
||||
|
||||
#[cfg_attr(not(feature = "testing"), allow(dead_code))]
|
||||
last_written_key: Key,
|
||||
}
|
||||
|
||||
impl ImageLayerWriterInner {
|
||||
fn size(&self) -> u64 {
|
||||
self.tree.borrow_writer().size() + self.blob_writer.size()
|
||||
}
|
||||
|
||||
///
|
||||
/// Start building a new image layer.
|
||||
///
|
||||
@@ -800,6 +764,8 @@ impl ImageLayerWriterInner {
|
||||
uncompressed_bytes: 0,
|
||||
uncompressed_bytes_eligible: 0,
|
||||
uncompressed_bytes_chosen: 0,
|
||||
num_keys: 0,
|
||||
last_written_key: Key::MIN,
|
||||
};
|
||||
|
||||
Ok(writer)
|
||||
@@ -820,6 +786,7 @@ impl ImageLayerWriterInner {
|
||||
let compression = self.conf.image_compression;
|
||||
let uncompressed_len = img.len() as u64;
|
||||
self.uncompressed_bytes += uncompressed_len;
|
||||
self.num_keys += 1;
|
||||
let (_img, res) = self
|
||||
.blob_writer
|
||||
.write_blob_maybe_compressed(img, ctx, compression)
|
||||
@@ -839,6 +806,11 @@ impl ImageLayerWriterInner {
|
||||
key.write_to_byte_slice(&mut keybuf);
|
||||
self.tree.append(&keybuf, off)?;
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
{
|
||||
self.last_written_key = key;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -849,6 +821,7 @@ impl ImageLayerWriterInner {
|
||||
self,
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
end_key: Option<Key>,
|
||||
) -> anyhow::Result<ResidentLayer> {
|
||||
let index_start_blk =
|
||||
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
|
||||
@@ -899,11 +872,23 @@ impl ImageLayerWriterInner {
|
||||
let desc = PersistentLayerDesc::new_img(
|
||||
self.tenant_shard_id,
|
||||
self.timeline_id,
|
||||
self.key_range.clone(),
|
||||
if let Some(end_key) = end_key {
|
||||
self.key_range.start..end_key
|
||||
} else {
|
||||
self.key_range.clone()
|
||||
},
|
||||
self.lsn,
|
||||
metadata.len(),
|
||||
);
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
if let Some(end_key) = end_key {
|
||||
assert!(
|
||||
self.last_written_key < end_key,
|
||||
"written key violates end_key range"
|
||||
);
|
||||
}
|
||||
|
||||
// Note: Because we open the file in write-only mode, we cannot
|
||||
// reuse the same VirtualFile for reading later. That's why we don't
|
||||
// set inner.file here. The first read will have to re-open it.
|
||||
@@ -980,6 +965,18 @@ impl ImageLayerWriter {
|
||||
self.inner.as_mut().unwrap().put_image(key, img, ctx).await
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
/// Estimated size of the image layer.
|
||||
pub(crate) fn estimated_size(&self) -> u64 {
|
||||
let inner = self.inner.as_ref().unwrap();
|
||||
inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn num_keys(&self) -> usize {
|
||||
self.inner.as_ref().unwrap().num_keys
|
||||
}
|
||||
|
||||
///
|
||||
/// Finish writing the image layer.
|
||||
///
|
||||
@@ -988,7 +985,26 @@ impl ImageLayerWriter {
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<super::ResidentLayer> {
|
||||
self.inner.take().unwrap().finish(timeline, ctx).await
|
||||
self.inner.take().unwrap().finish(timeline, ctx, None).await
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
/// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive.
|
||||
pub(super) async fn finish_with_end_key(
|
||||
mut self,
|
||||
timeline: &Arc<Timeline>,
|
||||
end_key: Key,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<super::ResidentLayer> {
|
||||
self.inner
|
||||
.take()
|
||||
.unwrap()
|
||||
.finish(timeline, ctx, Some(end_key))
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) fn size(&self) -> u64 {
|
||||
self.inner.as_ref().unwrap().size()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -10,11 +10,11 @@ use crate::page_cache::PAGE_SZ;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef};
|
||||
use crate::tenant::ephemeral_file::EphemeralFile;
|
||||
use crate::tenant::storage_layer::ValueReconstructResult;
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::{PageReconstructError, Timeline};
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::{l0_flush, page_cache, walrecord};
|
||||
use anyhow::{anyhow, ensure, Result};
|
||||
use anyhow::{anyhow, Result};
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::InMemoryLayerInfo;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
@@ -34,8 +34,7 @@ use std::sync::atomic::{AtomicU64, AtomicUsize};
|
||||
use tokio::sync::{RwLock, RwLockWriteGuard};
|
||||
|
||||
use super::{
|
||||
DeltaLayerWriter, ResidentLayer, ValueReconstructSituation, ValueReconstructState,
|
||||
ValuesReconstructState,
|
||||
DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState,
|
||||
};
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
|
||||
@@ -55,9 +54,6 @@ pub struct InMemoryLayer {
|
||||
/// Writes are only allowed when this is `None`.
|
||||
pub(crate) end_lsn: OnceLock<Lsn>,
|
||||
|
||||
/// Used for traversal path. Cached representation of the in-memory layer before frozen.
|
||||
local_path_str: Arc<str>,
|
||||
|
||||
/// Used for traversal path. Cached representation of the in-memory layer after frozen.
|
||||
frozen_local_path_str: OnceLock<Arc<str>>,
|
||||
|
||||
@@ -248,12 +244,6 @@ impl InMemoryLayer {
|
||||
self.start_lsn..self.end_lsn_or_max()
|
||||
}
|
||||
|
||||
pub(crate) fn local_path_str(&self) -> &Arc<str> {
|
||||
self.frozen_local_path_str
|
||||
.get()
|
||||
.unwrap_or(&self.local_path_str)
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
///
|
||||
/// this is likely completly unused
|
||||
@@ -303,60 +293,6 @@ impl InMemoryLayer {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Look up given value in the layer.
|
||||
pub(crate) async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
ensure!(lsn_range.start >= self.start_lsn);
|
||||
let mut need_image = true;
|
||||
|
||||
let ctx = RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::InMemoryLayer)
|
||||
.build();
|
||||
|
||||
let inner = self.inner.read().await;
|
||||
|
||||
let reader = inner.file.block_cursor();
|
||||
|
||||
// Scan the page versions backwards, starting from `lsn`.
|
||||
if let Some(vec_map) = inner.index.get(&key) {
|
||||
let slice = vec_map.slice_range(lsn_range);
|
||||
for (entry_lsn, pos) in slice.iter().rev() {
|
||||
let buf = reader.read_blob(*pos, &ctx).await?;
|
||||
let value = Value::des(&buf)?;
|
||||
match value {
|
||||
Value::Image(img) => {
|
||||
reconstruct_state.img = Some((*entry_lsn, img));
|
||||
return Ok(ValueReconstructResult::Complete);
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let will_init = rec.will_init();
|
||||
reconstruct_state.records.push((*entry_lsn, rec));
|
||||
if will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// release lock on 'inner'
|
||||
|
||||
// If an older page image is needed to reconstruct the page, let the
|
||||
// caller know.
|
||||
if need_image {
|
||||
Ok(ValueReconstructResult::Continue)
|
||||
} else {
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
}
|
||||
}
|
||||
|
||||
// Look up the keys in the provided keyspace and update
|
||||
// the reconstruct state with whatever is found.
|
||||
//
|
||||
@@ -458,11 +394,6 @@ impl InMemoryLayer {
|
||||
|
||||
Ok(InMemoryLayer {
|
||||
file_id: key,
|
||||
local_path_str: {
|
||||
let mut buf = String::new();
|
||||
inmem_layer_log_display(&mut buf, timeline_id, start_lsn, Lsn::MAX).unwrap();
|
||||
buf.into()
|
||||
},
|
||||
frozen_local_path_str: OnceLock::new(),
|
||||
conf,
|
||||
timeline_id,
|
||||
@@ -482,8 +413,7 @@ impl InMemoryLayer {
|
||||
|
||||
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
|
||||
/// Adds the page version to the in-memory tree
|
||||
|
||||
pub(crate) async fn put_value(
|
||||
pub async fn put_value(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
@@ -548,8 +478,6 @@ impl InMemoryLayer {
|
||||
/// Records the end_lsn for non-dropped layers.
|
||||
/// `end_lsn` is exclusive
|
||||
pub async fn freeze(&self, end_lsn: Lsn) {
|
||||
let inner = self.inner.write().await;
|
||||
|
||||
assert!(
|
||||
self.start_lsn < end_lsn,
|
||||
"{} >= {}",
|
||||
@@ -567,9 +495,13 @@ impl InMemoryLayer {
|
||||
})
|
||||
.expect("frozen_local_path_str set only once");
|
||||
|
||||
for vec_map in inner.index.values() {
|
||||
for (lsn, _pos) in vec_map.as_slice() {
|
||||
assert!(*lsn < end_lsn);
|
||||
#[cfg(debug_assertions)]
|
||||
{
|
||||
let inner = self.inner.write().await;
|
||||
for vec_map in inner.index.values() {
|
||||
for (lsn, _pos) in vec_map.as_slice() {
|
||||
assert!(*lsn < end_lsn);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -579,12 +511,12 @@ impl InMemoryLayer {
|
||||
/// if there are no matching keys.
|
||||
///
|
||||
/// Returns a new delta layer with all the same data as this in-memory layer
|
||||
pub(crate) async fn write_to_disk(
|
||||
pub async fn write_to_disk(
|
||||
&self,
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
key_range: Option<Range<Key>>,
|
||||
) -> Result<Option<ResidentLayer>> {
|
||||
l0_flush_global_state: &l0_flush::Inner,
|
||||
) -> Result<Option<(PersistentLayerDesc, Utf8PathBuf)>> {
|
||||
// Grab the lock in read-mode. We hold it over the I/O, but because this
|
||||
// layer is not writeable anymore, no one should be trying to acquire the
|
||||
// write lock on it, so we shouldn't block anyone. There's one exception
|
||||
@@ -596,9 +528,8 @@ impl InMemoryLayer {
|
||||
// rare though, so we just accept the potential latency hit for now.
|
||||
let inner = self.inner.read().await;
|
||||
|
||||
let l0_flush_global_state = timeline.l0_flush_global_state.inner().clone();
|
||||
use l0_flush::Inner;
|
||||
let _concurrency_permit = match &*l0_flush_global_state {
|
||||
let _concurrency_permit = match l0_flush_global_state {
|
||||
Inner::PageCached => None,
|
||||
Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await),
|
||||
};
|
||||
@@ -628,7 +559,7 @@ impl InMemoryLayer {
|
||||
)
|
||||
.await?;
|
||||
|
||||
match &*l0_flush_global_state {
|
||||
match l0_flush_global_state {
|
||||
l0_flush::Inner::PageCached => {
|
||||
let ctx = RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::InMemoryLayer)
|
||||
@@ -693,7 +624,7 @@ impl InMemoryLayer {
|
||||
}
|
||||
|
||||
// MAX is used here because we identify L0 layers by full key range
|
||||
let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, ctx).await?;
|
||||
let (desc, path) = delta_layer_writer.finish(Key::MAX, ctx).await?;
|
||||
|
||||
// Hold the permit until all the IO is done, including the fsync in `delta_layer_writer.finish()``.
|
||||
//
|
||||
@@ -705,6 +636,6 @@ impl InMemoryLayer {
|
||||
// we dirtied when writing to the filesystem have been flushed and marked !dirty.
|
||||
drop(_concurrency_permit);
|
||||
|
||||
Ok(Some(delta_layer))
|
||||
Ok(Some((desc, path)))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -24,7 +24,7 @@ use super::delta_layer::{self, DeltaEntry};
|
||||
use super::image_layer::{self};
|
||||
use super::{
|
||||
AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
|
||||
PersistentLayerDesc, ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
|
||||
LayerVisibilityHint, PersistentLayerDesc, ValuesReconstructState,
|
||||
};
|
||||
|
||||
use utils::generation::Generation;
|
||||
@@ -246,7 +246,7 @@ impl Layer {
|
||||
&timeline.generation,
|
||||
);
|
||||
|
||||
let layer = LayerInner::new(
|
||||
LayerInner::new(
|
||||
conf,
|
||||
timeline,
|
||||
local_path,
|
||||
@@ -254,14 +254,7 @@ impl Layer {
|
||||
Some(inner),
|
||||
timeline.generation,
|
||||
timeline.get_shard_index(),
|
||||
);
|
||||
|
||||
// Newly created layers are marked visible by default: the usual case is that they were created to be read.
|
||||
layer
|
||||
.access_stats
|
||||
.set_visibility(super::LayerVisibilityHint::Visible);
|
||||
|
||||
layer
|
||||
)
|
||||
}));
|
||||
|
||||
let downloaded = resident.expect("just initialized");
|
||||
@@ -307,42 +300,6 @@ impl Layer {
|
||||
self.0.delete_on_drop();
|
||||
}
|
||||
|
||||
/// Return data needed to reconstruct given page at LSN.
|
||||
///
|
||||
/// It is up to the caller to collect more data from the previous layer and
|
||||
/// perform WAL redo, if necessary.
|
||||
///
|
||||
/// # Cancellation-Safety
|
||||
///
|
||||
/// This method is cancellation-safe.
|
||||
pub(crate) async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_data: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
use anyhow::ensure;
|
||||
|
||||
let layer = self.0.get_or_maybe_download(true, Some(ctx)).await?;
|
||||
self.0.access_stats.record_access(ctx);
|
||||
|
||||
if self.layer_desc().is_delta {
|
||||
ensure!(lsn_range.start >= self.layer_desc().lsn_range.start);
|
||||
ensure!(self.layer_desc().key_range.contains(&key));
|
||||
} else {
|
||||
ensure!(self.layer_desc().key_range.contains(&key));
|
||||
ensure!(lsn_range.start >= self.layer_desc().image_layer_lsn());
|
||||
ensure!(lsn_range.end >= self.layer_desc().image_layer_lsn());
|
||||
}
|
||||
|
||||
layer
|
||||
.get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx)
|
||||
.instrument(tracing::debug_span!("get_value_reconstruct_data", layer=%self))
|
||||
.await
|
||||
.with_context(|| format!("get_value_reconstruct_data for layer {self}"))
|
||||
}
|
||||
|
||||
pub(crate) async fn get_values_reconstruct_data(
|
||||
&self,
|
||||
keyspace: KeySpace,
|
||||
@@ -359,7 +316,7 @@ impl Layer {
|
||||
other => GetVectoredError::Other(anyhow::anyhow!(other)),
|
||||
})?;
|
||||
|
||||
self.0.access_stats.record_access(ctx);
|
||||
self.record_access(ctx);
|
||||
|
||||
layer
|
||||
.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx)
|
||||
@@ -439,18 +396,18 @@ impl Layer {
|
||||
self.0.info(reset)
|
||||
}
|
||||
|
||||
pub(crate) fn access_stats(&self) -> &LayerAccessStats {
|
||||
&self.0.access_stats
|
||||
pub(crate) fn latest_activity(&self) -> SystemTime {
|
||||
self.0.access_stats.latest_activity()
|
||||
}
|
||||
|
||||
pub(crate) fn visibility(&self) -> LayerVisibilityHint {
|
||||
self.0.access_stats.visibility()
|
||||
}
|
||||
|
||||
pub(crate) fn local_path(&self) -> &Utf8Path {
|
||||
&self.0.path
|
||||
}
|
||||
|
||||
pub(crate) fn debug_str(&self) -> &Arc<str> {
|
||||
&self.0.debug_str
|
||||
}
|
||||
|
||||
pub(crate) fn metadata(&self) -> LayerFileMetadata {
|
||||
self.0.metadata()
|
||||
}
|
||||
@@ -493,13 +450,57 @@ impl Layer {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn record_access(&self, ctx: &RequestContext) {
|
||||
if self.0.access_stats.record_access(ctx) {
|
||||
// Visibility was modified to Visible
|
||||
tracing::info!(
|
||||
"Layer {} became visible as a result of access",
|
||||
self.0.desc.key()
|
||||
);
|
||||
if let Some(tl) = self.0.timeline.upgrade() {
|
||||
tl.metrics
|
||||
.visible_physical_size_gauge
|
||||
.add(self.0.desc.file_size)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) {
|
||||
let old_visibility = self.0.access_stats.set_visibility(visibility.clone());
|
||||
use LayerVisibilityHint::*;
|
||||
match (old_visibility, visibility) {
|
||||
(Visible, Covered) => {
|
||||
// Subtract this layer's contribution to the visible size metric
|
||||
if let Some(tl) = self.0.timeline.upgrade() {
|
||||
debug_assert!(
|
||||
tl.metrics.visible_physical_size_gauge.get() >= self.0.desc.file_size
|
||||
);
|
||||
tl.metrics
|
||||
.visible_physical_size_gauge
|
||||
.sub(self.0.desc.file_size)
|
||||
}
|
||||
}
|
||||
(Covered, Visible) => {
|
||||
// Add this layer's contribution to the visible size metric
|
||||
if let Some(tl) = self.0.timeline.upgrade() {
|
||||
tl.metrics
|
||||
.visible_physical_size_gauge
|
||||
.add(self.0.desc.file_size)
|
||||
}
|
||||
}
|
||||
(Covered, Covered) | (Visible, Visible) => {
|
||||
// no change
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The download-ness ([`DownloadedLayer`]) can be either resident or wanted evicted.
|
||||
///
|
||||
/// However when we want something evicted, we cannot evict it right away as there might be current
|
||||
/// reads happening on it. For example: it has been searched from [`LayerMap::search`] but not yet
|
||||
/// read with [`Layer::get_value_reconstruct_data`].
|
||||
/// read with [`Layer::get_values_reconstruct_data`].
|
||||
///
|
||||
/// [`LayerMap::search`]: crate::tenant::layer_map::LayerMap::search
|
||||
#[derive(Debug)]
|
||||
@@ -580,9 +581,6 @@ struct LayerInner {
|
||||
/// Full path to the file; unclear if this should exist anymore.
|
||||
path: Utf8PathBuf,
|
||||
|
||||
/// String representation of the layer, used for traversal id.
|
||||
debug_str: Arc<str>,
|
||||
|
||||
desc: PersistentLayerDesc,
|
||||
|
||||
/// Timeline access is needed for remote timeline client and metrics.
|
||||
@@ -693,6 +691,16 @@ impl Drop for LayerInner {
|
||||
timeline.metrics.layer_count_image.dec();
|
||||
timeline.metrics.layer_size_image.sub(self.desc.file_size);
|
||||
}
|
||||
|
||||
if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) {
|
||||
debug_assert!(
|
||||
timeline.metrics.visible_physical_size_gauge.get() >= self.desc.file_size
|
||||
);
|
||||
timeline
|
||||
.metrics
|
||||
.visible_physical_size_gauge
|
||||
.sub(self.desc.file_size);
|
||||
}
|
||||
}
|
||||
|
||||
if !*self.wanted_deleted.get_mut() {
|
||||
@@ -801,11 +809,14 @@ impl LayerInner {
|
||||
timeline.metrics.layer_size_image.add(desc.file_size);
|
||||
}
|
||||
|
||||
// New layers are visible by default. This metric is later updated on drop or in set_visibility
|
||||
timeline
|
||||
.metrics
|
||||
.visible_physical_size_gauge
|
||||
.add(desc.file_size);
|
||||
|
||||
LayerInner {
|
||||
conf,
|
||||
debug_str: {
|
||||
format!("timelines/{}/{}", timeline.timeline_id, desc.layer_name()).into()
|
||||
},
|
||||
path: local_path,
|
||||
desc,
|
||||
timeline: Arc::downgrade(timeline),
|
||||
@@ -1726,28 +1737,6 @@ impl DownloadedLayer {
|
||||
.map_err(|e| anyhow::anyhow!("layer load failed earlier: {e}"))
|
||||
}
|
||||
|
||||
async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_data: &mut ValueReconstructState,
|
||||
owner: &Arc<LayerInner>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
use LayerKind::*;
|
||||
|
||||
match self.get(owner, ctx).await? {
|
||||
Delta(d) => {
|
||||
d.get_value_reconstruct_data(key, lsn_range, reconstruct_data, ctx)
|
||||
.await
|
||||
}
|
||||
Image(i) => {
|
||||
i.get_value_reconstruct_data(key, reconstruct_data, ctx)
|
||||
.await
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_values_reconstruct_data(
|
||||
&self,
|
||||
keyspace: KeySpace,
|
||||
@@ -1846,7 +1835,7 @@ impl ResidentLayer {
|
||||
// this is valid because the DownloadedLayer::kind is a OnceCell, not a
|
||||
// Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
|
||||
// while it's being held.
|
||||
owner.access_stats.record_access(ctx);
|
||||
self.owner.record_access(ctx);
|
||||
|
||||
delta_layer::DeltaLayerInner::load_keys(d, ctx)
|
||||
.await
|
||||
|
||||
@@ -50,13 +50,26 @@ async fn smoke_test() {
|
||||
// all layers created at pageserver are like `layer`, initialized with strong
|
||||
// Arc<DownloadedLayer>.
|
||||
|
||||
let controlfile_keyspace = KeySpace {
|
||||
ranges: vec![CONTROLFILE_KEY..CONTROLFILE_KEY.next()],
|
||||
};
|
||||
|
||||
let img_before = {
|
||||
let mut data = ValueReconstructState::default();
|
||||
let mut data = ValuesReconstructState::default();
|
||||
layer
|
||||
.get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx)
|
||||
.get_values_reconstruct_data(
|
||||
controlfile_keyspace.clone(),
|
||||
Lsn(0x10)..Lsn(0x11),
|
||||
&mut data,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
data.img
|
||||
data.keys
|
||||
.remove(&CONTROLFILE_KEY)
|
||||
.expect("must be present")
|
||||
.expect("should not error")
|
||||
.img
|
||||
.take()
|
||||
.expect("tenant harness writes the control file")
|
||||
};
|
||||
@@ -74,13 +87,24 @@ async fn smoke_test() {
|
||||
|
||||
// on accesses when the layer is evicted, it will automatically be downloaded.
|
||||
let img_after = {
|
||||
let mut data = ValueReconstructState::default();
|
||||
let mut data = ValuesReconstructState::default();
|
||||
layer
|
||||
.get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx)
|
||||
.get_values_reconstruct_data(
|
||||
controlfile_keyspace.clone(),
|
||||
Lsn(0x10)..Lsn(0x11),
|
||||
&mut data,
|
||||
&ctx,
|
||||
)
|
||||
.instrument(download_span.clone())
|
||||
.await
|
||||
.unwrap();
|
||||
data.img.take().unwrap()
|
||||
data.keys
|
||||
.remove(&CONTROLFILE_KEY)
|
||||
.expect("must be present")
|
||||
.expect("should not error")
|
||||
.img
|
||||
.take()
|
||||
.expect("tenant harness writes the control file")
|
||||
};
|
||||
|
||||
assert_eq!(img_before, img_after);
|
||||
@@ -830,7 +854,7 @@ async fn eviction_cancellation_on_drop() {
|
||||
fn layer_size() {
|
||||
assert_eq!(size_of::<LayerAccessStats>(), 8);
|
||||
assert_eq!(size_of::<PersistentLayerDesc>(), 104);
|
||||
assert_eq!(size_of::<LayerInner>(), 312);
|
||||
assert_eq!(size_of::<LayerInner>(), 296);
|
||||
// it also has the utf8 path
|
||||
}
|
||||
|
||||
|
||||
@@ -41,6 +41,20 @@ pub struct PersistentLayerKey {
|
||||
pub is_delta: bool,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for PersistentLayerKey {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"{}..{} {}..{} is_delta={}",
|
||||
self.key_range.start,
|
||||
self.key_range.end,
|
||||
self.lsn_range.start,
|
||||
self.lsn_range.end,
|
||||
self.is_delta
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl PersistentLayerDesc {
|
||||
pub fn key(&self) -> PersistentLayerKey {
|
||||
PersistentLayerKey {
|
||||
|
||||
454
pageserver/src/tenant/storage_layer/split_writer.rs
Normal file
454
pageserver/src/tenant/storage_layer/split_writer.rs
Normal file
@@ -0,0 +1,454 @@
|
||||
use std::{ops::Range, sync::Arc};
|
||||
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::key::{Key, KEY_SIZE};
|
||||
use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId};
|
||||
|
||||
use crate::tenant::storage_layer::Layer;
|
||||
use crate::{config::PageServerConf, context::RequestContext, repository::Value, tenant::Timeline};
|
||||
|
||||
use super::{DeltaLayerWriter, ImageLayerWriter, ResidentLayer};
|
||||
|
||||
/// An image writer that takes images and produces multiple image layers. The interface does not
|
||||
/// guarantee atomicity (i.e., if the image layer generation fails, there might be leftover files
|
||||
/// to be cleaned up)
|
||||
#[must_use]
|
||||
pub struct SplitImageLayerWriter {
|
||||
inner: ImageLayerWriter,
|
||||
target_layer_size: u64,
|
||||
generated_layers: Vec<ResidentLayer>,
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
lsn: Lsn,
|
||||
}
|
||||
|
||||
impl SplitImageLayerWriter {
|
||||
pub async fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
start_key: Key,
|
||||
lsn: Lsn,
|
||||
target_layer_size: u64,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Self> {
|
||||
Ok(Self {
|
||||
target_layer_size,
|
||||
inner: ImageLayerWriter::new(
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_shard_id,
|
||||
&(start_key..Key::MAX),
|
||||
lsn,
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
generated_layers: Vec::new(),
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_shard_id,
|
||||
lsn,
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn put_image(
|
||||
&mut self,
|
||||
key: Key,
|
||||
img: Bytes,
|
||||
tline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
// The current estimation is an upper bound of the space that the key/image could take
|
||||
// because we did not consider compression in this estimation. The resulting image layer
|
||||
// could be smaller than the target size.
|
||||
let addition_size_estimation = KEY_SIZE as u64 + img.len() as u64;
|
||||
if self.inner.num_keys() >= 1
|
||||
&& self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
|
||||
{
|
||||
let next_image_writer = ImageLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
&(key..Key::MAX),
|
||||
self.lsn,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer);
|
||||
self.generated_layers.push(
|
||||
prev_image_writer
|
||||
.finish_with_end_key(tline, key, ctx)
|
||||
.await?,
|
||||
);
|
||||
}
|
||||
self.inner.put_image(key, img, ctx).await
|
||||
}
|
||||
|
||||
pub(crate) async fn finish(
|
||||
self,
|
||||
tline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
end_key: Key,
|
||||
) -> anyhow::Result<Vec<ResidentLayer>> {
|
||||
let Self {
|
||||
mut generated_layers,
|
||||
inner,
|
||||
..
|
||||
} = self;
|
||||
generated_layers.push(inner.finish_with_end_key(tline, end_key, ctx).await?);
|
||||
Ok(generated_layers)
|
||||
}
|
||||
|
||||
/// When split writer fails, the caller should call this function and handle partially generated layers.
|
||||
#[allow(dead_code)]
|
||||
pub(crate) async fn take(self) -> anyhow::Result<(Vec<ResidentLayer>, ImageLayerWriter)> {
|
||||
Ok((self.generated_layers, self.inner))
|
||||
}
|
||||
}
|
||||
|
||||
/// A delta writer that takes key-lsn-values and produces multiple delta layers. The interface does not
|
||||
/// guarantee atomicity (i.e., if the delta layer generation fails, there might be leftover files
|
||||
/// to be cleaned up).
|
||||
#[must_use]
|
||||
pub struct SplitDeltaLayerWriter {
|
||||
inner: DeltaLayerWriter,
|
||||
target_layer_size: u64,
|
||||
generated_layers: Vec<ResidentLayer>,
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
lsn_range: Range<Lsn>,
|
||||
}
|
||||
|
||||
impl SplitDeltaLayerWriter {
|
||||
pub async fn new(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
start_key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
target_layer_size: u64,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Self> {
|
||||
Ok(Self {
|
||||
target_layer_size,
|
||||
inner: DeltaLayerWriter::new(
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_shard_id,
|
||||
start_key,
|
||||
lsn_range.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
generated_layers: Vec::new(),
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_shard_id,
|
||||
lsn_range,
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn put_value(
|
||||
&mut self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
val: Value,
|
||||
tline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
// The current estimation is key size plus LSN size plus value size estimation. This is not an accurate
|
||||
// number, and therefore the final layer size could be a little bit larger or smaller than the target.
|
||||
let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */;
|
||||
if self.inner.num_keys() >= 1
|
||||
&& self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
|
||||
{
|
||||
let next_delta_writer = DeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
key,
|
||||
self.lsn_range.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer);
|
||||
let (desc, path) = prev_delta_writer.finish(key, ctx).await?;
|
||||
let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
|
||||
self.generated_layers.push(delta_layer);
|
||||
}
|
||||
self.inner.put_value(key, lsn, val, ctx).await
|
||||
}
|
||||
|
||||
pub(crate) async fn finish(
|
||||
self,
|
||||
tline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
end_key: Key,
|
||||
) -> anyhow::Result<Vec<ResidentLayer>> {
|
||||
let Self {
|
||||
mut generated_layers,
|
||||
inner,
|
||||
..
|
||||
} = self;
|
||||
|
||||
let (desc, path) = inner.finish(end_key, ctx).await?;
|
||||
let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
|
||||
generated_layers.push(delta_layer);
|
||||
Ok(generated_layers)
|
||||
}
|
||||
|
||||
/// When split writer fails, the caller should call this function and handle partially generated layers.
|
||||
#[allow(dead_code)]
|
||||
pub(crate) async fn take(self) -> anyhow::Result<(Vec<ResidentLayer>, DeltaLayerWriter)> {
|
||||
Ok((self.generated_layers, self.inner))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::{
|
||||
tenant::{
|
||||
harness::{TenantHarness, TIMELINE_ID},
|
||||
storage_layer::AsLayerDesc,
|
||||
},
|
||||
DEFAULT_PG_VERSION,
|
||||
};
|
||||
|
||||
use super::*;
|
||||
|
||||
fn get_key(id: u32) -> Key {
|
||||
let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
|
||||
key.field6 = id;
|
||||
key
|
||||
}
|
||||
|
||||
fn get_img(id: u32) -> Bytes {
|
||||
format!("{id:064}").into()
|
||||
}
|
||||
|
||||
fn get_large_img() -> Bytes {
|
||||
vec![0; 8192].into()
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn write_one_image() {
|
||||
let harness = TenantHarness::create("split_writer_write_one_image")
|
||||
.await
|
||||
.unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let mut image_writer = SplitImageLayerWriter::new(
|
||||
tenant.conf,
|
||||
tline.timeline_id,
|
||||
tenant.tenant_shard_id,
|
||||
get_key(0),
|
||||
Lsn(0x18),
|
||||
4 * 1024 * 1024,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let mut delta_writer = SplitDeltaLayerWriter::new(
|
||||
tenant.conf,
|
||||
tline.timeline_id,
|
||||
tenant.tenant_shard_id,
|
||||
get_key(0),
|
||||
Lsn(0x18)..Lsn(0x20),
|
||||
4 * 1024 * 1024,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
image_writer
|
||||
.put_image(get_key(0), get_img(0), &tline, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let layers = image_writer
|
||||
.finish(&tline, &ctx, get_key(10))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(layers.len(), 1);
|
||||
|
||||
delta_writer
|
||||
.put_value(
|
||||
get_key(0),
|
||||
Lsn(0x18),
|
||||
Value::Image(get_img(0)),
|
||||
&tline,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let layers = delta_writer
|
||||
.finish(&tline, &ctx, get_key(10))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(layers.len(), 1);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn write_split() {
|
||||
let harness = TenantHarness::create("split_writer_write_split")
|
||||
.await
|
||||
.unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let mut image_writer = SplitImageLayerWriter::new(
|
||||
tenant.conf,
|
||||
tline.timeline_id,
|
||||
tenant.tenant_shard_id,
|
||||
get_key(0),
|
||||
Lsn(0x18),
|
||||
4 * 1024 * 1024,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let mut delta_writer = SplitDeltaLayerWriter::new(
|
||||
tenant.conf,
|
||||
tline.timeline_id,
|
||||
tenant.tenant_shard_id,
|
||||
get_key(0),
|
||||
Lsn(0x18)..Lsn(0x20),
|
||||
4 * 1024 * 1024,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
const N: usize = 2000;
|
||||
for i in 0..N {
|
||||
let i = i as u32;
|
||||
image_writer
|
||||
.put_image(get_key(i), get_large_img(), &tline, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
delta_writer
|
||||
.put_value(
|
||||
get_key(i),
|
||||
Lsn(0x20),
|
||||
Value::Image(get_large_img()),
|
||||
&tline,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
let image_layers = image_writer
|
||||
.finish(&tline, &ctx, get_key(N as u32))
|
||||
.await
|
||||
.unwrap();
|
||||
let delta_layers = delta_writer
|
||||
.finish(&tline, &ctx, get_key(N as u32))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(image_layers.len(), N / 512 + 1);
|
||||
assert_eq!(delta_layers.len(), N / 512 + 1);
|
||||
for idx in 0..image_layers.len() {
|
||||
assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN);
|
||||
assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX);
|
||||
assert_ne!(delta_layers[idx].layer_desc().key_range.start, Key::MIN);
|
||||
assert_ne!(delta_layers[idx].layer_desc().key_range.end, Key::MAX);
|
||||
if idx > 0 {
|
||||
assert_eq!(
|
||||
image_layers[idx - 1].layer_desc().key_range.end,
|
||||
image_layers[idx].layer_desc().key_range.start
|
||||
);
|
||||
assert_eq!(
|
||||
delta_layers[idx - 1].layer_desc().key_range.end,
|
||||
delta_layers[idx].layer_desc().key_range.start
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn write_large_img() {
|
||||
let harness = TenantHarness::create("split_writer_write_large_img")
|
||||
.await
|
||||
.unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let mut image_writer = SplitImageLayerWriter::new(
|
||||
tenant.conf,
|
||||
tline.timeline_id,
|
||||
tenant.tenant_shard_id,
|
||||
get_key(0),
|
||||
Lsn(0x18),
|
||||
4 * 1024,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let mut delta_writer = SplitDeltaLayerWriter::new(
|
||||
tenant.conf,
|
||||
tline.timeline_id,
|
||||
tenant.tenant_shard_id,
|
||||
get_key(0),
|
||||
Lsn(0x18)..Lsn(0x20),
|
||||
4 * 1024,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
image_writer
|
||||
.put_image(get_key(0), get_img(0), &tline, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
image_writer
|
||||
.put_image(get_key(1), get_large_img(), &tline, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let layers = image_writer
|
||||
.finish(&tline, &ctx, get_key(10))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(layers.len(), 2);
|
||||
|
||||
delta_writer
|
||||
.put_value(
|
||||
get_key(0),
|
||||
Lsn(0x18),
|
||||
Value::Image(get_img(0)),
|
||||
&tline,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
delta_writer
|
||||
.put_value(
|
||||
get_key(1),
|
||||
Lsn(0x1A),
|
||||
Value::Image(get_large_img()),
|
||||
&tline,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let layers = delta_writer
|
||||
.finish(&tline, &ctx, get_key(10))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(layers.len(), 2);
|
||||
}
|
||||
}
|
||||
@@ -407,9 +407,16 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
error_run_count += 1;
|
||||
let wait_duration = Duration::from_secs_f64(wait_duration);
|
||||
|
||||
error!(
|
||||
"Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
|
||||
);
|
||||
if matches!(e, crate::tenant::GcError::TimelineCancelled) {
|
||||
// Timeline was cancelled during gc. We might either be in an event
|
||||
// that affects the entire tenant (tenant deletion, pageserver shutdown),
|
||||
// or in one that affects the timeline only (timeline deletion).
|
||||
// Therefore, don't exit the loop.
|
||||
info!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}");
|
||||
} else {
|
||||
error!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}");
|
||||
}
|
||||
|
||||
wait_duration
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,6 +3,7 @@ pub(crate) mod compaction;
|
||||
pub mod delete;
|
||||
pub(crate) mod detach_ancestor;
|
||||
mod eviction_task;
|
||||
pub(crate) mod handle;
|
||||
mod init;
|
||||
pub mod layer_manager;
|
||||
pub(crate) mod logical_size;
|
||||
@@ -17,11 +18,12 @@ use camino::Utf8Path;
|
||||
use chrono::{DateTime, Utc};
|
||||
use enumset::EnumSet;
|
||||
use fail::fail_point;
|
||||
use handle::ShardTimelineId;
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::{
|
||||
key::{
|
||||
AUX_FILES_KEY, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX,
|
||||
NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE,
|
||||
KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE,
|
||||
NON_INHERITED_SPARSE_RANGE,
|
||||
},
|
||||
keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning},
|
||||
models::{
|
||||
@@ -57,10 +59,7 @@ use std::{
|
||||
collections::{BTreeMap, HashMap, HashSet},
|
||||
sync::atomic::AtomicU64,
|
||||
};
|
||||
use std::{
|
||||
cmp::{max, min, Ordering},
|
||||
ops::ControlFlow,
|
||||
};
|
||||
use std::{cmp::min, cmp::Ordering, ops::ControlFlow};
|
||||
use std::{
|
||||
collections::btree_map::Entry,
|
||||
ops::{Deref, Range},
|
||||
@@ -74,6 +73,7 @@ use crate::{
|
||||
metadata::TimelineMetadata,
|
||||
storage_layer::PersistentLayerDesc,
|
||||
},
|
||||
walredo,
|
||||
};
|
||||
use crate::{
|
||||
context::{DownloadBehavior, RequestContext},
|
||||
@@ -84,8 +84,8 @@ use crate::{
|
||||
disk_usage_eviction_task::finite_f32,
|
||||
tenant::storage_layer::{
|
||||
AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer,
|
||||
LayerAccessStatsReset, LayerName, ResidentLayer, ValueReconstructResult,
|
||||
ValueReconstructState, ValuesReconstructState,
|
||||
LayerAccessStatsReset, LayerName, ResidentLayer, ValueReconstructState,
|
||||
ValuesReconstructState,
|
||||
},
|
||||
};
|
||||
use crate::{
|
||||
@@ -137,10 +137,13 @@ use self::layer_manager::LayerManager;
|
||||
use self::logical_size::LogicalSize;
|
||||
use self::walreceiver::{WalReceiver, WalReceiverConf};
|
||||
|
||||
use super::{config::TenantConf, upload_queue::NotInitialized};
|
||||
use super::{config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized};
|
||||
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
|
||||
use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
|
||||
use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer};
|
||||
use super::{
|
||||
remote_timeline_client::RemoteTimelineClient, remote_timeline_client::WaitCompletionError,
|
||||
storage_layer::ReadableLayer,
|
||||
};
|
||||
use super::{
|
||||
secondary::heatmap::{HeatMapLayer, HeatMapTimeline},
|
||||
GcError,
|
||||
@@ -443,6 +446,8 @@ pub struct Timeline {
|
||||
pub(crate) extra_test_dense_keyspace: ArcSwap<KeySpace>,
|
||||
|
||||
pub(crate) l0_flush_global_state: L0FlushGlobalState,
|
||||
|
||||
pub(crate) handles: handle::PerTimelineState<crate::page_service::TenantManagerTypes>,
|
||||
}
|
||||
|
||||
pub struct WalReceiverInfo {
|
||||
@@ -548,17 +553,21 @@ impl GetVectoredError {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct MissingKeyError {
|
||||
key: Key,
|
||||
shard: ShardNumber,
|
||||
cont_lsn: Lsn,
|
||||
request_lsn: Lsn,
|
||||
ancestor_lsn: Option<Lsn>,
|
||||
traversal_path: Vec<TraversalPathItem>,
|
||||
backtrace: Option<std::backtrace::Backtrace>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for MissingKeyError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for MissingKeyError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
@@ -570,18 +579,6 @@ impl std::fmt::Display for MissingKeyError {
|
||||
write!(f, ", ancestor {}", ancestor_lsn)?;
|
||||
}
|
||||
|
||||
if !self.traversal_path.is_empty() {
|
||||
writeln!(f)?;
|
||||
}
|
||||
|
||||
for (r, c, l) in &self.traversal_path {
|
||||
writeln!(
|
||||
f,
|
||||
"layer traversal: result {:?}, cont_lsn {}, layer: {}",
|
||||
r, c, l,
|
||||
)?;
|
||||
}
|
||||
|
||||
if let Some(ref backtrace) = self.backtrace {
|
||||
write!(f, "\n{}", backtrace)?;
|
||||
}
|
||||
@@ -710,6 +707,7 @@ pub(crate) enum CompactFlags {
|
||||
ForceRepartition,
|
||||
ForceImageLayerCreation,
|
||||
EnhancedGcBottomMostCompaction,
|
||||
DryRun,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for Timeline {
|
||||
@@ -923,116 +921,44 @@ impl Timeline {
|
||||
|
||||
self.timeline_get_throttle.throttle(ctx, 1).await;
|
||||
|
||||
match self.conf.get_impl {
|
||||
GetImpl::Legacy => {
|
||||
let reconstruct_state = ValueReconstructState {
|
||||
records: Vec::new(),
|
||||
img: None,
|
||||
};
|
||||
let keyspace = KeySpace {
|
||||
ranges: vec![key..key.next()],
|
||||
};
|
||||
|
||||
self.get_impl(key, lsn, reconstruct_state, ctx).await
|
||||
}
|
||||
GetImpl::Vectored => {
|
||||
let keyspace = KeySpace {
|
||||
ranges: vec![key..key.next()],
|
||||
};
|
||||
// Initialise the reconstruct state for the key with the cache
|
||||
// entry returned above.
|
||||
let mut reconstruct_state = ValuesReconstructState::new();
|
||||
|
||||
// Initialise the reconstruct state for the key with the cache
|
||||
// entry returned above.
|
||||
let mut reconstruct_state = ValuesReconstructState::new();
|
||||
let vectored_res = self
|
||||
.get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
|
||||
.await;
|
||||
|
||||
let vectored_res = self
|
||||
.get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
|
||||
.await;
|
||||
|
||||
if self.conf.validate_vectored_get {
|
||||
self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx)
|
||||
.await;
|
||||
}
|
||||
|
||||
let key_value = vectored_res?.pop_first();
|
||||
match key_value {
|
||||
Some((got_key, value)) => {
|
||||
if got_key != key {
|
||||
error!(
|
||||
"Expected {}, but singular vectored get returned {}",
|
||||
key, got_key
|
||||
);
|
||||
Err(PageReconstructError::Other(anyhow!(
|
||||
"Singular vectored get returned wrong key"
|
||||
)))
|
||||
} else {
|
||||
value
|
||||
}
|
||||
}
|
||||
None => Err(PageReconstructError::MissingKey(MissingKeyError {
|
||||
key,
|
||||
shard: self.shard_identity.get_shard_number(&key),
|
||||
cont_lsn: Lsn(0),
|
||||
request_lsn: lsn,
|
||||
ancestor_lsn: None,
|
||||
traversal_path: Vec::new(),
|
||||
backtrace: None,
|
||||
})),
|
||||
let key_value = vectored_res?.pop_first();
|
||||
match key_value {
|
||||
Some((got_key, value)) => {
|
||||
if got_key != key {
|
||||
error!(
|
||||
"Expected {}, but singular vectored get returned {}",
|
||||
key, got_key
|
||||
);
|
||||
Err(PageReconstructError::Other(anyhow!(
|
||||
"Singular vectored get returned wrong key"
|
||||
)))
|
||||
} else {
|
||||
value
|
||||
}
|
||||
}
|
||||
None => Err(PageReconstructError::MissingKey(MissingKeyError {
|
||||
key,
|
||||
shard: self.shard_identity.get_shard_number(&key),
|
||||
cont_lsn: Lsn(0),
|
||||
request_lsn: lsn,
|
||||
ancestor_lsn: None,
|
||||
backtrace: None,
|
||||
})),
|
||||
}
|
||||
}
|
||||
|
||||
/// Not subject to [`Self::timeline_get_throttle`].
|
||||
async fn get_impl(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
mut reconstruct_state: ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
// XXX: structured stats collection for layer eviction here.
|
||||
trace!(
|
||||
"get page request for {}@{} from task kind {:?}",
|
||||
key,
|
||||
lsn,
|
||||
ctx.task_kind()
|
||||
);
|
||||
|
||||
let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME
|
||||
.for_get_kind(GetKind::Singular)
|
||||
.start_timer();
|
||||
let path = self
|
||||
.get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx)
|
||||
.await?;
|
||||
timer.stop_and_record();
|
||||
|
||||
let start = Instant::now();
|
||||
let res = self.reconstruct_value(key, lsn, reconstruct_state).await;
|
||||
let elapsed = start.elapsed();
|
||||
crate::metrics::RECONSTRUCT_TIME
|
||||
.for_get_kind(GetKind::Singular)
|
||||
.observe(elapsed.as_secs_f64());
|
||||
|
||||
if cfg!(feature = "testing") && res.is_err() {
|
||||
// it can only be walredo issue
|
||||
use std::fmt::Write;
|
||||
|
||||
let mut msg = String::new();
|
||||
|
||||
path.into_iter().for_each(|(res, cont_lsn, layer)| {
|
||||
writeln!(
|
||||
msg,
|
||||
"- layer traversal: result {res:?}, cont_lsn {cont_lsn}, layer: {}",
|
||||
layer,
|
||||
)
|
||||
.expect("string grows")
|
||||
});
|
||||
|
||||
// this is to rule out or provide evidence that we could in some cases read a duplicate
|
||||
// walrecord
|
||||
tracing::info!("walredo failed, path:\n{msg}");
|
||||
}
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32;
|
||||
pub(crate) const VEC_GET_LAYERS_VISITED_WARN_THRESH: f64 = 512.0;
|
||||
|
||||
@@ -1082,28 +1008,14 @@ impl Timeline {
|
||||
.throttle(ctx, key_count as usize)
|
||||
.await;
|
||||
|
||||
let res = match self.conf.get_vectored_impl {
|
||||
GetVectoredImpl::Sequential => {
|
||||
self.get_vectored_sequential_impl(keyspace, lsn, ctx).await
|
||||
}
|
||||
GetVectoredImpl::Vectored => {
|
||||
let vectored_res = self
|
||||
.get_vectored_impl(
|
||||
keyspace.clone(),
|
||||
lsn,
|
||||
&mut ValuesReconstructState::new(),
|
||||
ctx,
|
||||
)
|
||||
.await;
|
||||
|
||||
if self.conf.validate_vectored_get {
|
||||
self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx)
|
||||
.await;
|
||||
}
|
||||
|
||||
vectored_res
|
||||
}
|
||||
};
|
||||
let res = self
|
||||
.get_vectored_impl(
|
||||
keyspace.clone(),
|
||||
lsn,
|
||||
&mut ValuesReconstructState::new(),
|
||||
ctx,
|
||||
)
|
||||
.await;
|
||||
|
||||
if let Some((metric, start)) = start {
|
||||
let elapsed = start.elapsed();
|
||||
@@ -1192,65 +1104,6 @@ impl Timeline {
|
||||
vectored_res
|
||||
}
|
||||
|
||||
/// Not subject to [`Self::timeline_get_throttle`].
|
||||
pub(super) async fn get_vectored_sequential_impl(
|
||||
&self,
|
||||
keyspace: KeySpace,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
|
||||
let mut values = BTreeMap::new();
|
||||
|
||||
for range in keyspace.ranges {
|
||||
let mut key = range.start;
|
||||
while key != range.end {
|
||||
let block = self
|
||||
.get_impl(key, lsn, ValueReconstructState::default(), ctx)
|
||||
.await;
|
||||
|
||||
use PageReconstructError::*;
|
||||
match block {
|
||||
Err(Cancelled) => return Err(GetVectoredError::Cancelled),
|
||||
Err(MissingKey(_))
|
||||
if NON_INHERITED_RANGE.contains(&key)
|
||||
|| NON_INHERITED_SPARSE_RANGE.contains(&key) =>
|
||||
{
|
||||
// Ignore missing key error for aux key range. TODO: currently, we assume non_inherited_range == aux_key_range.
|
||||
// When we add more types of keys into the page server, we should revisit this part of code and throw errors
|
||||
// accordingly.
|
||||
key = key.next();
|
||||
}
|
||||
Err(MissingKey(err)) => {
|
||||
return Err(GetVectoredError::MissingKey(err));
|
||||
}
|
||||
Err(Other(err))
|
||||
if err
|
||||
.to_string()
|
||||
.contains("downloading evicted layer file failed") =>
|
||||
{
|
||||
return Err(GetVectoredError::Other(err))
|
||||
}
|
||||
Err(Other(err))
|
||||
if err
|
||||
.chain()
|
||||
.any(|cause| cause.to_string().contains("layer loading failed")) =>
|
||||
{
|
||||
// The intent here is to achieve error parity with the vectored read path.
|
||||
// When vectored read fails to load a layer it fails the whole read, hence
|
||||
// we mimic this behaviour here to keep the validation happy.
|
||||
return Err(GetVectoredError::Other(err));
|
||||
}
|
||||
_ => {
|
||||
values.insert(key, block);
|
||||
key = key.next();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(values)
|
||||
}
|
||||
|
||||
pub(super) async fn get_vectored_impl(
|
||||
&self,
|
||||
keyspace: KeySpace,
|
||||
@@ -1321,113 +1174,6 @@ impl Timeline {
|
||||
Ok(results)
|
||||
}
|
||||
|
||||
/// Not subject to [`Self::timeline_get_throttle`].
|
||||
pub(super) async fn validate_get_vectored_impl(
|
||||
&self,
|
||||
vectored_res: &Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError>,
|
||||
keyspace: KeySpace,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) {
|
||||
if keyspace.overlaps(&Key::metadata_key_range()) {
|
||||
// skip validation for metadata key range
|
||||
return;
|
||||
}
|
||||
|
||||
let sequential_res = self
|
||||
.get_vectored_sequential_impl(keyspace.clone(), lsn, ctx)
|
||||
.await;
|
||||
|
||||
fn errors_match(lhs: &GetVectoredError, rhs: &GetVectoredError) -> bool {
|
||||
use GetVectoredError::*;
|
||||
match (lhs, rhs) {
|
||||
(Oversized(l), Oversized(r)) => l == r,
|
||||
(InvalidLsn(l), InvalidLsn(r)) => l == r,
|
||||
(MissingKey(l), MissingKey(r)) => l.key == r.key,
|
||||
(GetReadyAncestorError(_), GetReadyAncestorError(_)) => true,
|
||||
(Other(_), Other(_)) => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
match (&sequential_res, vectored_res) {
|
||||
(Err(GetVectoredError::Cancelled), _) => {},
|
||||
(_, Err(GetVectoredError::Cancelled)) => {},
|
||||
(Err(seq_err), Ok(_)) => {
|
||||
panic!(concat!("Sequential get failed with {}, but vectored get did not",
|
||||
" - keyspace={:?} lsn={}"),
|
||||
seq_err, keyspace, lsn) },
|
||||
(Ok(_), Err(GetVectoredError::GetReadyAncestorError(GetReadyAncestorError::AncestorLsnTimeout(_)))) => {
|
||||
// Sequential get runs after vectored get, so it is possible for the later
|
||||
// to time out while waiting for its ancestor's Lsn to become ready and for the
|
||||
// former to succeed (it essentially has a doubled wait time).
|
||||
},
|
||||
(Ok(_), Err(vec_err)) => {
|
||||
panic!(concat!("Vectored get failed with {}, but sequential get did not",
|
||||
" - keyspace={:?} lsn={}"),
|
||||
vec_err, keyspace, lsn) },
|
||||
(Err(seq_err), Err(vec_err)) => {
|
||||
assert!(errors_match(seq_err, vec_err),
|
||||
"Mismatched errors: {seq_err} != {vec_err} - keyspace={keyspace:?} lsn={lsn}")},
|
||||
(Ok(seq_values), Ok(vec_values)) => {
|
||||
seq_values.iter().zip(vec_values.iter()).for_each(|((seq_key, seq_res), (vec_key, vec_res))| {
|
||||
assert_eq!(seq_key, vec_key);
|
||||
match (seq_res, vec_res) {
|
||||
(Ok(seq_blob), Ok(vec_blob)) => {
|
||||
Self::validate_key_equivalence(seq_key, &keyspace, lsn, seq_blob, vec_blob);
|
||||
},
|
||||
(Err(err), Ok(_)) => {
|
||||
panic!(
|
||||
concat!("Sequential get failed with {} for key {}, but vectored get did not",
|
||||
" - keyspace={:?} lsn={}"),
|
||||
err, seq_key, keyspace, lsn) },
|
||||
(Ok(_), Err(err)) => {
|
||||
panic!(
|
||||
concat!("Vectored get failed with {} for key {}, but sequential get did not",
|
||||
" - keyspace={:?} lsn={}"),
|
||||
err, seq_key, keyspace, lsn) },
|
||||
(Err(_), Err(_)) => {}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn validate_key_equivalence(
|
||||
key: &Key,
|
||||
keyspace: &KeySpace,
|
||||
lsn: Lsn,
|
||||
seq: &Bytes,
|
||||
vec: &Bytes,
|
||||
) {
|
||||
if *key == AUX_FILES_KEY {
|
||||
// The value reconstruct of AUX_FILES_KEY from records is not deterministic
|
||||
// since it uses a hash map under the hood. Hence, deserialise both results
|
||||
// before comparing.
|
||||
let seq_aux_dir_res = AuxFilesDirectory::des(seq);
|
||||
let vec_aux_dir_res = AuxFilesDirectory::des(vec);
|
||||
match (&seq_aux_dir_res, &vec_aux_dir_res) {
|
||||
(Ok(seq_aux_dir), Ok(vec_aux_dir)) => {
|
||||
assert_eq!(
|
||||
seq_aux_dir, vec_aux_dir,
|
||||
"Mismatch for key {} - keyspace={:?} lsn={}",
|
||||
key, keyspace, lsn
|
||||
);
|
||||
}
|
||||
(Err(_), Err(_)) => {}
|
||||
_ => {
|
||||
panic!("Mismatch for {key}: {seq_aux_dir_res:?} != {vec_aux_dir_res:?}");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// All other keys should reconstruct deterministically, so we simply compare the blobs.
|
||||
assert_eq!(
|
||||
seq, vec,
|
||||
"Image mismatch for key {key} - keyspace={keyspace:?} lsn={lsn}"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
|
||||
pub(crate) fn get_last_record_lsn(&self) -> Lsn {
|
||||
self.last_record_lsn.load().last
|
||||
@@ -1929,6 +1675,9 @@ impl Timeline {
|
||||
tracing::debug!("Cancelling CancellationToken");
|
||||
self.cancel.cancel();
|
||||
|
||||
// Ensure Prevent new page service requests from starting.
|
||||
self.handles.shutdown();
|
||||
|
||||
// Transition the remote_client into a state where it's only useful for timeline deletion.
|
||||
// (The deletion use case is why we can't just hook up remote_client to Self::cancel).)
|
||||
self.remote_client.stop();
|
||||
@@ -2454,6 +2203,8 @@ impl Timeline {
|
||||
extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())),
|
||||
|
||||
l0_flush_global_state: resources.l0_flush_global_state,
|
||||
|
||||
handles: Default::default(),
|
||||
};
|
||||
result.repartition_threshold =
|
||||
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
|
||||
@@ -2737,6 +2488,10 @@ impl Timeline {
|
||||
// Tenant::create_timeline will wait for these uploads to happen before returning, or
|
||||
// on retry.
|
||||
|
||||
// Now that we have the full layer map, we may calculate the visibility of layers within it (a global scan)
|
||||
drop(guard); // drop write lock, update_layer_visibility will take a read lock.
|
||||
self.update_layer_visibility().await;
|
||||
|
||||
info!(
|
||||
"loaded layer map with {} layers at {}, total physical size: {}",
|
||||
num_layers, disk_consistent_lsn, total_physical_size
|
||||
@@ -3183,14 +2938,22 @@ impl Timeline {
|
||||
|
||||
let guard = self.layers.read().await;
|
||||
|
||||
let resident = guard.likely_resident_layers().map(|layer| {
|
||||
let last_activity_ts = layer.access_stats().latest_activity();
|
||||
|
||||
HeatMapLayer::new(
|
||||
layer.layer_desc().layer_name(),
|
||||
layer.metadata(),
|
||||
last_activity_ts,
|
||||
)
|
||||
let resident = guard.likely_resident_layers().filter_map(|layer| {
|
||||
match layer.visibility() {
|
||||
LayerVisibilityHint::Visible => {
|
||||
// Layer is visible to one or more read LSNs: elegible for inclusion in layer map
|
||||
let last_activity_ts = layer.latest_activity();
|
||||
Some(HeatMapLayer::new(
|
||||
layer.layer_desc().layer_name(),
|
||||
layer.metadata(),
|
||||
last_activity_ts,
|
||||
))
|
||||
}
|
||||
LayerVisibilityHint::Covered => {
|
||||
// Layer is resident but unlikely to be read: not elegible for inclusion in heatmap.
|
||||
None
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
let layers = resident.collect();
|
||||
@@ -3208,228 +2971,7 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
type TraversalId = Arc<str>;
|
||||
|
||||
trait TraversalLayerExt {
|
||||
fn traversal_id(&self) -> TraversalId;
|
||||
}
|
||||
|
||||
impl TraversalLayerExt for Layer {
|
||||
fn traversal_id(&self) -> TraversalId {
|
||||
Arc::clone(self.debug_str())
|
||||
}
|
||||
}
|
||||
|
||||
impl TraversalLayerExt for Arc<InMemoryLayer> {
|
||||
fn traversal_id(&self) -> TraversalId {
|
||||
Arc::clone(self.local_path_str())
|
||||
}
|
||||
}
|
||||
|
||||
impl Timeline {
|
||||
///
|
||||
/// Get a handle to a Layer for reading.
|
||||
///
|
||||
/// The returned Layer might be from an ancestor timeline, if the
|
||||
/// segment hasn't been updated on this timeline yet.
|
||||
///
|
||||
/// This function takes the current timeline's locked LayerMap as an argument,
|
||||
/// so callers can avoid potential race conditions.
|
||||
///
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// This method is cancellation-safe.
|
||||
async fn get_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
request_lsn: Lsn,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Vec<TraversalPathItem>, PageReconstructError> {
|
||||
// Start from the current timeline.
|
||||
let mut timeline_owned;
|
||||
let mut timeline = self;
|
||||
|
||||
let mut read_count = scopeguard::guard(0, |cnt| {
|
||||
crate::metrics::READ_NUM_LAYERS_VISITED.observe(cnt as f64)
|
||||
});
|
||||
|
||||
// For debugging purposes, collect the path of layers that we traversed
|
||||
// through. It's included in the error message if we fail to find the key.
|
||||
let mut traversal_path = Vec::<TraversalPathItem>::new();
|
||||
|
||||
let cached_lsn = if let Some((cached_lsn, _)) = &reconstruct_state.img {
|
||||
*cached_lsn
|
||||
} else {
|
||||
Lsn(0)
|
||||
};
|
||||
|
||||
// 'prev_lsn' tracks the last LSN that we were at in our search. It's used
|
||||
// to check that each iteration make some progress, to break infinite
|
||||
// looping if something goes wrong.
|
||||
let mut prev_lsn = None;
|
||||
|
||||
let mut result = ValueReconstructResult::Continue;
|
||||
let mut cont_lsn = Lsn(request_lsn.0 + 1);
|
||||
|
||||
'outer: loop {
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(PageReconstructError::Cancelled);
|
||||
}
|
||||
|
||||
// The function should have updated 'state'
|
||||
//info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn);
|
||||
match result {
|
||||
ValueReconstructResult::Complete => return Ok(traversal_path),
|
||||
ValueReconstructResult::Continue => {
|
||||
// If we reached an earlier cached page image, we're done.
|
||||
if cont_lsn == cached_lsn + 1 {
|
||||
return Ok(traversal_path);
|
||||
}
|
||||
if let Some(prev) = prev_lsn {
|
||||
if prev <= cont_lsn {
|
||||
// Didn't make any progress in last iteration. Error out to avoid
|
||||
// getting stuck in the loop.
|
||||
return Err(PageReconstructError::MissingKey(MissingKeyError {
|
||||
key,
|
||||
shard: self.shard_identity.get_shard_number(&key),
|
||||
cont_lsn: Lsn(cont_lsn.0 - 1),
|
||||
request_lsn,
|
||||
ancestor_lsn: Some(timeline.ancestor_lsn),
|
||||
traversal_path,
|
||||
backtrace: None,
|
||||
}));
|
||||
}
|
||||
}
|
||||
prev_lsn = Some(cont_lsn);
|
||||
}
|
||||
ValueReconstructResult::Missing => {
|
||||
return Err(PageReconstructError::MissingKey(MissingKeyError {
|
||||
key,
|
||||
shard: self.shard_identity.get_shard_number(&key),
|
||||
cont_lsn,
|
||||
request_lsn,
|
||||
ancestor_lsn: None,
|
||||
traversal_path,
|
||||
backtrace: if cfg!(test) {
|
||||
Some(std::backtrace::Backtrace::force_capture())
|
||||
} else {
|
||||
None
|
||||
},
|
||||
}));
|
||||
}
|
||||
}
|
||||
|
||||
// Recurse into ancestor if needed
|
||||
if let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() {
|
||||
if key.is_inherited_key() && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn {
|
||||
trace!(
|
||||
"going into ancestor {}, cont_lsn is {}",
|
||||
timeline.ancestor_lsn,
|
||||
cont_lsn
|
||||
);
|
||||
|
||||
timeline_owned = timeline
|
||||
.get_ready_ancestor_timeline(ancestor_timeline, ctx)
|
||||
.await?;
|
||||
timeline = &*timeline_owned;
|
||||
prev_lsn = None;
|
||||
continue 'outer;
|
||||
}
|
||||
}
|
||||
|
||||
let guard = timeline.layers.read().await;
|
||||
let layers = guard.layer_map();
|
||||
|
||||
// Check the open and frozen in-memory layers first, in order from newest
|
||||
// to oldest.
|
||||
if let Some(open_layer) = &layers.open_layer {
|
||||
let start_lsn = open_layer.get_lsn_range().start;
|
||||
if cont_lsn > start_lsn {
|
||||
//info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.layer_name().display());
|
||||
// Get all the data needed to reconstruct the page version from this layer.
|
||||
// But if we have an older cached page image, no need to go past that.
|
||||
let lsn_floor = max(cached_lsn + 1, start_lsn);
|
||||
|
||||
let open_layer = open_layer.clone();
|
||||
drop(guard);
|
||||
|
||||
result = match open_layer
|
||||
.get_value_reconstruct_data(
|
||||
key,
|
||||
lsn_floor..cont_lsn,
|
||||
reconstruct_state,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(result) => result,
|
||||
Err(e) => return Err(PageReconstructError::from(e)),
|
||||
};
|
||||
cont_lsn = lsn_floor;
|
||||
*read_count += 1;
|
||||
traversal_path.push((result, cont_lsn, open_layer.traversal_id()));
|
||||
continue 'outer;
|
||||
}
|
||||
}
|
||||
for frozen_layer in layers.frozen_layers.iter().rev() {
|
||||
let start_lsn = frozen_layer.get_lsn_range().start;
|
||||
if cont_lsn > start_lsn {
|
||||
//info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.layer_name().display());
|
||||
let lsn_floor = max(cached_lsn + 1, start_lsn);
|
||||
|
||||
let frozen_layer = frozen_layer.clone();
|
||||
drop(guard);
|
||||
|
||||
result = match frozen_layer
|
||||
.get_value_reconstruct_data(
|
||||
key,
|
||||
lsn_floor..cont_lsn,
|
||||
reconstruct_state,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(result) => result,
|
||||
Err(e) => return Err(PageReconstructError::from(e)),
|
||||
};
|
||||
cont_lsn = lsn_floor;
|
||||
*read_count += 1;
|
||||
traversal_path.push((result, cont_lsn, frozen_layer.traversal_id()));
|
||||
continue 'outer;
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) {
|
||||
let layer = guard.get_from_desc(&layer);
|
||||
drop(guard);
|
||||
// Get all the data needed to reconstruct the page version from this layer.
|
||||
// But if we have an older cached page image, no need to go past that.
|
||||
let lsn_floor = max(cached_lsn + 1, lsn_floor);
|
||||
result = match layer
|
||||
.get_value_reconstruct_data(key, lsn_floor..cont_lsn, reconstruct_state, ctx)
|
||||
.await
|
||||
{
|
||||
Ok(result) => result,
|
||||
Err(e) => return Err(PageReconstructError::from(e)),
|
||||
};
|
||||
cont_lsn = lsn_floor;
|
||||
*read_count += 1;
|
||||
traversal_path.push((result, cont_lsn, layer.traversal_id()));
|
||||
continue 'outer;
|
||||
} else if timeline.ancestor_timeline.is_some() {
|
||||
// Nothing on this timeline. Traverse to parent
|
||||
result = ValueReconstructResult::Continue;
|
||||
cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1);
|
||||
continue 'outer;
|
||||
} else {
|
||||
// Nothing found
|
||||
result = ValueReconstructResult::Missing;
|
||||
continue 'outer;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::doc_lazy_continuation)]
|
||||
/// Get the data needed to reconstruct all keys in the provided keyspace
|
||||
///
|
||||
@@ -3523,7 +3065,6 @@ impl Timeline {
|
||||
cont_lsn,
|
||||
request_lsn,
|
||||
ancestor_lsn: Some(timeline.ancestor_lsn),
|
||||
traversal_path: vec![],
|
||||
backtrace: None,
|
||||
}));
|
||||
}
|
||||
@@ -3723,6 +3264,17 @@ impl Timeline {
|
||||
&self.shard_identity
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub(crate) fn shard_timeline_id(&self) -> ShardTimelineId {
|
||||
ShardTimelineId {
|
||||
shard_index: ShardIndex {
|
||||
shard_number: self.shard_identity.number,
|
||||
shard_count: self.shard_identity.count,
|
||||
},
|
||||
timeline_id: self.timeline_id,
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Get a handle to the latest layer for appending.
|
||||
///
|
||||
@@ -4075,6 +3627,21 @@ impl Timeline {
|
||||
// release lock on 'layers'
|
||||
};
|
||||
|
||||
// Backpressure mechanism: wait with continuation of the flush loop until we have uploaded all layer files.
|
||||
// This makes us refuse ingest until the new layers have been persisted to the remote.
|
||||
self.remote_client
|
||||
.wait_completion()
|
||||
.await
|
||||
.map_err(|e| match e {
|
||||
WaitCompletionError::UploadQueueShutDownOrStopped
|
||||
| WaitCompletionError::NotInitialized(
|
||||
NotInitialized::ShuttingDown | NotInitialized::Stopped,
|
||||
) => FlushLayerError::Cancelled,
|
||||
WaitCompletionError::NotInitialized(NotInitialized::Uninitialized) => {
|
||||
FlushLayerError::Other(anyhow!(e).into())
|
||||
}
|
||||
})?;
|
||||
|
||||
// FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`,
|
||||
// a compaction can delete the file and then it won't be available for uploads any more.
|
||||
// We still schedule the upload, resulting in an error, but ideally we'd somehow avoid this
|
||||
@@ -4090,17 +3657,11 @@ impl Timeline {
|
||||
|
||||
/// Return true if the value changed
|
||||
///
|
||||
/// This function must only be used from the layer flush task, and may not be called concurrently.
|
||||
/// This function must only be used from the layer flush task.
|
||||
fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool {
|
||||
// We do a simple load/store cycle: that's why this function isn't safe for concurrent use.
|
||||
let old_value = self.disk_consistent_lsn.load();
|
||||
if new_value != old_value {
|
||||
assert!(new_value >= old_value);
|
||||
self.disk_consistent_lsn.store(new_value);
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
let old_value = self.disk_consistent_lsn.fetch_max(new_value);
|
||||
assert!(new_value >= old_value, "disk_consistent_lsn must be growing monotonously at runtime; current {old_value}, offered {new_value}");
|
||||
new_value != old_value
|
||||
}
|
||||
|
||||
/// Update metadata file
|
||||
@@ -4167,12 +3728,14 @@ impl Timeline {
|
||||
let frozen_layer = Arc::clone(frozen_layer);
|
||||
let ctx = ctx.attached_child();
|
||||
let work = async move {
|
||||
let Some(new_delta) = frozen_layer
|
||||
.write_to_disk(&self_clone, &ctx, key_range)
|
||||
let Some((desc, path)) = frozen_layer
|
||||
.write_to_disk(&ctx, key_range, self_clone.l0_flush_global_state.inner())
|
||||
.await?
|
||||
else {
|
||||
return Ok(None);
|
||||
};
|
||||
let new_delta = Layer::finish_creating(self_clone.conf, &self_clone, desc, &path)?;
|
||||
|
||||
// The write_to_disk() above calls writer.finish() which already did the fsync of the inodes.
|
||||
// We just need to fsync the directory in which these inodes are linked,
|
||||
// which we know to be the timeline directory.
|
||||
@@ -4667,27 +4230,6 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
// The writer.finish() above already did the fsync of the inodes.
|
||||
// We just need to fsync the directory in which these inodes are linked,
|
||||
// which we know to be the timeline directory.
|
||||
if !image_layers.is_empty() {
|
||||
// We use fatal_err() below because the after writer.finish() returns with success,
|
||||
// the in-memory state of the filesystem already has the layer file in its final place,
|
||||
// and subsequent pageserver code could think it's durable while it really isn't.
|
||||
let timeline_dir = VirtualFile::open(
|
||||
&self
|
||||
.conf
|
||||
.timeline_path(&self.tenant_shard_id, &self.timeline_id),
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.fatal_err("VirtualFile::open for timeline dir fsync");
|
||||
timeline_dir
|
||||
.sync_all()
|
||||
.await
|
||||
.fatal_err("VirtualFile::sync_all timeline dir");
|
||||
}
|
||||
|
||||
let mut guard = self.layers.write().await;
|
||||
|
||||
// FIXME: we could add the images to be uploaded *before* returning from here, but right
|
||||
@@ -4696,6 +4238,9 @@ impl Timeline {
|
||||
drop_wlock(guard);
|
||||
timer.stop_and_record();
|
||||
|
||||
// Creating image layers may have caused some previously visible layers to be covered
|
||||
self.update_layer_visibility().await;
|
||||
|
||||
Ok(image_layers)
|
||||
}
|
||||
|
||||
@@ -4713,6 +4258,12 @@ impl Timeline {
|
||||
return;
|
||||
}
|
||||
|
||||
if self.current_logical_size.current_size().is_exact() {
|
||||
// root timelines are initialized with exact count, but never start the background
|
||||
// calculation
|
||||
return;
|
||||
}
|
||||
|
||||
if let Some(await_bg_cancel) = self
|
||||
.current_logical_size
|
||||
.cancel_wait_for_background_loop_concurrency_limit_semaphore
|
||||
@@ -5460,20 +5011,22 @@ impl Timeline {
|
||||
} else {
|
||||
trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn);
|
||||
};
|
||||
|
||||
let img = match self
|
||||
let res = self
|
||||
.walredo_mgr
|
||||
.as_ref()
|
||||
.context("timeline has no walredo manager")
|
||||
.map_err(PageReconstructError::WalRedo)?
|
||||
.request_redo(key, request_lsn, data.img, data.records, self.pg_version)
|
||||
.await
|
||||
.context("reconstruct a page image")
|
||||
{
|
||||
.await;
|
||||
let img = match res {
|
||||
Ok(img) => img,
|
||||
Err(e) => return Err(PageReconstructError::WalRedo(e)),
|
||||
Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled),
|
||||
Err(walredo::Error::Other(e)) => {
|
||||
return Err(PageReconstructError::WalRedo(
|
||||
e.context("reconstruct a page image"),
|
||||
))
|
||||
}
|
||||
};
|
||||
|
||||
Ok(img)
|
||||
}
|
||||
}
|
||||
@@ -5658,7 +5211,7 @@ impl Timeline {
|
||||
let file_size = layer.layer_desc().file_size;
|
||||
max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size)));
|
||||
|
||||
let last_activity_ts = layer.access_stats().latest_activity();
|
||||
let last_activity_ts = layer.latest_activity();
|
||||
|
||||
EvictionCandidate {
|
||||
layer: layer.into(),
|
||||
@@ -5681,6 +5234,22 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
/// Persistently blocks gc for `Manual` reason.
|
||||
///
|
||||
/// Returns true if no such block existed before, false otherwise.
|
||||
pub(crate) async fn block_gc(&self, tenant: &super::Tenant) -> anyhow::Result<bool> {
|
||||
use crate::tenant::remote_timeline_client::index::GcBlockingReason;
|
||||
assert_eq!(self.tenant_shard_id, tenant.tenant_shard_id);
|
||||
tenant.gc_block.insert(self, GcBlockingReason::Manual).await
|
||||
}
|
||||
|
||||
/// Persistently unblocks gc for `Manual` reason.
|
||||
pub(crate) async fn unblock_gc(&self, tenant: &super::Tenant) -> anyhow::Result<()> {
|
||||
use crate::tenant::remote_timeline_client::index::GcBlockingReason;
|
||||
assert_eq!(self.tenant_shard_id, tenant.tenant_shard_id);
|
||||
tenant.gc_block.remove(self, GcBlockingReason::Manual).await
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(super) fn force_advance_lsn(self: &Arc<Timeline>, new_lsn: Lsn) {
|
||||
self.last_record_lsn.advance(new_lsn);
|
||||
@@ -5799,9 +5368,8 @@ impl Timeline {
|
||||
for (key, lsn, val) in deltas.data {
|
||||
delta_layer_writer.put_value(key, lsn, val, ctx).await?;
|
||||
}
|
||||
let delta_layer = delta_layer_writer
|
||||
.finish(deltas.key_range.end, self, ctx)
|
||||
.await?;
|
||||
let (desc, path) = delta_layer_writer.finish(deltas.key_range.end, ctx).await?;
|
||||
let delta_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
|
||||
|
||||
{
|
||||
let mut guard = self.layers.write().await;
|
||||
@@ -5862,8 +5430,6 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId);
|
||||
|
||||
/// Tracking writes ingestion does to a particular in-memory layer.
|
||||
///
|
||||
/// Cleared upon freezing a layer.
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
//!
|
||||
//! The old legacy algorithm is implemented directly in `timeline.rs`.
|
||||
|
||||
use std::collections::BinaryHeap;
|
||||
use std::collections::{BinaryHeap, HashSet};
|
||||
use std::ops::{Deref, Range};
|
||||
use std::sync::Arc;
|
||||
|
||||
@@ -15,11 +15,14 @@ use super::{
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use bytes::Bytes;
|
||||
use enumset::EnumSet;
|
||||
use fail::fail_point;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::KEY_SIZE;
|
||||
use pageserver_api::keyspace::ShardedRange;
|
||||
use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
|
||||
use serde::Serialize;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, info, info_span, trace, warn, Instrument};
|
||||
use utils::id::TimelineId;
|
||||
@@ -29,7 +32,9 @@ use crate::page_cache;
|
||||
use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD};
|
||||
use crate::tenant::remote_timeline_client::WaitCompletionError;
|
||||
use crate::tenant::storage_layer::merge_iterator::MergeIterator;
|
||||
use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc, ValueReconstructState};
|
||||
use crate::tenant::storage_layer::{
|
||||
AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState,
|
||||
};
|
||||
use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter};
|
||||
use crate::tenant::timeline::{Hole, ImageLayerCreationOutcome};
|
||||
use crate::tenant::timeline::{Layer, ResidentLayer};
|
||||
@@ -38,6 +43,7 @@ use crate::virtual_file::{MaybeFatalIo, VirtualFile};
|
||||
|
||||
use crate::keyspace::KeySpace;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
@@ -69,37 +75,130 @@ impl KeyHistoryRetention {
|
||||
self,
|
||||
key: Key,
|
||||
delta_writer: &mut Vec<(Key, Lsn, Value)>,
|
||||
image_writer: &mut ImageLayerWriter,
|
||||
mut image_writer: Option<&mut ImageLayerWriter>,
|
||||
stat: &mut CompactionStatistics,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut first_batch = true;
|
||||
for (_, KeyLogAtLsn(logs)) in self.below_horizon {
|
||||
for (cutoff_lsn, KeyLogAtLsn(logs)) in self.below_horizon {
|
||||
if first_batch {
|
||||
if logs.len() == 1 && logs[0].1.is_image() {
|
||||
let Value::Image(img) = &logs[0].1 else {
|
||||
unreachable!()
|
||||
};
|
||||
image_writer.put_image(key, img.clone(), ctx).await?;
|
||||
stat.produce_image_key(img);
|
||||
if let Some(image_writer) = image_writer.as_mut() {
|
||||
image_writer.put_image(key, img.clone(), ctx).await?;
|
||||
} else {
|
||||
delta_writer.push((key, cutoff_lsn, Value::Image(img.clone())));
|
||||
}
|
||||
} else {
|
||||
for (lsn, val) in logs {
|
||||
stat.produce_key(&val);
|
||||
delta_writer.push((key, lsn, val));
|
||||
}
|
||||
}
|
||||
first_batch = false;
|
||||
} else {
|
||||
for (lsn, val) in logs {
|
||||
stat.produce_key(&val);
|
||||
delta_writer.push((key, lsn, val));
|
||||
}
|
||||
}
|
||||
}
|
||||
let KeyLogAtLsn(above_horizon_logs) = self.above_horizon;
|
||||
for (lsn, val) in above_horizon_logs {
|
||||
stat.produce_key(&val);
|
||||
delta_writer.push((key, lsn, val));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Default)]
|
||||
struct CompactionStatisticsNumSize {
|
||||
num: u64,
|
||||
size: u64,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Default)]
|
||||
pub struct CompactionStatistics {
|
||||
delta_layer_visited: CompactionStatisticsNumSize,
|
||||
image_layer_visited: CompactionStatisticsNumSize,
|
||||
delta_layer_produced: CompactionStatisticsNumSize,
|
||||
image_layer_produced: CompactionStatisticsNumSize,
|
||||
num_delta_layer_discarded: usize,
|
||||
num_image_layer_discarded: usize,
|
||||
num_unique_keys_visited: usize,
|
||||
wal_keys_visited: CompactionStatisticsNumSize,
|
||||
image_keys_visited: CompactionStatisticsNumSize,
|
||||
wal_produced: CompactionStatisticsNumSize,
|
||||
image_produced: CompactionStatisticsNumSize,
|
||||
}
|
||||
|
||||
impl CompactionStatistics {
|
||||
fn estimated_size_of_value(val: &Value) -> usize {
|
||||
match val {
|
||||
Value::Image(img) => img.len(),
|
||||
Value::WalRecord(NeonWalRecord::Postgres { rec, .. }) => rec.len(),
|
||||
_ => std::mem::size_of::<NeonWalRecord>(),
|
||||
}
|
||||
}
|
||||
fn estimated_size_of_key() -> usize {
|
||||
KEY_SIZE // TODO: distinguish image layer and delta layer (count LSN in delta layer)
|
||||
}
|
||||
fn visit_delta_layer(&mut self, size: u64) {
|
||||
self.delta_layer_visited.num += 1;
|
||||
self.delta_layer_visited.size += size;
|
||||
}
|
||||
fn visit_image_layer(&mut self, size: u64) {
|
||||
self.image_layer_visited.num += 1;
|
||||
self.image_layer_visited.size += size;
|
||||
}
|
||||
fn on_unique_key_visited(&mut self) {
|
||||
self.num_unique_keys_visited += 1;
|
||||
}
|
||||
fn visit_wal_key(&mut self, val: &Value) {
|
||||
self.wal_keys_visited.num += 1;
|
||||
self.wal_keys_visited.size +=
|
||||
Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64;
|
||||
}
|
||||
fn visit_image_key(&mut self, val: &Value) {
|
||||
self.image_keys_visited.num += 1;
|
||||
self.image_keys_visited.size +=
|
||||
Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64;
|
||||
}
|
||||
fn produce_key(&mut self, val: &Value) {
|
||||
match val {
|
||||
Value::Image(img) => self.produce_image_key(img),
|
||||
Value::WalRecord(_) => self.produce_wal_key(val),
|
||||
}
|
||||
}
|
||||
fn produce_wal_key(&mut self, val: &Value) {
|
||||
self.wal_produced.num += 1;
|
||||
self.wal_produced.size +=
|
||||
Self::estimated_size_of_value(val) as u64 + Self::estimated_size_of_key() as u64;
|
||||
}
|
||||
fn produce_image_key(&mut self, val: &Bytes) {
|
||||
self.image_produced.num += 1;
|
||||
self.image_produced.size += val.len() as u64 + Self::estimated_size_of_key() as u64;
|
||||
}
|
||||
fn discard_delta_layer(&mut self) {
|
||||
self.num_delta_layer_discarded += 1;
|
||||
}
|
||||
fn discard_image_layer(&mut self) {
|
||||
self.num_image_layer_discarded += 1;
|
||||
}
|
||||
fn produce_delta_layer(&mut self, size: u64) {
|
||||
self.delta_layer_produced.num += 1;
|
||||
self.delta_layer_produced.size += size;
|
||||
}
|
||||
fn produce_image_layer(&mut self, size: u64) {
|
||||
self.image_layer_produced.num += 1;
|
||||
self.image_layer_produced.size += size;
|
||||
}
|
||||
}
|
||||
|
||||
impl Timeline {
|
||||
/// TODO: cancellation
|
||||
///
|
||||
@@ -111,12 +210,18 @@ impl Timeline {
|
||||
ctx: &RequestContext,
|
||||
) -> Result<bool, CompactionError> {
|
||||
if flags.contains(CompactFlags::EnhancedGcBottomMostCompaction) {
|
||||
self.compact_with_gc(cancel, ctx)
|
||||
self.compact_with_gc(cancel, flags, ctx)
|
||||
.await
|
||||
.map_err(CompactionError::Other)?;
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
if flags.contains(CompactFlags::DryRun) {
|
||||
return Err(CompactionError::Other(anyhow!(
|
||||
"dry-run mode is not supported for legacy compaction for now"
|
||||
)));
|
||||
}
|
||||
|
||||
// High level strategy for compaction / image creation:
|
||||
//
|
||||
// 1. First, calculate the desired "partitioning" of the
|
||||
@@ -438,6 +543,45 @@ impl Timeline {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Update the LayerVisibilityHint of layers covered by image layers, based on whether there is
|
||||
/// an image layer between them and the most recent readable LSN (branch point or tip of timeline). The
|
||||
/// purpose of the visibility hint is to record which layers need to be available to service reads.
|
||||
///
|
||||
/// The result may be used as an input to eviction and secondary downloads to de-prioritize layers
|
||||
/// that we know won't be needed for reads.
|
||||
pub(super) async fn update_layer_visibility(&self) {
|
||||
let head_lsn = self.get_last_record_lsn();
|
||||
|
||||
// We will sweep through layers in reverse-LSN order. We only do historic layers. L0 deltas
|
||||
// are implicitly left visible, because LayerVisibilityHint's default is Visible, and we never modify it here.
|
||||
// Note that L0 deltas _can_ be covered by image layers, but we consider them 'visible' because we anticipate that
|
||||
// they will be subject to L0->L1 compaction in the near future.
|
||||
let layer_manager = self.layers.read().await;
|
||||
let layer_map = layer_manager.layer_map();
|
||||
|
||||
let readable_points = {
|
||||
let children = self.gc_info.read().unwrap().retain_lsns.clone();
|
||||
|
||||
let mut readable_points = Vec::with_capacity(children.len() + 1);
|
||||
for (child_lsn, _child_timeline_id) in &children {
|
||||
readable_points.push(*child_lsn);
|
||||
}
|
||||
readable_points.push(head_lsn);
|
||||
readable_points
|
||||
};
|
||||
|
||||
let (layer_visibility, covered) = layer_map.get_visibility(readable_points);
|
||||
for (layer_desc, visibility) in layer_visibility {
|
||||
// FIXME: a more efficiency bulk zip() through the layers rather than NlogN getting each one
|
||||
let layer = layer_manager.get_from_desc(&layer_desc);
|
||||
layer.set_visibility(visibility);
|
||||
}
|
||||
|
||||
// TODO: publish our covered KeySpace to our parent, so that when they update their visibility, they can
|
||||
// avoid assuming that everything at a branch point is visible.
|
||||
drop(covered);
|
||||
}
|
||||
|
||||
/// Collect a bunch of Level 0 layer files, and compact and reshuffle them as
|
||||
/// as Level 1 files. Returns whether the L0 layers are fully compacted.
|
||||
async fn compact_level0(
|
||||
@@ -795,14 +939,16 @@ impl Timeline {
|
||||
|| contains_hole
|
||||
{
|
||||
// ... if so, flush previous layer and prepare to write new one
|
||||
new_layers.push(
|
||||
writer
|
||||
.take()
|
||||
.unwrap()
|
||||
.finish(prev_key.unwrap().next(), self, ctx)
|
||||
.await
|
||||
.map_err(CompactionError::Other)?,
|
||||
);
|
||||
let (desc, path) = writer
|
||||
.take()
|
||||
.unwrap()
|
||||
.finish(prev_key.unwrap().next(), ctx)
|
||||
.await
|
||||
.map_err(CompactionError::Other)?;
|
||||
let new_delta = Layer::finish_creating(self.conf, self, desc, &path)
|
||||
.map_err(CompactionError::Other)?;
|
||||
|
||||
new_layers.push(new_delta);
|
||||
writer = None;
|
||||
|
||||
if contains_hole {
|
||||
@@ -865,12 +1011,13 @@ impl Timeline {
|
||||
prev_key = Some(key);
|
||||
}
|
||||
if let Some(writer) = writer {
|
||||
new_layers.push(
|
||||
writer
|
||||
.finish(prev_key.unwrap().next(), self, ctx)
|
||||
.await
|
||||
.map_err(CompactionError::Other)?,
|
||||
);
|
||||
let (desc, path) = writer
|
||||
.finish(prev_key.unwrap().next(), ctx)
|
||||
.await
|
||||
.map_err(CompactionError::Other)?;
|
||||
let new_delta = Layer::finish_creating(self.conf, self, desc, &path)
|
||||
.map_err(CompactionError::Other)?;
|
||||
new_layers.push(new_delta);
|
||||
}
|
||||
|
||||
// Sync layers
|
||||
@@ -1118,21 +1265,22 @@ impl Timeline {
|
||||
pub(crate) async fn generate_key_retention(
|
||||
self: &Arc<Timeline>,
|
||||
key: Key,
|
||||
history: &[(Key, Lsn, Value)],
|
||||
full_history: &[(Key, Lsn, Value)],
|
||||
horizon: Lsn,
|
||||
retain_lsn_below_horizon: &[Lsn],
|
||||
delta_threshold_cnt: usize,
|
||||
base_img_from_ancestor: Option<(Key, Lsn, Bytes)>,
|
||||
) -> anyhow::Result<KeyHistoryRetention> {
|
||||
// Pre-checks for the invariants
|
||||
if cfg!(debug_assertions) {
|
||||
for (log_key, _, _) in history {
|
||||
for (log_key, _, _) in full_history {
|
||||
assert_eq!(log_key, &key, "mismatched key");
|
||||
}
|
||||
for i in 1..history.len() {
|
||||
assert!(history[i - 1].1 <= history[i].1, "unordered LSN");
|
||||
if history[i - 1].1 == history[i].1 {
|
||||
for i in 1..full_history.len() {
|
||||
assert!(full_history[i - 1].1 <= full_history[i].1, "unordered LSN");
|
||||
if full_history[i - 1].1 == full_history[i].1 {
|
||||
assert!(
|
||||
matches!(history[i - 1].2, Value::Image(_)),
|
||||
matches!(full_history[i - 1].2, Value::Image(_)),
|
||||
"unordered delta/image, or duplicated delta"
|
||||
);
|
||||
}
|
||||
@@ -1151,6 +1299,7 @@ impl Timeline {
|
||||
);
|
||||
}
|
||||
}
|
||||
let has_ancestor = base_img_from_ancestor.is_some();
|
||||
// Step 1: split history into len(retain_lsn_below_horizon) + 2 buckets, where the last bucket is for all deltas above the horizon,
|
||||
// and the second-to-last bucket is for the horizon. Each bucket contains lsn_last_bucket < deltas <= lsn_this_bucket.
|
||||
let (mut split_history, lsn_split_points) = {
|
||||
@@ -1162,7 +1311,7 @@ impl Timeline {
|
||||
}
|
||||
lsn_split_points.push(horizon);
|
||||
let mut current_idx = 0;
|
||||
for item @ (_, lsn, _) in history {
|
||||
for item @ (_, lsn, _) in full_history {
|
||||
while current_idx < lsn_split_points.len() && *lsn > lsn_split_points[current_idx] {
|
||||
current_idx += 1;
|
||||
}
|
||||
@@ -1184,6 +1333,9 @@ impl Timeline {
|
||||
// For example, we have delta layer key1@0x10, key1@0x20, and image layer key1@0x10, we will
|
||||
// keep the image for key1@0x10 and the delta for key1@0x20. key1@0x10 delta will be simply
|
||||
// dropped.
|
||||
//
|
||||
// TODO: in case we have both delta + images for a given LSN and it does not exceed the delta
|
||||
// threshold, we could have kept delta instead to save space. This is an optimization for the future.
|
||||
continue;
|
||||
}
|
||||
}
|
||||
@@ -1201,9 +1353,75 @@ impl Timeline {
|
||||
"should have at least below + above horizon batches"
|
||||
);
|
||||
let mut replay_history: Vec<(Key, Lsn, Value)> = Vec::new();
|
||||
if let Some((key, lsn, img)) = base_img_from_ancestor {
|
||||
replay_history.push((key, lsn, Value::Image(img)));
|
||||
}
|
||||
|
||||
/// Generate debug information for the replay history
|
||||
fn generate_history_trace(replay_history: &[(Key, Lsn, Value)]) -> String {
|
||||
use std::fmt::Write;
|
||||
let mut output = String::new();
|
||||
if let Some((key, _, _)) = replay_history.first() {
|
||||
write!(output, "key={} ", key).unwrap();
|
||||
let mut cnt = 0;
|
||||
for (_, lsn, val) in replay_history {
|
||||
if val.is_image() {
|
||||
write!(output, "i@{} ", lsn).unwrap();
|
||||
} else if val.will_init() {
|
||||
write!(output, "di@{} ", lsn).unwrap();
|
||||
} else {
|
||||
write!(output, "d@{} ", lsn).unwrap();
|
||||
}
|
||||
cnt += 1;
|
||||
if cnt >= 128 {
|
||||
write!(output, "... and more").unwrap();
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
write!(output, "<no history>").unwrap();
|
||||
}
|
||||
output
|
||||
}
|
||||
|
||||
fn generate_debug_trace(
|
||||
replay_history: Option<&[(Key, Lsn, Value)]>,
|
||||
full_history: &[(Key, Lsn, Value)],
|
||||
lsns: &[Lsn],
|
||||
horizon: Lsn,
|
||||
) -> String {
|
||||
use std::fmt::Write;
|
||||
let mut output = String::new();
|
||||
if let Some(replay_history) = replay_history {
|
||||
writeln!(
|
||||
output,
|
||||
"replay_history: {}",
|
||||
generate_history_trace(replay_history)
|
||||
)
|
||||
.unwrap();
|
||||
} else {
|
||||
writeln!(output, "replay_history: <disabled>",).unwrap();
|
||||
}
|
||||
writeln!(
|
||||
output,
|
||||
"full_history: {}",
|
||||
generate_history_trace(full_history)
|
||||
)
|
||||
.unwrap();
|
||||
writeln!(
|
||||
output,
|
||||
"when processing: [{}] horizon={}",
|
||||
lsns.iter().map(|l| format!("{l}")).join(","),
|
||||
horizon
|
||||
)
|
||||
.unwrap();
|
||||
output
|
||||
}
|
||||
|
||||
for (i, split_for_lsn) in split_history.into_iter().enumerate() {
|
||||
// TODO: there could be image keys inside the splits, and we can compute records_since_last_image accordingly.
|
||||
records_since_last_image += split_for_lsn.len();
|
||||
let generate_image = if i == 0 {
|
||||
let generate_image = if i == 0 && !has_ancestor {
|
||||
// We always generate images for the first batch (below horizon / lowest retain_lsn)
|
||||
true
|
||||
} else if i == batch_cnt - 1 {
|
||||
@@ -1224,10 +1442,27 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
if let Some((_, _, val)) = replay_history.first() {
|
||||
assert!(val.will_init(), "invalid history, no base image");
|
||||
if !val.will_init() {
|
||||
return Err(anyhow::anyhow!("invalid history, no base image")).with_context(
|
||||
|| {
|
||||
generate_debug_trace(
|
||||
Some(&replay_history),
|
||||
full_history,
|
||||
retain_lsn_below_horizon,
|
||||
horizon,
|
||||
)
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
if generate_image && records_since_last_image > 0 {
|
||||
records_since_last_image = 0;
|
||||
let replay_history_for_debug = if cfg!(debug_assertions) {
|
||||
Some(replay_history.clone())
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let replay_history_for_debug_ref = replay_history_for_debug.as_deref();
|
||||
let history = std::mem::take(&mut replay_history);
|
||||
let mut img = None;
|
||||
let mut records = Vec::with_capacity(history.len());
|
||||
@@ -1235,14 +1470,30 @@ impl Timeline {
|
||||
img = Some((*lsn, val.clone()));
|
||||
for (_, lsn, val) in history.into_iter().skip(1) {
|
||||
let Value::WalRecord(rec) = val else {
|
||||
panic!("invalid record")
|
||||
return Err(anyhow::anyhow!(
|
||||
"invalid record, first record is image, expect walrecords"
|
||||
))
|
||||
.with_context(|| {
|
||||
generate_debug_trace(
|
||||
replay_history_for_debug_ref,
|
||||
full_history,
|
||||
retain_lsn_below_horizon,
|
||||
horizon,
|
||||
)
|
||||
});
|
||||
};
|
||||
records.push((lsn, rec));
|
||||
}
|
||||
} else {
|
||||
for (_, lsn, val) in history.into_iter() {
|
||||
let Value::WalRecord(rec) = val else {
|
||||
panic!("invalid record")
|
||||
return Err(anyhow::anyhow!("invalid record, first record is walrecord, expect rest are walrecord"))
|
||||
.with_context(|| generate_debug_trace(
|
||||
replay_history_for_debug_ref,
|
||||
full_history,
|
||||
retain_lsn_below_horizon,
|
||||
horizon,
|
||||
));
|
||||
};
|
||||
records.push((lsn, rec));
|
||||
}
|
||||
@@ -1254,12 +1505,11 @@ impl Timeline {
|
||||
replay_history.push((key, request_lsn, Value::Image(img.clone())));
|
||||
retention.push(vec![(request_lsn, Value::Image(img))]);
|
||||
} else {
|
||||
retention.push(
|
||||
split_for_lsn
|
||||
.iter()
|
||||
.map(|(_, lsn, value)| (*lsn, value.clone()))
|
||||
.collect(),
|
||||
);
|
||||
let deltas = split_for_lsn
|
||||
.iter()
|
||||
.map(|(_, lsn, value)| (*lsn, value.clone()))
|
||||
.collect_vec();
|
||||
retention.push(deltas);
|
||||
}
|
||||
}
|
||||
let mut result = Vec::with_capacity(retention.len());
|
||||
@@ -1274,7 +1524,7 @@ impl Timeline {
|
||||
result.push((lsn_split_points[idx], KeyLogAtLsn(logs)));
|
||||
}
|
||||
}
|
||||
unreachable!()
|
||||
unreachable!("key retention is empty")
|
||||
}
|
||||
|
||||
/// An experimental compaction building block that combines compaction with garbage collection.
|
||||
@@ -1285,17 +1535,41 @@ impl Timeline {
|
||||
/// and create delta layers with all deltas >= gc horizon.
|
||||
pub(crate) async fn compact_with_gc(
|
||||
self: &Arc<Self>,
|
||||
_cancel: &CancellationToken,
|
||||
cancel: &CancellationToken,
|
||||
flags: EnumSet<CompactFlags>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
info!("running enhanced gc bottom-most compaction");
|
||||
// Block other compaction/GC tasks from running for now. GC-compaction could run along
|
||||
// with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc.
|
||||
// Note that we already acquired the compaction lock when the outer `compact` function gets called.
|
||||
|
||||
let gc_lock = async {
|
||||
tokio::select! {
|
||||
guard = self.gc_lock.lock() => Ok(guard),
|
||||
// TODO: refactor to CompactionError to correctly pass cancelled error
|
||||
_ = cancel.cancelled() => Err(anyhow!("cancelled")),
|
||||
}
|
||||
};
|
||||
|
||||
let gc_lock = crate::timed(
|
||||
gc_lock,
|
||||
"acquires gc lock",
|
||||
std::time::Duration::from_secs(5),
|
||||
)
|
||||
.await?;
|
||||
|
||||
let dry_run = flags.contains(CompactFlags::DryRun);
|
||||
|
||||
info!("running enhanced gc bottom-most compaction, dry_run={dry_run}");
|
||||
|
||||
scopeguard::defer! {
|
||||
info!("done enhanced gc bottom-most compaction");
|
||||
};
|
||||
|
||||
let mut stat = CompactionStatistics::default();
|
||||
|
||||
// Step 0: pick all delta layers + image layers below/intersect with the GC horizon.
|
||||
// The layer selection has the following properties:
|
||||
// 1. If a layer is in the selection, all layers below it are in the selection.
|
||||
@@ -1326,20 +1600,25 @@ impl Timeline {
|
||||
retain_lsns_below_horizon.sort();
|
||||
(selected_layers, gc_cutoff, retain_lsns_below_horizon)
|
||||
};
|
||||
let lowest_retain_lsn = retain_lsns_below_horizon
|
||||
.first()
|
||||
.copied()
|
||||
.unwrap_or(gc_cutoff);
|
||||
if cfg!(debug_assertions) {
|
||||
assert_eq!(
|
||||
lowest_retain_lsn,
|
||||
retain_lsns_below_horizon
|
||||
.iter()
|
||||
.min()
|
||||
.copied()
|
||||
.unwrap_or(gc_cutoff)
|
||||
);
|
||||
}
|
||||
let lowest_retain_lsn = if self.ancestor_timeline.is_some() {
|
||||
Lsn(self.ancestor_lsn.0 + 1)
|
||||
} else {
|
||||
let res = retain_lsns_below_horizon
|
||||
.first()
|
||||
.copied()
|
||||
.unwrap_or(gc_cutoff);
|
||||
if cfg!(debug_assertions) {
|
||||
assert_eq!(
|
||||
res,
|
||||
retain_lsns_below_horizon
|
||||
.iter()
|
||||
.min()
|
||||
.copied()
|
||||
.unwrap_or(gc_cutoff)
|
||||
);
|
||||
}
|
||||
res
|
||||
};
|
||||
info!(
|
||||
"picked {} layers for compaction with gc_cutoff={} lowest_retain_lsn={}",
|
||||
layer_selection.len(),
|
||||
@@ -1361,6 +1640,9 @@ impl Timeline {
|
||||
let key_range = desc.get_key_range();
|
||||
delta_split_points.insert(key_range.start);
|
||||
delta_split_points.insert(key_range.end);
|
||||
stat.visit_delta_layer(desc.file_size());
|
||||
} else {
|
||||
stat.visit_image_layer(desc.file_size());
|
||||
}
|
||||
}
|
||||
let mut delta_layers = Vec::new();
|
||||
@@ -1380,6 +1662,14 @@ impl Timeline {
|
||||
let mut accumulated_values = Vec::new();
|
||||
let mut last_key: Option<Key> = None;
|
||||
|
||||
enum FlushDeltaResult {
|
||||
/// Create a new resident layer
|
||||
CreateResidentLayer(ResidentLayer),
|
||||
/// Keep an original delta layer
|
||||
KeepLayer(PersistentLayerKey),
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn flush_deltas(
|
||||
deltas: &mut Vec<(Key, Lsn, crate::repository::Value)>,
|
||||
last_key: Key,
|
||||
@@ -1388,7 +1678,10 @@ impl Timeline {
|
||||
tline: &Arc<Timeline>,
|
||||
lowest_retain_lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Option<ResidentLayer>> {
|
||||
stats: &mut CompactionStatistics,
|
||||
dry_run: bool,
|
||||
last_batch: bool,
|
||||
) -> anyhow::Result<Option<FlushDeltaResult>> {
|
||||
// Check if we need to split the delta layer. We split at the original delta layer boundary to avoid
|
||||
// overlapping layers.
|
||||
//
|
||||
@@ -1408,46 +1701,176 @@ impl Timeline {
|
||||
*current_delta_split_point += 1;
|
||||
need_split = true;
|
||||
}
|
||||
if !need_split {
|
||||
if !need_split && !last_batch {
|
||||
return Ok(None);
|
||||
}
|
||||
let deltas = std::mem::take(deltas);
|
||||
let deltas: Vec<(Key, Lsn, Value)> = std::mem::take(deltas);
|
||||
if deltas.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
let end_lsn = deltas.iter().map(|(_, lsn, _)| lsn).max().copied().unwrap() + 1;
|
||||
let delta_key = PersistentLayerKey {
|
||||
key_range: {
|
||||
let key_start = deltas.first().unwrap().0;
|
||||
let key_end = deltas.last().unwrap().0.next();
|
||||
key_start..key_end
|
||||
},
|
||||
lsn_range: lowest_retain_lsn..end_lsn,
|
||||
is_delta: true,
|
||||
};
|
||||
{
|
||||
// Hack: skip delta layer if we need to produce a layer of a same key-lsn.
|
||||
//
|
||||
// This can happen if we have removed some deltas in "the middle" of some existing layer's key-lsn-range.
|
||||
// For example, consider the case where a single delta with range [0x10,0x50) exists.
|
||||
// And we have branches at LSN 0x10, 0x20, 0x30.
|
||||
// Then we delete branch @ 0x20.
|
||||
// Bottom-most compaction may now delete the delta [0x20,0x30).
|
||||
// And that wouldnt' change the shape of the layer.
|
||||
//
|
||||
// Note that bottom-most-gc-compaction never _adds_ new data in that case, only removes.
|
||||
// That's why it's safe to skip.
|
||||
let guard = tline.layers.read().await;
|
||||
|
||||
if guard.contains_key(&delta_key) {
|
||||
let layer_generation = guard.get_from_key(&delta_key).metadata().generation;
|
||||
drop(guard);
|
||||
if layer_generation == tline.generation {
|
||||
stats.discard_delta_layer();
|
||||
// TODO: depending on whether we design this compaction process to run along with
|
||||
// other compactions, there could be layer map modifications after we drop the
|
||||
// layer guard, and in case it creates duplicated layer key, we will still error
|
||||
// in the end.
|
||||
info!(
|
||||
key=%delta_key,
|
||||
?layer_generation,
|
||||
"discard delta layer due to duplicated layer in the same generation"
|
||||
);
|
||||
return Ok(Some(FlushDeltaResult::KeepLayer(delta_key)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut delta_layer_writer = DeltaLayerWriter::new(
|
||||
tline.conf,
|
||||
tline.timeline_id,
|
||||
tline.tenant_shard_id,
|
||||
deltas.first().unwrap().0,
|
||||
delta_key.key_range.start,
|
||||
lowest_retain_lsn..end_lsn,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
let key_end = deltas.last().unwrap().0.next();
|
||||
for (key, lsn, val) in deltas {
|
||||
delta_layer_writer.put_value(key, lsn, val, ctx).await?;
|
||||
}
|
||||
let delta_layer = delta_layer_writer.finish(key_end, tline, ctx).await?;
|
||||
Ok(Some(delta_layer))
|
||||
|
||||
stats.produce_delta_layer(delta_layer_writer.size());
|
||||
if dry_run {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let (desc, path) = delta_layer_writer
|
||||
.finish(delta_key.key_range.end, ctx)
|
||||
.await?;
|
||||
let delta_layer = Layer::finish_creating(tline.conf, tline, desc, &path)?;
|
||||
Ok(Some(FlushDeltaResult::CreateResidentLayer(delta_layer)))
|
||||
}
|
||||
|
||||
let mut image_layer_writer = ImageLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
&(Key::MIN..Key::MAX), // covers the full key range
|
||||
lowest_retain_lsn,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
// Hack the key range to be min..(max-1). Otherwise, the image layer will be
|
||||
// interpreted as an L0 delta layer.
|
||||
let hack_image_layer_range = {
|
||||
let mut end_key = Key::MAX;
|
||||
end_key.field6 -= 1;
|
||||
Key::MIN..end_key
|
||||
};
|
||||
|
||||
// Only create image layers when there is no ancestor branches. TODO: create covering image layer
|
||||
// when some condition meet.
|
||||
let mut image_layer_writer = if self.ancestor_timeline.is_none() {
|
||||
Some(
|
||||
ImageLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
&hack_image_layer_range, // covers the full key range
|
||||
lowest_retain_lsn,
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
/// Returns None if there is no ancestor branch. Throw an error when the key is not found.
|
||||
///
|
||||
/// Currently, we always get the ancestor image for each key in the child branch no matter whether the image
|
||||
/// is needed for reconstruction. This should be fixed in the future.
|
||||
///
|
||||
/// Furthermore, we should do vectored get instead of a single get, or better, use k-merge for ancestor
|
||||
/// images.
|
||||
async fn get_ancestor_image(
|
||||
tline: &Arc<Timeline>,
|
||||
key: Key,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Option<(Key, Lsn, Bytes)>> {
|
||||
if tline.ancestor_timeline.is_none() {
|
||||
return Ok(None);
|
||||
};
|
||||
// This function is implemented as a get of the current timeline at ancestor LSN, therefore reusing
|
||||
// as much existing code as possible.
|
||||
let img = tline.get(key, tline.ancestor_lsn, ctx).await?;
|
||||
Ok(Some((key, tline.ancestor_lsn, img)))
|
||||
}
|
||||
let image_layer_key = PersistentLayerKey {
|
||||
key_range: hack_image_layer_range,
|
||||
lsn_range: PersistentLayerDesc::image_layer_lsn_range(lowest_retain_lsn),
|
||||
is_delta: false,
|
||||
};
|
||||
|
||||
// Like with delta layers, it can happen that we re-produce an already existing image layer.
|
||||
// This could happen when a user triggers force compaction and image generation. In this case,
|
||||
// it's always safe to rewrite the layer.
|
||||
let discard_image_layer = {
|
||||
let guard = self.layers.read().await;
|
||||
if guard.contains_key(&image_layer_key) {
|
||||
let layer_generation = guard.get_from_key(&image_layer_key).metadata().generation;
|
||||
drop(guard);
|
||||
if layer_generation == self.generation {
|
||||
// TODO: depending on whether we design this compaction process to run along with
|
||||
// other compactions, there could be layer map modifications after we drop the
|
||||
// layer guard, and in case it creates duplicated layer key, we will still error
|
||||
// in the end.
|
||||
info!(
|
||||
key=%image_layer_key,
|
||||
?layer_generation,
|
||||
"discard image layer due to duplicated layer key in the same generation",
|
||||
);
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
} else {
|
||||
false
|
||||
}
|
||||
};
|
||||
|
||||
// Actually, we can decide not to write to the image layer at all at this point because
|
||||
// the key and LSN range are determined. However, to keep things simple here, we still
|
||||
// create this writer, and discard the writer in the end.
|
||||
|
||||
let mut delta_values = Vec::new();
|
||||
let delta_split_points = delta_split_points.into_iter().collect_vec();
|
||||
let mut current_delta_split_point = 0;
|
||||
let mut delta_layers = Vec::new();
|
||||
while let Some((key, lsn, val)) = merge_iter.next().await? {
|
||||
if cancel.is_cancelled() {
|
||||
return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error
|
||||
}
|
||||
match val {
|
||||
Value::Image(_) => stat.visit_image_key(&val),
|
||||
Value::WalRecord(_) => stat.visit_wal_key(&val),
|
||||
}
|
||||
if last_key.is_none() || last_key.as_ref() == Some(&key) {
|
||||
if last_key.is_none() {
|
||||
last_key = Some(key);
|
||||
@@ -1455,6 +1878,7 @@ impl Timeline {
|
||||
accumulated_values.push((key, lsn, val));
|
||||
} else {
|
||||
let last_key = last_key.as_mut().unwrap();
|
||||
stat.on_unique_key_visited();
|
||||
let retention = self
|
||||
.generate_key_retention(
|
||||
*last_key,
|
||||
@@ -1462,11 +1886,18 @@ impl Timeline {
|
||||
gc_cutoff,
|
||||
&retain_lsns_below_horizon,
|
||||
COMPACTION_DELTA_THRESHOLD,
|
||||
get_ancestor_image(self, *last_key, ctx).await?,
|
||||
)
|
||||
.await?;
|
||||
// Put the image into the image layer. Currently we have a single big layer for the compaction.
|
||||
retention
|
||||
.pipe_to(*last_key, &mut delta_values, &mut image_layer_writer, ctx)
|
||||
.pipe_to(
|
||||
*last_key,
|
||||
&mut delta_values,
|
||||
image_layer_writer.as_mut(),
|
||||
&mut stat,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
delta_layers.extend(
|
||||
flush_deltas(
|
||||
@@ -1477,6 +1908,9 @@ impl Timeline {
|
||||
self,
|
||||
lowest_retain_lsn,
|
||||
ctx,
|
||||
&mut stat,
|
||||
dry_run,
|
||||
false,
|
||||
)
|
||||
.await?,
|
||||
);
|
||||
@@ -1488,6 +1922,7 @@ impl Timeline {
|
||||
|
||||
let last_key = last_key.expect("no keys produced during compaction");
|
||||
// TODO: move this part to the loop body
|
||||
stat.on_unique_key_visited();
|
||||
let retention = self
|
||||
.generate_key_retention(
|
||||
last_key,
|
||||
@@ -1495,11 +1930,18 @@ impl Timeline {
|
||||
gc_cutoff,
|
||||
&retain_lsns_below_horizon,
|
||||
COMPACTION_DELTA_THRESHOLD,
|
||||
get_ancestor_image(self, last_key, ctx).await?,
|
||||
)
|
||||
.await?;
|
||||
// Put the image into the image layer. Currently we have a single big layer for the compaction.
|
||||
retention
|
||||
.pipe_to(last_key, &mut delta_values, &mut image_layer_writer, ctx)
|
||||
.pipe_to(
|
||||
last_key,
|
||||
&mut delta_values,
|
||||
image_layer_writer.as_mut(),
|
||||
&mut stat,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
delta_layers.extend(
|
||||
flush_deltas(
|
||||
@@ -1510,27 +1952,71 @@ impl Timeline {
|
||||
self,
|
||||
lowest_retain_lsn,
|
||||
ctx,
|
||||
&mut stat,
|
||||
dry_run,
|
||||
true,
|
||||
)
|
||||
.await?,
|
||||
);
|
||||
assert!(delta_values.is_empty(), "unprocessed keys");
|
||||
|
||||
let image_layer = if discard_image_layer {
|
||||
stat.discard_image_layer();
|
||||
None
|
||||
} else if let Some(writer) = image_layer_writer {
|
||||
stat.produce_image_layer(writer.size());
|
||||
if !dry_run {
|
||||
Some(writer.finish(self, ctx).await?)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
info!(
|
||||
"gc-compaction statistics: {}",
|
||||
serde_json::to_string(&stat)?
|
||||
);
|
||||
|
||||
if dry_run {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let image_layer = image_layer_writer.finish(self, ctx).await?;
|
||||
info!(
|
||||
"produced {} delta layers and {} image layers",
|
||||
delta_layers.len(),
|
||||
1
|
||||
if image_layer.is_some() { 1 } else { 0 }
|
||||
);
|
||||
let mut compact_to = Vec::new();
|
||||
compact_to.extend(delta_layers);
|
||||
compact_to.push(image_layer);
|
||||
let mut keep_layers = HashSet::new();
|
||||
for action in delta_layers {
|
||||
match action {
|
||||
FlushDeltaResult::CreateResidentLayer(layer) => {
|
||||
compact_to.push(layer);
|
||||
}
|
||||
FlushDeltaResult::KeepLayer(l) => {
|
||||
keep_layers.insert(l);
|
||||
}
|
||||
}
|
||||
}
|
||||
if discard_image_layer {
|
||||
keep_layers.insert(image_layer_key);
|
||||
}
|
||||
let mut layer_selection = layer_selection;
|
||||
layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key()));
|
||||
compact_to.extend(image_layer);
|
||||
|
||||
// Step 3: Place back to the layer map.
|
||||
{
|
||||
let mut guard = self.layers.write().await;
|
||||
guard.finish_gc_compaction(&layer_selection, &compact_to, &self.metrics)
|
||||
};
|
||||
|
||||
self.remote_client
|
||||
.schedule_compaction_update(&layer_selection, &compact_to)?;
|
||||
|
||||
drop(gc_lock);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -1727,9 +2213,9 @@ impl CompactionJobExecutor for TimelineAdaptor {
|
||||
))
|
||||
});
|
||||
|
||||
let new_delta_layer = writer
|
||||
.finish(prev.unwrap().0.next(), &self.timeline, ctx)
|
||||
.await?;
|
||||
let (desc, path) = writer.finish(prev.unwrap().0.next(), ctx).await?;
|
||||
let new_delta_layer =
|
||||
Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?;
|
||||
|
||||
self.new_deltas.push(new_delta_layer);
|
||||
Ok(())
|
||||
|
||||
@@ -63,10 +63,19 @@ pub(super) async fn delete_local_timeline_directory(
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline: &Timeline,
|
||||
) -> anyhow::Result<()> {
|
||||
let guards = async { tokio::join!(timeline.gc_lock.lock(), timeline.compaction_lock.lock()) };
|
||||
let guards = crate::timed(
|
||||
guards,
|
||||
"acquire gc and compaction locks",
|
||||
// Always ensure the lock order is compaction -> gc.
|
||||
let compaction_lock = timeline.compaction_lock.lock();
|
||||
let compaction_lock = crate::timed(
|
||||
compaction_lock,
|
||||
"acquires compaction lock",
|
||||
std::time::Duration::from_secs(5),
|
||||
)
|
||||
.await;
|
||||
|
||||
let gc_lock = timeline.gc_lock.lock();
|
||||
let gc_lock = crate::timed(
|
||||
gc_lock,
|
||||
"acquires gc lock",
|
||||
std::time::Duration::from_secs(5),
|
||||
)
|
||||
.await;
|
||||
@@ -107,7 +116,8 @@ pub(super) async fn delete_local_timeline_directory(
|
||||
.context("fsync_pre_mark_remove")?;
|
||||
|
||||
info!("finished deleting layer files, releasing locks");
|
||||
drop(guards);
|
||||
drop(gc_lock);
|
||||
drop(compaction_lock);
|
||||
|
||||
fail::fail_point!("timeline-delete-after-rm", |_| {
|
||||
Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
|
||||
@@ -206,11 +216,10 @@ impl DeleteTimelineFlow {
|
||||
// NB: If this fails half-way through, and is retried, the retry will go through
|
||||
// all the same steps again. Make sure the code here is idempotent, and don't
|
||||
// error out if some of the shutdown tasks have already been completed!
|
||||
#[instrument(skip_all, fields(%inplace))]
|
||||
#[instrument(skip_all)]
|
||||
pub async fn run(
|
||||
tenant: &Arc<Tenant>,
|
||||
timeline_id: TimelineId,
|
||||
inplace: bool,
|
||||
) -> Result<(), DeleteTimelineError> {
|
||||
super::debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
@@ -221,6 +230,8 @@ impl DeleteTimelineFlow {
|
||||
// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
|
||||
timeline.shutdown(super::ShutdownMode::Hard).await;
|
||||
|
||||
tenant.gc_block.before_delete(&timeline);
|
||||
|
||||
fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: timeline-delete-before-index-deleted-at"
|
||||
@@ -235,11 +246,7 @@ impl DeleteTimelineFlow {
|
||||
))?
|
||||
});
|
||||
|
||||
if inplace {
|
||||
Self::background(guard, tenant.conf, tenant, &timeline).await?
|
||||
} else {
|
||||
Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
|
||||
}
|
||||
Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -488,10 +488,12 @@ async fn copy_lsn_prefix(
|
||||
// reuse the key instead of adding more holes between layers by using the real
|
||||
// highest key in the layer.
|
||||
let reused_highest_key = layer.layer_desc().key_range.end;
|
||||
let copied = writer
|
||||
.finish(reused_highest_key, target_timeline, ctx)
|
||||
let (desc, path) = writer
|
||||
.finish(reused_highest_key, ctx)
|
||||
.await
|
||||
.map_err(CopyDeltaPrefix)?;
|
||||
let copied = Layer::finish_creating(target_timeline.conf, target_timeline, desc, &path)
|
||||
.map_err(CopyDeltaPrefix)?;
|
||||
|
||||
tracing::debug!(%layer, %copied, "new layer produced");
|
||||
|
||||
|
||||
@@ -225,7 +225,7 @@ impl Timeline {
|
||||
continue;
|
||||
}
|
||||
|
||||
let last_activity_ts = layer.access_stats().latest_activity();
|
||||
let last_activity_ts = layer.latest_activity();
|
||||
|
||||
let no_activity_for = match now.duration_since(last_activity_ts) {
|
||||
Ok(d) => d,
|
||||
|
||||
967
pageserver/src/tenant/timeline/handle.rs
Normal file
967
pageserver/src/tenant/timeline/handle.rs
Normal file
@@ -0,0 +1,967 @@
|
||||
//! An efficient way to keep the timeline gate open without preventing
|
||||
//! timeline shutdown for longer than a single call to a timeline method.
|
||||
//!
|
||||
//! # Motivation
|
||||
//!
|
||||
//! On a single page service connection, we're typically serving a single TenantTimelineId.
|
||||
//!
|
||||
//! Without sharding, there is a single Timeline object to which we dispatch
|
||||
//! all requests. For example, a getpage request gets dispatched to the
|
||||
//! Timeline::get method of the Timeline object that represents the
|
||||
//! (tenant,timeline) of that connection.
|
||||
//!
|
||||
//! With sharding, for each request that comes in on the connection,
|
||||
//! we first have to perform shard routing based on the requested key (=~ page number).
|
||||
//! The result of shard routing is a Timeline object.
|
||||
//! We then dispatch the request to that Timeline object.
|
||||
//!
|
||||
//! Regardless of whether the tenant is sharded or not, we want to ensure that
|
||||
//! we hold the Timeline gate open while we're invoking the method on the
|
||||
//! Timeline object.
|
||||
//!
|
||||
//! However, we want to avoid the overhead of entering the gate for every
|
||||
//! method invocation.
|
||||
//!
|
||||
//! Further, for shard routing, we want to avoid calling the tenant manager to
|
||||
//! resolve the shard for every request. Instead, we want to cache the
|
||||
//! routing result so we can bypass the tenant manager for all subsequent requests
|
||||
//! that get routed to that shard.
|
||||
//!
|
||||
//! Regardless of how we accomplish the above, it should not
|
||||
//! prevent the Timeline from shutting down promptly.
|
||||
//!
|
||||
//! # Design
|
||||
//!
|
||||
//! There are three user-facing data structures:
|
||||
//! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime.
|
||||
//! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime.
|
||||
//! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`.
|
||||
//! Lifetime: for a single request dispatch on the Timeline (i.e., one getpage request)
|
||||
//!
|
||||
//! The `Handle` is just a wrapper around an `Arc<HandleInner>`.
|
||||
//!
|
||||
//! There is one long-lived `Arc<HandleInner>`, which is stored in the `PerTimelineState`.
|
||||
//! The `Cache` stores a `Weak<HandleInner>` for each cached Timeline.
|
||||
//!
|
||||
//! To dispatch a request, the page service connection calls `Cache::get`.
|
||||
//!
|
||||
//! A cache miss means we consult the tenant manager for shard routing,
|
||||
//! resulting in an `Arc<Timeline>`. We enter its gate _once_ and construct an
|
||||
//! `Arc<HandleInner>`. We store a `Weak<HandleInner>` in the cache
|
||||
//! and the `Arc<HandleInner>` in the `PerTimelineState`.
|
||||
//!
|
||||
//! For subsequent requests, `Cache::get` will perform a "fast path" shard routing
|
||||
//! and find the `Weak<HandleInner>` in the cache.
|
||||
//! We upgrade the `Weak<HandleInner>` to an `Arc<HandleInner>` and wrap it in the user-facing `Handle` type.
|
||||
//!
|
||||
//! The request handler dispatches the request to the right `<Handle as Deref<Target = Timeline>>::$request_method`.
|
||||
//! It then drops the `Handle`, which drops the `Arc<HandleInner>`.
|
||||
//!
|
||||
//! # Memory Management / How The Reference Cycle Is Broken
|
||||
//!
|
||||
//! The attentive reader may have noticed the strong reference cycle
|
||||
//! from `Arc<HandleInner>` to `PerTimelineState` to `Arc<Timeline>`.
|
||||
//!
|
||||
//! This cycle is intentional: while it exists, the `Cache` can upgrade its
|
||||
//! `Weak<HandleInner>` to an `Arc<HandleInner>` in a single atomic operation.
|
||||
//!
|
||||
//! The cycle is broken by either
|
||||
//! - `PerTimelineState::shutdown` or
|
||||
//! - dropping the `Cache`.
|
||||
//!
|
||||
//! Concurrently existing `Handle`s will extend the existence of the cycle.
|
||||
//! However, since `Handle`s are short-lived and new `Handle`s are not
|
||||
//! handed out after either `PerTimelineState::shutdown` or `Cache` drop,
|
||||
//! that extension of the cycle is bounded.
|
||||
//!
|
||||
//! # Fast Path for Shard Routing
|
||||
//!
|
||||
//! The `Cache` has a fast path for shard routing to avoid calling into
|
||||
//! the tenant manager for every request.
|
||||
//!
|
||||
//! The `Cache` maintains a hash map of `ShardTimelineId` to `Weak<HandleInner>`.
|
||||
//!
|
||||
//! The current implementation uses the first entry in the hash map
|
||||
//! to determine the `ShardParameters` and derive the correct
|
||||
//! `ShardIndex` for the requested key.
|
||||
//!
|
||||
//! It then looks up the hash map for that `ShardTimelineId := {ShardIndex,TimelineId}`.
|
||||
//!
|
||||
//! If the lookup is successful and the `Weak<HandleInner>` can be upgraded,
|
||||
//! it's a hit.
|
||||
//!
|
||||
//! ## Cache invalidation
|
||||
//!
|
||||
//! The insight is that cache invalidation is sufficient and most efficiently done lazily.
|
||||
//! The only reasons why an entry in the cache can become stale are:
|
||||
//! 1. The `PerTimelineState` / Timeline is shutting down e.g. because the shard is
|
||||
//! being detached, timeline or shard deleted, or pageserver is shutting down.
|
||||
//! 2. We're doing a shard split and new traffic should be routed to the child shards.
|
||||
//!
|
||||
//! Regarding (1), we will eventually fail to upgrade the `Weak<HandleInner>` once the
|
||||
//! timeline has shut down, and when that happens, we remove the entry from the cache.
|
||||
//!
|
||||
//! Regarding (2), the insight is that it is toally fine to keep dispatching requests
|
||||
//! to the parent shard during a shard split. Eventually, the shard split task will
|
||||
//! shut down the parent => case (1).
|
||||
|
||||
use std::collections::hash_map;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::atomic::AtomicBool;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::Arc;
|
||||
use std::sync::Mutex;
|
||||
use std::sync::Weak;
|
||||
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use tracing::instrument;
|
||||
use tracing::trace;
|
||||
use utils::id::TimelineId;
|
||||
use utils::shard::ShardIndex;
|
||||
use utils::shard::ShardNumber;
|
||||
|
||||
use crate::tenant::mgr::ShardSelector;
|
||||
|
||||
/// The requirement for Debug is so that #[derive(Debug)] works in some places.
|
||||
pub(crate) trait Types: Sized + std::fmt::Debug {
|
||||
type TenantManagerError: Sized + std::fmt::Debug;
|
||||
type TenantManager: TenantManager<Self> + Sized;
|
||||
type Timeline: ArcTimeline<Self> + Sized;
|
||||
}
|
||||
|
||||
/// Uniquely identifies a [`Cache`] instance over the lifetime of the process.
|
||||
/// Required so [`Cache::drop`] can take out the handles from the [`PerTimelineState`].
|
||||
/// Alternative to this would be to allocate [`Cache`] in a `Box` and identify it by the pointer.
|
||||
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
|
||||
struct CacheId(u64);
|
||||
|
||||
impl CacheId {
|
||||
fn next() -> Self {
|
||||
static NEXT_ID: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1);
|
||||
let id = NEXT_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
|
||||
if id == 0 {
|
||||
panic!("CacheId::new() returned 0, overflow");
|
||||
}
|
||||
Self(id)
|
||||
}
|
||||
}
|
||||
|
||||
/// See module-level comment.
|
||||
pub(crate) struct Cache<T: Types> {
|
||||
id: CacheId,
|
||||
map: Map<T>,
|
||||
}
|
||||
|
||||
type Map<T> = HashMap<ShardTimelineId, Weak<HandleInner<T>>>;
|
||||
|
||||
impl<T: Types> Default for Cache<T> {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
id: CacheId::next(),
|
||||
map: Default::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Eq, Debug, Hash, Clone, Copy)]
|
||||
pub(crate) struct ShardTimelineId {
|
||||
pub(crate) shard_index: ShardIndex,
|
||||
pub(crate) timeline_id: TimelineId,
|
||||
}
|
||||
|
||||
/// See module-level comment.
|
||||
pub(crate) struct Handle<T: Types>(Arc<HandleInner<T>>);
|
||||
struct HandleInner<T: Types> {
|
||||
shut_down: AtomicBool,
|
||||
timeline: T::Timeline,
|
||||
// The timeline's gate held open.
|
||||
_gate_guard: utils::sync::gate::GateGuard,
|
||||
}
|
||||
|
||||
/// Embedded in each [`Types::Timeline`] as the anchor for the only long-lived strong ref to `HandleInner`.
|
||||
///
|
||||
/// See module-level comment for details.
|
||||
pub struct PerTimelineState<T: Types> {
|
||||
// None = shutting down
|
||||
handles: Mutex<Option<HashMap<CacheId, Arc<HandleInner<T>>>>>,
|
||||
}
|
||||
|
||||
impl<T: Types> Default for PerTimelineState<T> {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
handles: Mutex::new(Some(Default::default())),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Abstract view of [`crate::tenant::mgr`], for testability.
|
||||
pub(crate) trait TenantManager<T: Types> {
|
||||
/// Invoked by [`Cache::get`] to resolve a [`ShardTimelineId`] to a [`Types::Timeline`].
|
||||
/// Errors are returned as [`GetError::TenantManager`].
|
||||
async fn resolve(
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
shard_selector: ShardSelector,
|
||||
) -> Result<T::Timeline, T::TenantManagerError>;
|
||||
}
|
||||
|
||||
/// Abstract view of an [`Arc<Timeline>`], for testability.
|
||||
pub(crate) trait ArcTimeline<T: Types>: Clone {
|
||||
fn gate(&self) -> &utils::sync::gate::Gate;
|
||||
fn shard_timeline_id(&self) -> ShardTimelineId;
|
||||
fn get_shard_identity(&self) -> &ShardIdentity;
|
||||
fn per_timeline_state(&self) -> &PerTimelineState<T>;
|
||||
}
|
||||
|
||||
/// Errors returned by [`Cache::get`].
|
||||
#[derive(Debug)]
|
||||
pub(crate) enum GetError<T: Types> {
|
||||
TenantManager(T::TenantManagerError),
|
||||
TimelineGateClosed,
|
||||
PerTimelineStateShutDown,
|
||||
}
|
||||
|
||||
/// Internal type used in [`Cache::get`].
|
||||
enum RoutingResult<T: Types> {
|
||||
FastPath(Handle<T>),
|
||||
SlowPath(ShardTimelineId),
|
||||
NeedConsultTenantManager,
|
||||
}
|
||||
|
||||
impl<T: Types> Cache<T> {
|
||||
/// See module-level comment for details.
|
||||
///
|
||||
/// Does NOT check for the shutdown state of [`Types::Timeline`].
|
||||
/// Instead, the methods of [`Types::Timeline`] that are invoked through
|
||||
/// the [`Handle`] are responsible for checking these conditions
|
||||
/// and if so, return an error that causes the page service to
|
||||
/// close the connection.
|
||||
#[instrument(level = "trace", skip_all)]
|
||||
pub(crate) async fn get(
|
||||
&mut self,
|
||||
timeline_id: TimelineId,
|
||||
shard_selector: ShardSelector,
|
||||
tenant_manager: &T::TenantManager,
|
||||
) -> Result<Handle<T>, GetError<T>> {
|
||||
// terminates because each iteration removes an element from the map
|
||||
loop {
|
||||
let handle = self
|
||||
.get_impl(timeline_id, shard_selector, tenant_manager)
|
||||
.await?;
|
||||
if handle.0.shut_down.load(Ordering::Relaxed) {
|
||||
let removed = self
|
||||
.map
|
||||
.remove(&handle.0.timeline.shard_timeline_id())
|
||||
.expect("invariant of get_impl is that the returned handle is in the map");
|
||||
assert!(
|
||||
Weak::ptr_eq(&removed, &Arc::downgrade(&handle.0)),
|
||||
"shard_timeline_id() incorrect?"
|
||||
);
|
||||
} else {
|
||||
return Ok(handle);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(level = "trace", skip_all)]
|
||||
async fn get_impl(
|
||||
&mut self,
|
||||
timeline_id: TimelineId,
|
||||
shard_selector: ShardSelector,
|
||||
tenant_manager: &T::TenantManager,
|
||||
) -> Result<Handle<T>, GetError<T>> {
|
||||
let miss: ShardSelector = {
|
||||
let routing_state = self.shard_routing(timeline_id, shard_selector);
|
||||
match routing_state {
|
||||
RoutingResult::FastPath(handle) => return Ok(handle),
|
||||
RoutingResult::SlowPath(key) => match self.map.get(&key) {
|
||||
Some(cached) => match cached.upgrade() {
|
||||
Some(upgraded) => return Ok(Handle(upgraded)),
|
||||
None => {
|
||||
trace!("handle cache stale");
|
||||
self.map.remove(&key).unwrap();
|
||||
ShardSelector::Known(key.shard_index)
|
||||
}
|
||||
},
|
||||
None => ShardSelector::Known(key.shard_index),
|
||||
},
|
||||
RoutingResult::NeedConsultTenantManager => shard_selector,
|
||||
}
|
||||
};
|
||||
self.get_miss(timeline_id, miss, tenant_manager).await
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn shard_routing(
|
||||
&mut self,
|
||||
timeline_id: TimelineId,
|
||||
shard_selector: ShardSelector,
|
||||
) -> RoutingResult<T> {
|
||||
loop {
|
||||
// terminates because when every iteration we remove an element from the map
|
||||
let Some((first_key, first_handle)) = self.map.iter().next() else {
|
||||
return RoutingResult::NeedConsultTenantManager;
|
||||
};
|
||||
let Some(first_handle) = first_handle.upgrade() else {
|
||||
// TODO: dedup with get()
|
||||
trace!("handle cache stale");
|
||||
let first_key_owned = *first_key;
|
||||
self.map.remove(&first_key_owned).unwrap();
|
||||
continue;
|
||||
};
|
||||
|
||||
let first_handle_shard_identity = first_handle.timeline.get_shard_identity();
|
||||
let make_shard_index = |shard_num: ShardNumber| ShardIndex {
|
||||
shard_number: shard_num,
|
||||
shard_count: first_handle_shard_identity.count,
|
||||
};
|
||||
|
||||
let need_idx = match shard_selector {
|
||||
ShardSelector::Page(key) => {
|
||||
make_shard_index(first_handle_shard_identity.get_shard_number(&key))
|
||||
}
|
||||
ShardSelector::Zero => make_shard_index(ShardNumber(0)),
|
||||
ShardSelector::Known(shard_idx) => shard_idx,
|
||||
};
|
||||
let need_shard_timeline_id = ShardTimelineId {
|
||||
shard_index: need_idx,
|
||||
timeline_id,
|
||||
};
|
||||
let first_handle_shard_timeline_id = ShardTimelineId {
|
||||
shard_index: first_handle_shard_identity.shard_index(),
|
||||
timeline_id: first_handle.timeline.shard_timeline_id().timeline_id,
|
||||
};
|
||||
|
||||
if need_shard_timeline_id == first_handle_shard_timeline_id {
|
||||
return RoutingResult::FastPath(Handle(first_handle));
|
||||
} else {
|
||||
return RoutingResult::SlowPath(need_shard_timeline_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(level = "trace", skip_all)]
|
||||
#[inline(always)]
|
||||
async fn get_miss(
|
||||
&mut self,
|
||||
timeline_id: TimelineId,
|
||||
shard_selector: ShardSelector,
|
||||
tenant_manager: &T::TenantManager,
|
||||
) -> Result<Handle<T>, GetError<T>> {
|
||||
match tenant_manager.resolve(timeline_id, shard_selector).await {
|
||||
Ok(timeline) => {
|
||||
let key = timeline.shard_timeline_id();
|
||||
match &shard_selector {
|
||||
ShardSelector::Zero => assert_eq!(key.shard_index.shard_number, ShardNumber(0)),
|
||||
ShardSelector::Page(_) => (), // gotta trust tenant_manager
|
||||
ShardSelector::Known(idx) => assert_eq!(idx, &key.shard_index),
|
||||
}
|
||||
|
||||
let gate_guard = match timeline.gate().enter() {
|
||||
Ok(guard) => guard,
|
||||
Err(_) => {
|
||||
return Err(GetError::TimelineGateClosed);
|
||||
}
|
||||
};
|
||||
trace!("creating new HandleInner");
|
||||
let handle = Arc::new(
|
||||
// TODO: global metric that keeps track of the number of live HandlerTimeline instances
|
||||
// so we can identify reference cycle bugs.
|
||||
HandleInner {
|
||||
shut_down: AtomicBool::new(false),
|
||||
_gate_guard: gate_guard,
|
||||
timeline: timeline.clone(),
|
||||
},
|
||||
);
|
||||
let handle = {
|
||||
let mut lock_guard = timeline
|
||||
.per_timeline_state()
|
||||
.handles
|
||||
.lock()
|
||||
.expect("mutex poisoned");
|
||||
match &mut *lock_guard {
|
||||
Some(per_timeline_state) => {
|
||||
let replaced = per_timeline_state.insert(self.id, Arc::clone(&handle));
|
||||
assert!(replaced.is_none(), "some earlier code left a stale handle");
|
||||
match self.map.entry(key) {
|
||||
hash_map::Entry::Occupied(_o) => {
|
||||
// This cannot not happen because
|
||||
// 1. we're the _miss_ handle, i.e., `self.map` didn't contain an entry and
|
||||
// 2. we were holding &mut self during .resolve().await above, so, no other thread can have inserted a handle
|
||||
// while we were waiting for the tenant manager.
|
||||
unreachable!()
|
||||
}
|
||||
hash_map::Entry::Vacant(v) => {
|
||||
v.insert(Arc::downgrade(&handle));
|
||||
handle
|
||||
}
|
||||
}
|
||||
}
|
||||
None => {
|
||||
return Err(GetError::PerTimelineStateShutDown);
|
||||
}
|
||||
}
|
||||
};
|
||||
Ok(Handle(handle))
|
||||
}
|
||||
Err(e) => Err(GetError::TenantManager(e)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Types> PerTimelineState<T> {
|
||||
/// After this method returns, [`Cache::get`] will never again return a [`Handle`]
|
||||
/// to the [`Types::Timeline`] that embeds this per-timeline state.
|
||||
/// Even if [`TenantManager::resolve`] would still resolve to it.
|
||||
///
|
||||
/// Already-alive [`Handle`]s for will remain open, usable, and keeping the [`ArcTimeline`] alive.
|
||||
/// That's ok because they're short-lived. See module-level comment for details.
|
||||
#[instrument(level = "trace", skip_all)]
|
||||
pub(super) fn shutdown(&self) {
|
||||
let handles = self
|
||||
.handles
|
||||
.lock()
|
||||
.expect("mutex poisoned")
|
||||
// NB: this .take() sets locked to None.
|
||||
// That's what makes future `Cache::get` misses fail.
|
||||
// Cache hits are taken care of below.
|
||||
.take();
|
||||
let Some(handles) = handles else {
|
||||
trace!("already shut down");
|
||||
return;
|
||||
};
|
||||
for handle in handles.values() {
|
||||
// Make hits fail.
|
||||
handle.shut_down.store(true, Ordering::Relaxed);
|
||||
}
|
||||
drop(handles);
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Types> std::ops::Deref for Handle<T> {
|
||||
type Target = T::Timeline;
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.0.timeline
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl<T: Types> Drop for HandleInner<T> {
|
||||
fn drop(&mut self) {
|
||||
trace!("HandleInner dropped");
|
||||
}
|
||||
}
|
||||
|
||||
// When dropping a [`Cache`], prune its handles in the [`PerTimelineState`] to break the reference cycle.
|
||||
impl<T: Types> Drop for Cache<T> {
|
||||
fn drop(&mut self) {
|
||||
for (_, weak) in self.map.drain() {
|
||||
if let Some(strong) = weak.upgrade() {
|
||||
// handle is still being kept alive in PerTimelineState
|
||||
let timeline = strong.timeline.per_timeline_state();
|
||||
let mut handles = timeline.handles.lock().expect("mutex poisoned");
|
||||
if let Some(handles) = &mut *handles {
|
||||
let Some(removed) = handles.remove(&self.id) else {
|
||||
// There could have been a shutdown inbetween us upgrading the weak and locking the mutex.
|
||||
continue;
|
||||
};
|
||||
assert!(Arc::ptr_eq(&removed, &strong));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use pageserver_api::{
|
||||
key::{rel_block_to_key, Key, DBDIR_KEY},
|
||||
models::ShardParameters,
|
||||
reltag::RelTag,
|
||||
shard::ShardStripeSize,
|
||||
};
|
||||
use utils::shard::ShardCount;
|
||||
|
||||
use super::*;
|
||||
|
||||
const FOREVER: std::time::Duration = std::time::Duration::from_secs(u64::MAX);
|
||||
|
||||
#[derive(Debug)]
|
||||
struct TestTypes;
|
||||
impl Types for TestTypes {
|
||||
type TenantManagerError = anyhow::Error;
|
||||
type TenantManager = StubManager;
|
||||
type Timeline = Arc<StubTimeline>;
|
||||
}
|
||||
|
||||
struct StubManager {
|
||||
shards: Vec<Arc<StubTimeline>>,
|
||||
}
|
||||
|
||||
struct StubTimeline {
|
||||
gate: utils::sync::gate::Gate,
|
||||
id: TimelineId,
|
||||
shard: ShardIdentity,
|
||||
per_timeline_state: PerTimelineState<TestTypes>,
|
||||
myself: Weak<StubTimeline>,
|
||||
}
|
||||
|
||||
impl StubTimeline {
|
||||
fn getpage(&self) {
|
||||
// do nothing
|
||||
}
|
||||
}
|
||||
|
||||
impl ArcTimeline<TestTypes> for Arc<StubTimeline> {
|
||||
fn gate(&self) -> &utils::sync::gate::Gate {
|
||||
&self.gate
|
||||
}
|
||||
|
||||
fn shard_timeline_id(&self) -> ShardTimelineId {
|
||||
ShardTimelineId {
|
||||
shard_index: self.shard.shard_index(),
|
||||
timeline_id: self.id,
|
||||
}
|
||||
}
|
||||
|
||||
fn get_shard_identity(&self) -> &ShardIdentity {
|
||||
&self.shard
|
||||
}
|
||||
|
||||
fn per_timeline_state(&self) -> &PerTimelineState<TestTypes> {
|
||||
&self.per_timeline_state
|
||||
}
|
||||
}
|
||||
|
||||
impl TenantManager<TestTypes> for StubManager {
|
||||
async fn resolve(
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
shard_selector: ShardSelector,
|
||||
) -> anyhow::Result<Arc<StubTimeline>> {
|
||||
for timeline in &self.shards {
|
||||
if timeline.id == timeline_id {
|
||||
match &shard_selector {
|
||||
ShardSelector::Zero if timeline.shard.is_shard_zero() => {
|
||||
return Ok(Arc::clone(timeline));
|
||||
}
|
||||
ShardSelector::Zero => continue,
|
||||
ShardSelector::Page(key) if timeline.shard.is_key_local(key) => {
|
||||
return Ok(Arc::clone(timeline));
|
||||
}
|
||||
ShardSelector::Page(_) => continue,
|
||||
ShardSelector::Known(idx) if idx == &timeline.shard.shard_index() => {
|
||||
return Ok(Arc::clone(timeline));
|
||||
}
|
||||
ShardSelector::Known(_) => continue,
|
||||
}
|
||||
}
|
||||
}
|
||||
anyhow::bail!("not found")
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn test_timeline_shutdown() {
|
||||
crate::tenant::harness::setup_logging();
|
||||
|
||||
let timeline_id = TimelineId::generate();
|
||||
let shard0 = Arc::new_cyclic(|myself| StubTimeline {
|
||||
gate: Default::default(),
|
||||
id: timeline_id,
|
||||
shard: ShardIdentity::unsharded(),
|
||||
per_timeline_state: PerTimelineState::default(),
|
||||
myself: myself.clone(),
|
||||
});
|
||||
let mgr = StubManager {
|
||||
shards: vec![shard0.clone()],
|
||||
};
|
||||
let key = DBDIR_KEY;
|
||||
|
||||
let mut cache = Cache::<TestTypes>::default();
|
||||
|
||||
//
|
||||
// fill the cache
|
||||
//
|
||||
assert_eq!(
|
||||
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
|
||||
(2, 1),
|
||||
"strong: shard0, mgr; weak: myself"
|
||||
);
|
||||
|
||||
let handle: Handle<_> = cache
|
||||
.get(timeline_id, ShardSelector::Page(key), &mgr)
|
||||
.await
|
||||
.expect("we have the timeline");
|
||||
let handle_inner_weak = Arc::downgrade(&handle.0);
|
||||
assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
|
||||
assert_eq!(
|
||||
(
|
||||
Weak::strong_count(&handle_inner_weak),
|
||||
Weak::weak_count(&handle_inner_weak)
|
||||
),
|
||||
(2, 2),
|
||||
"strong: handle, per_timeline_state, weak: handle_inner_weak, cache"
|
||||
);
|
||||
assert_eq!(cache.map.len(), 1);
|
||||
|
||||
assert_eq!(
|
||||
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
|
||||
(3, 1),
|
||||
"strong: handleinner(per_timeline_state), shard0, mgr; weak: myself"
|
||||
);
|
||||
drop(handle);
|
||||
assert_eq!(
|
||||
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
|
||||
(3, 1),
|
||||
"strong: handleinner(per_timeline_state), shard0, mgr; weak: myself"
|
||||
);
|
||||
|
||||
//
|
||||
// demonstrate that Handle holds up gate closure
|
||||
// but shutdown prevents new handles from being handed out
|
||||
//
|
||||
|
||||
tokio::select! {
|
||||
_ = shard0.gate.close() => {
|
||||
panic!("cache and per-timeline handler state keep cache open");
|
||||
}
|
||||
_ = tokio::time::sleep(FOREVER) => {
|
||||
// NB: first poll of close() makes it enter closing state
|
||||
}
|
||||
}
|
||||
|
||||
let handle = cache
|
||||
.get(timeline_id, ShardSelector::Page(key), &mgr)
|
||||
.await
|
||||
.expect("we have the timeline");
|
||||
assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
|
||||
|
||||
// SHUTDOWN
|
||||
shard0.per_timeline_state.shutdown(); // keeping handle alive across shutdown
|
||||
|
||||
assert_eq!(
|
||||
1,
|
||||
Weak::strong_count(&handle_inner_weak),
|
||||
"through local var handle"
|
||||
);
|
||||
assert_eq!(
|
||||
cache.map.len(),
|
||||
1,
|
||||
"this is an implementation detail but worth pointing out: we can't clear the cache from shutdown(), it's cleared on first access after"
|
||||
);
|
||||
assert_eq!(
|
||||
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
|
||||
(3, 1),
|
||||
"strong: handleinner(via handle), shard0, mgr; weak: myself"
|
||||
);
|
||||
|
||||
// this handle is perfectly usable
|
||||
handle.getpage();
|
||||
|
||||
cache
|
||||
.get(timeline_id, ShardSelector::Page(key), &mgr)
|
||||
.await
|
||||
.err()
|
||||
.expect("documented behavior: can't get new handle after shutdown, even if there is an alive Handle");
|
||||
assert_eq!(
|
||||
cache.map.len(),
|
||||
0,
|
||||
"first access after shutdown cleans up the Weak's from the cache"
|
||||
);
|
||||
|
||||
tokio::select! {
|
||||
_ = shard0.gate.close() => {
|
||||
panic!("handle is keeping gate open");
|
||||
}
|
||||
_ = tokio::time::sleep(FOREVER) => { }
|
||||
}
|
||||
|
||||
drop(handle);
|
||||
assert_eq!(
|
||||
0,
|
||||
Weak::strong_count(&handle_inner_weak),
|
||||
"the HandleInner destructor already ran"
|
||||
);
|
||||
assert_eq!(
|
||||
(Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
|
||||
(2, 1),
|
||||
"strong: shard0, mgr; weak: myself"
|
||||
);
|
||||
|
||||
// closing gate succeeds after dropping handle
|
||||
tokio::select! {
|
||||
_ = shard0.gate.close() => { }
|
||||
_ = tokio::time::sleep(FOREVER) => {
|
||||
panic!("handle is dropped, no other gate holders exist")
|
||||
}
|
||||
}
|
||||
|
||||
// map gets cleaned on next lookup
|
||||
cache
|
||||
.get(timeline_id, ShardSelector::Page(key), &mgr)
|
||||
.await
|
||||
.err()
|
||||
.expect("documented behavior: can't get new handle after shutdown");
|
||||
assert_eq!(cache.map.len(), 0);
|
||||
|
||||
// ensure all refs to shard0 are gone and we're not leaking anything
|
||||
let myself = Weak::clone(&shard0.myself);
|
||||
drop(shard0);
|
||||
drop(mgr);
|
||||
assert_eq!(Weak::strong_count(&myself), 0);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_multiple_timelines_and_deletion() {
|
||||
crate::tenant::harness::setup_logging();
|
||||
|
||||
let timeline_a = TimelineId::generate();
|
||||
let timeline_b = TimelineId::generate();
|
||||
assert_ne!(timeline_a, timeline_b);
|
||||
let timeline_a = Arc::new_cyclic(|myself| StubTimeline {
|
||||
gate: Default::default(),
|
||||
id: timeline_a,
|
||||
shard: ShardIdentity::unsharded(),
|
||||
per_timeline_state: PerTimelineState::default(),
|
||||
myself: myself.clone(),
|
||||
});
|
||||
let timeline_b = Arc::new_cyclic(|myself| StubTimeline {
|
||||
gate: Default::default(),
|
||||
id: timeline_b,
|
||||
shard: ShardIdentity::unsharded(),
|
||||
per_timeline_state: PerTimelineState::default(),
|
||||
myself: myself.clone(),
|
||||
});
|
||||
let mut mgr = StubManager {
|
||||
shards: vec![timeline_a.clone(), timeline_b.clone()],
|
||||
};
|
||||
let key = DBDIR_KEY;
|
||||
|
||||
let mut cache = Cache::<TestTypes>::default();
|
||||
|
||||
cache
|
||||
.get(timeline_a.id, ShardSelector::Page(key), &mgr)
|
||||
.await
|
||||
.expect("we have it");
|
||||
cache
|
||||
.get(timeline_b.id, ShardSelector::Page(key), &mgr)
|
||||
.await
|
||||
.expect("we have it");
|
||||
assert_eq!(cache.map.len(), 2);
|
||||
|
||||
// delete timeline A
|
||||
timeline_a.per_timeline_state.shutdown();
|
||||
mgr.shards.retain(|t| t.id != timeline_a.id);
|
||||
assert!(
|
||||
mgr.resolve(timeline_a.id, ShardSelector::Page(key))
|
||||
.await
|
||||
.is_err(),
|
||||
"broken StubManager implementation"
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
cache.map.len(),
|
||||
2,
|
||||
"cache still has a Weak handle to Timeline A"
|
||||
);
|
||||
cache
|
||||
.get(timeline_a.id, ShardSelector::Page(key), &mgr)
|
||||
.await
|
||||
.err()
|
||||
.expect("documented behavior: can't get new handle after shutdown");
|
||||
assert_eq!(cache.map.len(), 1, "next access cleans up the cache");
|
||||
|
||||
cache
|
||||
.get(timeline_b.id, ShardSelector::Page(key), &mgr)
|
||||
.await
|
||||
.expect("we still have it");
|
||||
}
|
||||
|
||||
fn make_relation_key_for_shard(shard: ShardNumber, params: &ShardParameters) -> Key {
|
||||
rel_block_to_key(
|
||||
RelTag {
|
||||
spcnode: 1663,
|
||||
dbnode: 208101,
|
||||
relnode: 2620,
|
||||
forknum: 0,
|
||||
},
|
||||
shard.0 as u32 * params.stripe_size.0,
|
||||
)
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn test_shard_split() {
|
||||
crate::tenant::harness::setup_logging();
|
||||
let timeline_id = TimelineId::generate();
|
||||
let parent = Arc::new_cyclic(|myself| StubTimeline {
|
||||
gate: Default::default(),
|
||||
id: timeline_id,
|
||||
shard: ShardIdentity::unsharded(),
|
||||
per_timeline_state: PerTimelineState::default(),
|
||||
myself: myself.clone(),
|
||||
});
|
||||
let child_params = ShardParameters {
|
||||
count: ShardCount(2),
|
||||
stripe_size: ShardStripeSize::default(),
|
||||
};
|
||||
let child0 = Arc::new_cyclic(|myself| StubTimeline {
|
||||
gate: Default::default(),
|
||||
id: timeline_id,
|
||||
shard: ShardIdentity::from_params(ShardNumber(0), &child_params),
|
||||
per_timeline_state: PerTimelineState::default(),
|
||||
myself: myself.clone(),
|
||||
});
|
||||
let child1 = Arc::new_cyclic(|myself| StubTimeline {
|
||||
gate: Default::default(),
|
||||
id: timeline_id,
|
||||
shard: ShardIdentity::from_params(ShardNumber(1), &child_params),
|
||||
per_timeline_state: PerTimelineState::default(),
|
||||
myself: myself.clone(),
|
||||
});
|
||||
let child_shards_by_shard_number = [child0.clone(), child1.clone()];
|
||||
|
||||
let mut cache = Cache::<TestTypes>::default();
|
||||
|
||||
// fill the cache with the parent
|
||||
for i in 0..2 {
|
||||
let handle = cache
|
||||
.get(
|
||||
timeline_id,
|
||||
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
|
||||
&StubManager {
|
||||
shards: vec![parent.clone()],
|
||||
},
|
||||
)
|
||||
.await
|
||||
.expect("we have it");
|
||||
assert!(
|
||||
Weak::ptr_eq(&handle.myself, &parent.myself),
|
||||
"mgr returns parent first"
|
||||
);
|
||||
drop(handle);
|
||||
}
|
||||
|
||||
//
|
||||
// SHARD SPLIT: tenant manager changes, but the cache isn't informed
|
||||
//
|
||||
|
||||
// while we haven't shut down the parent, the cache will return the cached parent, even
|
||||
// if the tenant manager returns the child
|
||||
for i in 0..2 {
|
||||
let handle = cache
|
||||
.get(
|
||||
timeline_id,
|
||||
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
|
||||
&StubManager {
|
||||
shards: vec![], // doesn't matter what's in here, the cache is fully loaded
|
||||
},
|
||||
)
|
||||
.await
|
||||
.expect("we have it");
|
||||
assert!(
|
||||
Weak::ptr_eq(&handle.myself, &parent.myself),
|
||||
"mgr returns parent"
|
||||
);
|
||||
drop(handle);
|
||||
}
|
||||
|
||||
let parent_handle = cache
|
||||
.get(
|
||||
timeline_id,
|
||||
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(0), &child_params)),
|
||||
&StubManager {
|
||||
shards: vec![parent.clone()],
|
||||
},
|
||||
)
|
||||
.await
|
||||
.expect("we have it");
|
||||
assert!(Weak::ptr_eq(&parent_handle.myself, &parent.myself));
|
||||
|
||||
// invalidate the cache
|
||||
parent.per_timeline_state.shutdown();
|
||||
|
||||
// the cache will now return the child, even though the parent handle still exists
|
||||
for i in 0..2 {
|
||||
let handle = cache
|
||||
.get(
|
||||
timeline_id,
|
||||
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
|
||||
&StubManager {
|
||||
shards: vec![child0.clone(), child1.clone()], // <====== this changed compared to previous loop
|
||||
},
|
||||
)
|
||||
.await
|
||||
.expect("we have it");
|
||||
assert!(
|
||||
Weak::ptr_eq(
|
||||
&handle.myself,
|
||||
&child_shards_by_shard_number[i as usize].myself
|
||||
),
|
||||
"mgr returns child"
|
||||
);
|
||||
drop(handle);
|
||||
}
|
||||
|
||||
// all the while the parent handle kept the parent gate open
|
||||
tokio::select! {
|
||||
_ = parent_handle.gate.close() => {
|
||||
panic!("parent handle is keeping gate open");
|
||||
}
|
||||
_ = tokio::time::sleep(FOREVER) => { }
|
||||
}
|
||||
drop(parent_handle);
|
||||
tokio::select! {
|
||||
_ = parent.gate.close() => { }
|
||||
_ = tokio::time::sleep(FOREVER) => {
|
||||
panic!("parent handle is dropped, no other gate holders exist")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn test_connection_handler_exit() {
|
||||
crate::tenant::harness::setup_logging();
|
||||
let timeline_id = TimelineId::generate();
|
||||
let shard0 = Arc::new_cyclic(|myself| StubTimeline {
|
||||
gate: Default::default(),
|
||||
id: timeline_id,
|
||||
shard: ShardIdentity::unsharded(),
|
||||
per_timeline_state: PerTimelineState::default(),
|
||||
myself: myself.clone(),
|
||||
});
|
||||
let mgr = StubManager {
|
||||
shards: vec![shard0.clone()],
|
||||
};
|
||||
let key = DBDIR_KEY;
|
||||
|
||||
// Simulate 10 connections that's opened, used, and closed
|
||||
let mut used_handles = vec![];
|
||||
for _ in 0..10 {
|
||||
let mut cache = Cache::<TestTypes>::default();
|
||||
let handle = {
|
||||
let handle = cache
|
||||
.get(timeline_id, ShardSelector::Page(key), &mgr)
|
||||
.await
|
||||
.expect("we have the timeline");
|
||||
assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
|
||||
handle
|
||||
};
|
||||
handle.getpage();
|
||||
used_handles.push(Arc::downgrade(&handle.0));
|
||||
}
|
||||
|
||||
// No handles exist, thus gates are closed and don't require shutdown
|
||||
assert!(used_handles
|
||||
.iter()
|
||||
.all(|weak| Weak::strong_count(weak) == 0));
|
||||
|
||||
// ... thus the gate should close immediately, even without shutdown
|
||||
tokio::select! {
|
||||
_ = shard0.gate.close() => { }
|
||||
_ = tokio::time::sleep(FOREVER) => {
|
||||
panic!("handle is dropped, no other gate holders exist")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -35,6 +35,10 @@ impl LayerManager {
|
||||
self.layer_fmgr.get_from_desc(desc)
|
||||
}
|
||||
|
||||
pub(crate) fn get_from_key(&self, desc: &PersistentLayerKey) -> Layer {
|
||||
self.layer_fmgr.get_from_key(desc)
|
||||
}
|
||||
|
||||
/// Get an immutable reference to the layer map.
|
||||
///
|
||||
/// We expect users only to be able to get an immutable layer map. If users want to make modifications,
|
||||
@@ -255,13 +259,10 @@ impl LayerManager {
|
||||
new_layer.layer_desc().lsn_range
|
||||
);
|
||||
|
||||
// Transfer visibilty hint from old to new layer, since the new layer covers the same key space. This is not guaranteed to
|
||||
// Transfer visibility hint from old to new layer, since the new layer covers the same key space. This is not guaranteed to
|
||||
// be accurate (as the new layer may cover a different subset of the key range), but is a sensible default, and prevents
|
||||
// always marking rewritten layers as visible.
|
||||
new_layer
|
||||
.as_ref()
|
||||
.access_stats()
|
||||
.set_visibility(old_layer.access_stats().visibility());
|
||||
new_layer.as_ref().set_visibility(old_layer.visibility());
|
||||
|
||||
// Safety: we may never rewrite the same file in-place. Callers are responsible
|
||||
// for ensuring that they only rewrite layers after something changes the path,
|
||||
@@ -365,16 +366,20 @@ impl<T> Default for LayerFileManager<T> {
|
||||
}
|
||||
|
||||
impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
|
||||
fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T {
|
||||
fn get_from_key(&self, key: &PersistentLayerKey) -> T {
|
||||
// The assumption for the `expect()` is that all code maintains the following invariant:
|
||||
// A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
|
||||
self.0
|
||||
.get(&desc.key())
|
||||
.with_context(|| format!("get layer from desc: {}", desc.layer_name()))
|
||||
.get(key)
|
||||
.with_context(|| format!("get layer from key: {}", key))
|
||||
.expect("not found")
|
||||
.clone()
|
||||
}
|
||||
|
||||
fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T {
|
||||
self.get_from_key(&desc.key())
|
||||
}
|
||||
|
||||
fn contains_key(&self, key: &PersistentLayerKey) -> bool {
|
||||
self.0.contains_key(key)
|
||||
}
|
||||
|
||||
@@ -122,6 +122,10 @@ impl CurrentLogicalSize {
|
||||
Self::Exact(_) => Accuracy::Exact,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn is_exact(&self) -> bool {
|
||||
matches!(self, Self::Exact(_))
|
||||
}
|
||||
}
|
||||
|
||||
impl LogicalSize {
|
||||
|
||||
@@ -241,6 +241,9 @@ impl PostgresRedoManager {
|
||||
|
||||
/// Shut down the WAL redo manager.
|
||||
///
|
||||
/// Returns `true` if this call was the one that initiated shutdown.
|
||||
/// `true` may be observed by no caller if the first caller stops polling.
|
||||
///
|
||||
/// After this future completes
|
||||
/// - no redo process is running
|
||||
/// - no new redo process will be spawned
|
||||
@@ -250,22 +253,32 @@ impl PostgresRedoManager {
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// This method is cancellation-safe.
|
||||
pub async fn shutdown(&self) {
|
||||
pub async fn shutdown(&self) -> bool {
|
||||
// prevent new processes from being spawned
|
||||
let permit = match self.redo_process.get_or_init_detached().await {
|
||||
let maybe_permit = match self.redo_process.get_or_init_detached().await {
|
||||
Ok(guard) => {
|
||||
let (proc, permit) = guard.take_and_deinit();
|
||||
drop(proc); // this just drops the Arc, its refcount may not be zero yet
|
||||
permit
|
||||
if matches!(&*guard, ProcessOnceCell::ManagerShutDown) {
|
||||
None
|
||||
} else {
|
||||
let (proc, permit) = guard.take_and_deinit();
|
||||
drop(proc); // this just drops the Arc, its refcount may not be zero yet
|
||||
Some(permit)
|
||||
}
|
||||
}
|
||||
Err(permit) => permit,
|
||||
Err(permit) => Some(permit),
|
||||
};
|
||||
let it_was_us = if let Some(permit) = maybe_permit {
|
||||
self.redo_process
|
||||
.set(ProcessOnceCell::ManagerShutDown, permit);
|
||||
true
|
||||
} else {
|
||||
false
|
||||
};
|
||||
self.redo_process
|
||||
.set(ProcessOnceCell::ManagerShutDown, permit);
|
||||
// wait for ongoing requests to drain and the refcounts of all Arc<WalRedoProcess> that
|
||||
// we ever launched to drop to zero, which when it happens synchronously kill()s & wait()s
|
||||
// for the underlying process.
|
||||
self.launched_processes.close().await;
|
||||
it_was_us
|
||||
}
|
||||
|
||||
/// This type doesn't have its own background task to check for idleness: we
|
||||
|
||||
7
pageserver/test_data/indices/mixed_workload/README.md
Normal file
7
pageserver/test_data/indices/mixed_workload/README.md
Normal file
@@ -0,0 +1,7 @@
|
||||
|
||||
# This was captured from one shard of a large tenant in staging.
|
||||
|
||||
# It has a mixture of deltas and image layers, >1000 layers in total.
|
||||
|
||||
# This is suitable for general smoke tests that want an index which is not
|
||||
# trivially small, but doesn't contain weird/pathological cases.
|
||||
File diff suppressed because one or more lines are too long
16
poetry.lock
generated
16
poetry.lock
generated
@@ -1514,6 +1514,20 @@ files = [
|
||||
[package.dependencies]
|
||||
six = "*"
|
||||
|
||||
[[package]]
|
||||
name = "kafka-python"
|
||||
version = "2.0.2"
|
||||
description = "Pure Python client for Apache Kafka"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "kafka-python-2.0.2.tar.gz", hash = "sha256:04dfe7fea2b63726cd6f3e79a2d86e709d608d74406638c5da33a01d45a9d7e3"},
|
||||
{file = "kafka_python-2.0.2-py2.py3-none-any.whl", hash = "sha256:2d92418c7cb1c298fa6c7f0fb3519b520d0d7526ac6cb7ae2a4fc65a51a94b6e"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
crc32c = ["crc32c"]
|
||||
|
||||
[[package]]
|
||||
name = "lazy-object-proxy"
|
||||
version = "1.10.0"
|
||||
@@ -3357,4 +3371,4 @@ cffi = ["cffi (>=1.11)"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.9"
|
||||
content-hash = "7cee6a8c30bc7f4bfb0a87c6bad3952dfb4da127fad853d2710a93ac3eab8a00"
|
||||
content-hash = "d569a3593b98baceb0a88e176bdad63cae99d6bfc2a81bf6741663a4abcafd72"
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
[tool.poetry]
|
||||
name = "neon"
|
||||
version = "0.1.0"
|
||||
description = ""
|
||||
authors = []
|
||||
package-mode = false
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = "^3.9"
|
||||
@@ -42,6 +41,7 @@ httpx = {extras = ["http2"], version = "^0.26.0"}
|
||||
pytest-repeat = "^0.9.3"
|
||||
websockets = "^12.0"
|
||||
clickhouse-connect = "^0.7.16"
|
||||
kafka-python = "^2.0.2"
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
mypy = "==1.3.0"
|
||||
@@ -75,6 +75,7 @@ module = [
|
||||
"allure.*",
|
||||
"allure_commons.*",
|
||||
"allure_pytest.*",
|
||||
"kafka.*",
|
||||
]
|
||||
ignore_missing_imports = true
|
||||
|
||||
|
||||
@@ -170,11 +170,6 @@ struct Args {
|
||||
/// still needed for existing replication connection.
|
||||
#[arg(long)]
|
||||
walsenders_keep_horizon: bool,
|
||||
/// Enable partial backup. If disabled, safekeeper will not upload partial
|
||||
/// segments to remote storage.
|
||||
/// TODO: now partial backup is always enabled, remove this flag.
|
||||
#[arg(long)]
|
||||
partial_backup_enabled: bool,
|
||||
/// Controls how long backup will wait until uploading the partial segment.
|
||||
#[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_PARTIAL_BACKUP_TIMEOUT, verbatim_doc_comment)]
|
||||
partial_backup_timeout: Duration,
|
||||
@@ -347,7 +342,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
sk_auth_token,
|
||||
current_thread_runtime: args.current_thread_runtime,
|
||||
walsenders_keep_horizon: args.walsenders_keep_horizon,
|
||||
partial_backup_enabled: true,
|
||||
partial_backup_timeout: args.partial_backup_timeout,
|
||||
disable_periodic_broker_push: args.disable_periodic_broker_push,
|
||||
enable_offload: args.enable_offload,
|
||||
|
||||
@@ -21,6 +21,7 @@ pub mod json_ctrl;
|
||||
pub mod metrics;
|
||||
pub mod patch_control_file;
|
||||
pub mod pull_timeline;
|
||||
pub mod rate_limit;
|
||||
pub mod receive_wal;
|
||||
pub mod recovery;
|
||||
pub mod remove_wal;
|
||||
@@ -53,6 +54,7 @@ pub mod defaults {
|
||||
pub const DEFAULT_PARTIAL_BACKUP_TIMEOUT: &str = "15m";
|
||||
pub const DEFAULT_CONTROL_FILE_SAVE_INTERVAL: &str = "300s";
|
||||
pub const DEFAULT_PARTIAL_BACKUP_CONCURRENCY: &str = "5";
|
||||
pub const DEFAULT_EVICTION_CONCURRENCY: usize = 2;
|
||||
|
||||
// By default, our required residency before eviction is the same as the period that passes
|
||||
// before uploading a partial segment, so that in normal operation the eviction can happen
|
||||
@@ -91,7 +93,6 @@ pub struct SafeKeeperConf {
|
||||
pub sk_auth_token: Option<SecretString>,
|
||||
pub current_thread_runtime: bool,
|
||||
pub walsenders_keep_horizon: bool,
|
||||
pub partial_backup_enabled: bool,
|
||||
pub partial_backup_timeout: Duration,
|
||||
pub disable_periodic_broker_push: bool,
|
||||
pub enable_offload: bool,
|
||||
@@ -135,7 +136,6 @@ impl SafeKeeperConf {
|
||||
max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES,
|
||||
current_thread_runtime: false,
|
||||
walsenders_keep_horizon: false,
|
||||
partial_backup_enabled: false,
|
||||
partial_backup_timeout: Duration::from_secs(0),
|
||||
disable_periodic_broker_push: false,
|
||||
enable_offload: false,
|
||||
|
||||
49
safekeeper/src/rate_limit.rs
Normal file
49
safekeeper/src/rate_limit.rs
Normal file
@@ -0,0 +1,49 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use rand::Rng;
|
||||
|
||||
use crate::metrics::MISC_OPERATION_SECONDS;
|
||||
|
||||
/// Global rate limiter for background tasks.
|
||||
#[derive(Clone)]
|
||||
pub struct RateLimiter {
|
||||
partial_backup: Arc<tokio::sync::Semaphore>,
|
||||
eviction: Arc<tokio::sync::Semaphore>,
|
||||
}
|
||||
|
||||
impl RateLimiter {
|
||||
/// Create a new rate limiter.
|
||||
/// - `partial_backup_max`: maximum number of concurrent partial backups.
|
||||
/// - `eviction_max`: maximum number of concurrent timeline evictions.
|
||||
pub fn new(partial_backup_max: usize, eviction_max: usize) -> Self {
|
||||
Self {
|
||||
partial_backup: Arc::new(tokio::sync::Semaphore::new(partial_backup_max)),
|
||||
eviction: Arc::new(tokio::sync::Semaphore::new(eviction_max)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get a permit for partial backup. This will block if the maximum number of concurrent
|
||||
/// partial backups is reached.
|
||||
pub async fn acquire_partial_backup(&self) -> tokio::sync::OwnedSemaphorePermit {
|
||||
let _timer = MISC_OPERATION_SECONDS
|
||||
.with_label_values(&["partial_permit_acquire"])
|
||||
.start_timer();
|
||||
self.partial_backup
|
||||
.clone()
|
||||
.acquire_owned()
|
||||
.await
|
||||
.expect("semaphore is closed")
|
||||
}
|
||||
|
||||
/// Try to get a permit for timeline eviction. This will return None if the maximum number of
|
||||
/// concurrent timeline evictions is reached.
|
||||
pub fn try_acquire_eviction(&self) -> Option<tokio::sync::OwnedSemaphorePermit> {
|
||||
self.eviction.clone().try_acquire_owned().ok()
|
||||
}
|
||||
}
|
||||
|
||||
/// Generate a random duration that is a fraction of the given duration.
|
||||
pub fn rand_duration(duration: &std::time::Duration) -> std::time::Duration {
|
||||
let randf64 = rand::thread_rng().gen_range(0.0..1.0);
|
||||
duration.mul_f64(randf64)
|
||||
}
|
||||
@@ -25,6 +25,7 @@ use utils::{
|
||||
use storage_broker::proto::SafekeeperTimelineInfo;
|
||||
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
|
||||
|
||||
use crate::rate_limit::RateLimiter;
|
||||
use crate::receive_wal::WalReceivers;
|
||||
use crate::safekeeper::{
|
||||
AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, ServerInfo, Term, TermLsn,
|
||||
@@ -36,7 +37,7 @@ use crate::timeline_guard::ResidenceGuard;
|
||||
use crate::timeline_manager::{AtomicStatus, ManagerCtl};
|
||||
use crate::timelines_set::TimelinesSet;
|
||||
use crate::wal_backup::{self};
|
||||
use crate::wal_backup_partial::{PartialRemoteSegment, RateLimiter};
|
||||
use crate::wal_backup_partial::PartialRemoteSegment;
|
||||
use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION};
|
||||
|
||||
use crate::metrics::{FullTimelineInfo, WalStorageMetrics, MISC_OPERATION_SECONDS};
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
use anyhow::Context;
|
||||
use camino::Utf8PathBuf;
|
||||
use remote_storage::RemotePath;
|
||||
use std::time::Instant;
|
||||
use tokio::{
|
||||
fs::File,
|
||||
io::{AsyncRead, AsyncWriteExt},
|
||||
@@ -15,6 +14,7 @@ use utils::crashsafe::durable_rename;
|
||||
|
||||
use crate::{
|
||||
metrics::{EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED},
|
||||
rate_limit::rand_duration,
|
||||
timeline_manager::{Manager, StateSnapshot},
|
||||
wal_backup,
|
||||
wal_backup_partial::{self, PartialRemoteSegment},
|
||||
@@ -50,7 +50,6 @@ impl Manager {
|
||||
.flush_lsn
|
||||
.segment_number(self.wal_seg_size)
|
||||
== self.last_removed_segno + 1
|
||||
&& self.resident_since.elapsed() >= self.conf.eviction_min_resident
|
||||
}
|
||||
|
||||
/// Evict the timeline to remote storage.
|
||||
@@ -112,7 +111,8 @@ impl Manager {
|
||||
return;
|
||||
}
|
||||
|
||||
self.resident_since = Instant::now();
|
||||
self.evict_not_before =
|
||||
tokio::time::Instant::now() + rand_duration(&self.conf.eviction_min_resident);
|
||||
|
||||
info!("successfully restored evicted timeline");
|
||||
}
|
||||
|
||||
@@ -23,6 +23,7 @@ use utils::lsn::Lsn;
|
||||
use crate::{
|
||||
control_file::{FileStorage, Storage},
|
||||
metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS},
|
||||
rate_limit::{rand_duration, RateLimiter},
|
||||
recovery::recovery_main,
|
||||
remove_wal::calc_horizon_lsn,
|
||||
safekeeper::Term,
|
||||
@@ -32,7 +33,7 @@ use crate::{
|
||||
timeline_guard::{AccessService, GuardId, ResidenceGuard},
|
||||
timelines_set::{TimelineSetGuard, TimelinesSet},
|
||||
wal_backup::{self, WalBackupTaskHandle},
|
||||
wal_backup_partial::{self, PartialRemoteSegment, RateLimiter},
|
||||
wal_backup_partial::{self, PartialRemoteSegment},
|
||||
SafeKeeperConf,
|
||||
};
|
||||
|
||||
@@ -185,11 +186,11 @@ pub(crate) struct Manager {
|
||||
|
||||
// misc
|
||||
pub(crate) access_service: AccessService,
|
||||
pub(crate) partial_backup_rate_limiter: RateLimiter,
|
||||
pub(crate) global_rate_limiter: RateLimiter,
|
||||
|
||||
// Anti-flapping state: we evict timelines eagerly if they are inactive, but should not
|
||||
// evict them if they go inactive very soon after being restored.
|
||||
pub(crate) resident_since: std::time::Instant,
|
||||
pub(crate) evict_not_before: Instant,
|
||||
}
|
||||
|
||||
/// This task gets spawned alongside each timeline and is responsible for managing the timeline's
|
||||
@@ -202,7 +203,7 @@ pub async fn main_task(
|
||||
broker_active_set: Arc<TimelinesSet>,
|
||||
manager_tx: tokio::sync::mpsc::UnboundedSender<ManagerCtlMessage>,
|
||||
mut manager_rx: tokio::sync::mpsc::UnboundedReceiver<ManagerCtlMessage>,
|
||||
partial_backup_rate_limiter: RateLimiter,
|
||||
global_rate_limiter: RateLimiter,
|
||||
) {
|
||||
tli.set_status(Status::Started);
|
||||
|
||||
@@ -220,7 +221,7 @@ pub async fn main_task(
|
||||
conf,
|
||||
broker_active_set,
|
||||
manager_tx,
|
||||
partial_backup_rate_limiter,
|
||||
global_rate_limiter,
|
||||
)
|
||||
.await;
|
||||
|
||||
@@ -254,9 +255,29 @@ pub async fn main_task(
|
||||
mgr.set_status(Status::UpdatePartialBackup);
|
||||
mgr.update_partial_backup(&state_snapshot).await;
|
||||
|
||||
if mgr.conf.enable_offload && mgr.ready_for_eviction(&next_event, &state_snapshot) {
|
||||
mgr.set_status(Status::EvictTimeline);
|
||||
mgr.evict_timeline().await;
|
||||
let now = Instant::now();
|
||||
if mgr.evict_not_before > now {
|
||||
// we should wait until evict_not_before
|
||||
update_next_event(&mut next_event, mgr.evict_not_before);
|
||||
}
|
||||
|
||||
if mgr.conf.enable_offload
|
||||
&& mgr.evict_not_before <= now
|
||||
&& mgr.ready_for_eviction(&next_event, &state_snapshot)
|
||||
{
|
||||
// check rate limiter and evict timeline if possible
|
||||
match mgr.global_rate_limiter.try_acquire_eviction() {
|
||||
Some(_permit) => {
|
||||
mgr.set_status(Status::EvictTimeline);
|
||||
mgr.evict_timeline().await;
|
||||
}
|
||||
None => {
|
||||
// we can't evict timeline now, will try again later
|
||||
mgr.evict_not_before =
|
||||
Instant::now() + rand_duration(&mgr.conf.eviction_min_resident);
|
||||
update_next_event(&mut next_event, mgr.evict_not_before);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -334,11 +355,10 @@ impl Manager {
|
||||
conf: SafeKeeperConf,
|
||||
broker_active_set: Arc<TimelinesSet>,
|
||||
manager_tx: tokio::sync::mpsc::UnboundedSender<ManagerCtlMessage>,
|
||||
partial_backup_rate_limiter: RateLimiter,
|
||||
global_rate_limiter: RateLimiter,
|
||||
) -> Manager {
|
||||
let (is_offloaded, partial_backup_uploaded) = tli.bootstrap_mgr().await;
|
||||
Manager {
|
||||
conf,
|
||||
wal_seg_size: tli.get_wal_seg_size().await,
|
||||
walsenders: tli.get_walsenders().clone(),
|
||||
state_version_rx: tli.get_state_version_rx(),
|
||||
@@ -353,8 +373,10 @@ impl Manager {
|
||||
partial_backup_uploaded,
|
||||
access_service: AccessService::new(manager_tx),
|
||||
tli,
|
||||
partial_backup_rate_limiter,
|
||||
resident_since: std::time::Instant::now(),
|
||||
global_rate_limiter,
|
||||
// to smooth out evictions spike after restart
|
||||
evict_not_before: Instant::now() + rand_duration(&conf.eviction_min_resident),
|
||||
conf,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -522,8 +544,8 @@ impl Manager {
|
||||
|
||||
/// Spawns partial WAL backup task if needed.
|
||||
async fn update_partial_backup(&mut self, state: &StateSnapshot) {
|
||||
// check if partial backup is enabled and should be started
|
||||
if !self.conf.is_wal_backup_enabled() || !self.conf.partial_backup_enabled {
|
||||
// check if WAL backup is enabled and should be started
|
||||
if !self.conf.is_wal_backup_enabled() {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -541,7 +563,7 @@ impl Manager {
|
||||
self.partial_backup_task = Some(tokio::spawn(wal_backup_partial::main_task(
|
||||
self.wal_resident_timeline(),
|
||||
self.conf.clone(),
|
||||
self.partial_backup_rate_limiter.clone(),
|
||||
self.global_rate_limiter.clone(),
|
||||
)));
|
||||
}
|
||||
|
||||
|
||||
@@ -2,10 +2,11 @@
|
||||
//! All timelines should always be present in this map, this is done by loading them
|
||||
//! all from the disk on startup and keeping them in memory.
|
||||
|
||||
use crate::defaults::DEFAULT_EVICTION_CONCURRENCY;
|
||||
use crate::rate_limit::RateLimiter;
|
||||
use crate::safekeeper::ServerInfo;
|
||||
use crate::timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError};
|
||||
use crate::timelines_set::TimelinesSet;
|
||||
use crate::wal_backup_partial::RateLimiter;
|
||||
use crate::SafeKeeperConf;
|
||||
use anyhow::{bail, Context, Result};
|
||||
use camino::Utf8PathBuf;
|
||||
@@ -31,7 +32,7 @@ struct GlobalTimelinesState {
|
||||
conf: Option<SafeKeeperConf>,
|
||||
broker_active_set: Arc<TimelinesSet>,
|
||||
load_lock: Arc<tokio::sync::Mutex<TimelineLoadLock>>,
|
||||
partial_backup_rate_limiter: RateLimiter,
|
||||
global_rate_limiter: RateLimiter,
|
||||
}
|
||||
|
||||
// Used to prevent concurrent timeline loading.
|
||||
@@ -50,7 +51,7 @@ impl GlobalTimelinesState {
|
||||
(
|
||||
self.get_conf().clone(),
|
||||
self.broker_active_set.clone(),
|
||||
self.partial_backup_rate_limiter.clone(),
|
||||
self.global_rate_limiter.clone(),
|
||||
)
|
||||
}
|
||||
|
||||
@@ -85,7 +86,7 @@ static TIMELINES_STATE: Lazy<Mutex<GlobalTimelinesState>> = Lazy::new(|| {
|
||||
conf: None,
|
||||
broker_active_set: Arc::new(TimelinesSet::default()),
|
||||
load_lock: Arc::new(tokio::sync::Mutex::new(TimelineLoadLock)),
|
||||
partial_backup_rate_limiter: RateLimiter::new(1),
|
||||
global_rate_limiter: RateLimiter::new(1, 1),
|
||||
})
|
||||
});
|
||||
|
||||
@@ -99,7 +100,10 @@ impl GlobalTimelines {
|
||||
// lock, so use explicit block
|
||||
let tenants_dir = {
|
||||
let mut state = TIMELINES_STATE.lock().unwrap();
|
||||
state.partial_backup_rate_limiter = RateLimiter::new(conf.partial_backup_concurrency);
|
||||
state.global_rate_limiter = RateLimiter::new(
|
||||
conf.partial_backup_concurrency,
|
||||
DEFAULT_EVICTION_CONCURRENCY,
|
||||
);
|
||||
state.conf = Some(conf);
|
||||
|
||||
// Iterate through all directories and load tenants for all directories
|
||||
|
||||
@@ -18,8 +18,6 @@
|
||||
//! This way control file stores information about all potentially existing
|
||||
//! remote partial segments and can clean them up after uploading a newer version.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use camino::Utf8PathBuf;
|
||||
use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI};
|
||||
use remote_storage::RemotePath;
|
||||
@@ -30,6 +28,7 @@ use utils::lsn::Lsn;
|
||||
|
||||
use crate::{
|
||||
metrics::{MISC_OPERATION_SECONDS, PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS},
|
||||
rate_limit::{rand_duration, RateLimiter},
|
||||
safekeeper::Term,
|
||||
timeline::WalResidentTimeline,
|
||||
timeline_manager::StateSnapshot,
|
||||
@@ -37,30 +36,6 @@ use crate::{
|
||||
SafeKeeperConf,
|
||||
};
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct RateLimiter {
|
||||
semaphore: Arc<tokio::sync::Semaphore>,
|
||||
}
|
||||
|
||||
impl RateLimiter {
|
||||
pub fn new(permits: usize) -> Self {
|
||||
Self {
|
||||
semaphore: Arc::new(tokio::sync::Semaphore::new(permits)),
|
||||
}
|
||||
}
|
||||
|
||||
async fn acquire_owned(&self) -> tokio::sync::OwnedSemaphorePermit {
|
||||
let _timer = MISC_OPERATION_SECONDS
|
||||
.with_label_values(&["partial_permit_acquire"])
|
||||
.start_timer();
|
||||
self.semaphore
|
||||
.clone()
|
||||
.acquire_owned()
|
||||
.await
|
||||
.expect("semaphore is closed")
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub enum UploadStatus {
|
||||
/// Upload is in progress. This status should be used only for garbage collection,
|
||||
@@ -352,6 +327,7 @@ pub async fn main_task(
|
||||
) -> Option<PartialRemoteSegment> {
|
||||
debug!("started");
|
||||
let await_duration = conf.partial_backup_timeout;
|
||||
let mut first_iteration = true;
|
||||
|
||||
let (_, persistent_state) = tli.get_state().await;
|
||||
let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx();
|
||||
@@ -419,6 +395,15 @@ pub async fn main_task(
|
||||
}
|
||||
}
|
||||
|
||||
// smoothing the load after restart, by sleeping for a random time.
|
||||
// if this is not the first iteration, we will wait for the full await_duration
|
||||
let await_duration = if first_iteration {
|
||||
first_iteration = false;
|
||||
rand_duration(&await_duration)
|
||||
} else {
|
||||
await_duration
|
||||
};
|
||||
|
||||
// fixing the segno and waiting some time to prevent reuploading the same segment too often
|
||||
let pending_segno = backup.segno(flush_lsn_rx.borrow().lsn);
|
||||
let timeout = tokio::time::sleep(await_duration);
|
||||
@@ -454,7 +439,7 @@ pub async fn main_task(
|
||||
}
|
||||
|
||||
// limit concurrent uploads
|
||||
let _upload_permit = limiter.acquire_owned().await;
|
||||
let _upload_permit = limiter.acquire_partial_backup().await;
|
||||
|
||||
let prepared = backup.prepare_upload().await;
|
||||
if let Some(seg) = &uploaded_segment {
|
||||
|
||||
@@ -181,7 +181,6 @@ pub fn run_server(os: NodeOs, disk: Arc<SafekeeperDisk>) -> Result<()> {
|
||||
sk_auth_token: None,
|
||||
current_thread_runtime: false,
|
||||
walsenders_keep_horizon: false,
|
||||
partial_backup_enabled: false,
|
||||
partial_backup_timeout: Duration::from_secs(0),
|
||||
disable_periodic_broker_push: false,
|
||||
enable_offload: false,
|
||||
|
||||
@@ -67,6 +67,7 @@ FALLBACK_DURATION = {
|
||||
"test_runner/performance/test_copy.py::test_copy[neon]": 13.817,
|
||||
"test_runner/performance/test_copy.py::test_copy[vanilla]": 11.736,
|
||||
"test_runner/performance/test_gc_feedback.py::test_gc_feedback": 575.735,
|
||||
"test_runner/performance/test_gc_feedback.py::test_gc_feedback_with_snapshots": 575.735,
|
||||
"test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 14.868,
|
||||
"test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 14.393,
|
||||
"test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 20.588,
|
||||
|
||||
@@ -642,8 +642,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
logging::replace_panic_hook_with_tracing_panic_hook().forget();
|
||||
// initialize sentry if SENTRY_DSN is provided
|
||||
let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
|
||||
info!("version: {GIT_VERSION}");
|
||||
info!("build_tag: {BUILD_TAG}");
|
||||
info!("version: {GIT_VERSION} build_tag: {BUILD_TAG}");
|
||||
metrics::set_build_info_metric(GIT_VERSION, BUILD_TAG);
|
||||
|
||||
// On any shutdown signal, log receival and exit.
|
||||
|
||||
@@ -32,6 +32,7 @@ once_cell.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
pageserver_client.workspace = true
|
||||
postgres_connection.workspace = true
|
||||
rand.workspace = true
|
||||
reqwest = { workspace = true, features = ["stream"] }
|
||||
routerify.workspace = true
|
||||
serde.workspace = true
|
||||
|
||||
@@ -9,12 +9,14 @@ use std::time::Duration;
|
||||
use storage_controller::http::make_router;
|
||||
use storage_controller::metrics::preinitialize_metrics;
|
||||
use storage_controller::persistence::Persistence;
|
||||
use storage_controller::service::chaos_injector::ChaosInjector;
|
||||
use storage_controller::service::{
|
||||
Config, Service, MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT,
|
||||
RECONCILER_CONCURRENCY_DEFAULT,
|
||||
};
|
||||
use tokio::signal::unix::SignalKind;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::Instrument;
|
||||
use utils::auth::{JwtAuth, SwappableJwtAuth};
|
||||
use utils::logging::{self, LogFormat};
|
||||
|
||||
@@ -86,6 +88,10 @@ struct Cli {
|
||||
// TODO: make `cfg(feature = "testing")`
|
||||
#[arg(long)]
|
||||
neon_local_repo_dir: Option<PathBuf>,
|
||||
|
||||
/// Chaos testing
|
||||
#[arg(long)]
|
||||
chaos_interval: Option<humantime::Duration>,
|
||||
}
|
||||
|
||||
enum StrictMode {
|
||||
@@ -309,6 +315,22 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
tracing::info!("Serving on {0}", args.listen);
|
||||
let server_task = tokio::task::spawn(server);
|
||||
|
||||
let chaos_task = args.chaos_interval.map(|interval| {
|
||||
let service = service.clone();
|
||||
let cancel = CancellationToken::new();
|
||||
let cancel_bg = cancel.clone();
|
||||
(
|
||||
tokio::task::spawn(
|
||||
async move {
|
||||
let mut chaos_injector = ChaosInjector::new(service, interval.into());
|
||||
chaos_injector.run(cancel_bg).await
|
||||
}
|
||||
.instrument(tracing::info_span!("chaos_injector")),
|
||||
),
|
||||
cancel,
|
||||
)
|
||||
});
|
||||
|
||||
// Wait until we receive a signal
|
||||
let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?;
|
||||
let mut sigquit = tokio::signal::unix::signal(SignalKind::quit())?;
|
||||
@@ -337,6 +359,12 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
}
|
||||
}
|
||||
|
||||
// If we were injecting chaos, stop that so that we're not calling into Service while it shuts down
|
||||
if let Some((chaos_jh, chaos_cancel)) = chaos_task {
|
||||
chaos_cancel.cancel();
|
||||
chaos_jh.await.ok();
|
||||
}
|
||||
|
||||
service.shutdown().await;
|
||||
tracing::info!("Service shutdown complete");
|
||||
|
||||
|
||||
@@ -656,11 +656,8 @@ impl Reconciler {
|
||||
// reconcile this location. This includes locations with different configurations, as well
|
||||
// as locations with unknown (None) observed state.
|
||||
|
||||
// The general case is to increment the generation. However, there are cases
|
||||
// where this is not necessary:
|
||||
// - if we are only updating the TenantConf part of the location
|
||||
// - if we are only changing the attachment mode (e.g. going to attachedmulti or attachedstale)
|
||||
// and the location was already in the correct generation
|
||||
// Incrementing generation is the safe general case, but is inefficient for changes that only
|
||||
// modify some details (e.g. the tenant's config).
|
||||
let increment_generation = match observed {
|
||||
None => true,
|
||||
Some(ObservedStateLocation { conf: None }) => true,
|
||||
@@ -669,18 +666,11 @@ impl Reconciler {
|
||||
}) => {
|
||||
let generations_match = observed.generation == wanted_conf.generation;
|
||||
|
||||
use LocationConfigMode::*;
|
||||
let mode_transition_requires_gen_inc =
|
||||
match (observed.mode, wanted_conf.mode) {
|
||||
// Usually the short-lived attachment modes (multi and stale) are only used
|
||||
// in the case of [`Self::live_migrate`], but it is simple to handle them correctly
|
||||
// here too. Locations are allowed to go Single->Stale and Multi->Single within the same generation.
|
||||
(AttachedSingle, AttachedStale) => false,
|
||||
(AttachedMulti, AttachedSingle) => false,
|
||||
(lhs, rhs) => lhs != rhs,
|
||||
};
|
||||
|
||||
!generations_match || mode_transition_requires_gen_inc
|
||||
// We may skip incrementing the generation if the location is already in the expected mode and
|
||||
// generation. In principle it would also be safe to skip from certain other modes (e.g. AttachedStale),
|
||||
// but such states are handled inside `live_migrate`, and if we see that state here we're cleaning up
|
||||
// after a restart/crash, so fall back to the universally safe path of incrementing generation.
|
||||
!generations_match || (observed.mode != wanted_conf.mode)
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -84,6 +84,8 @@ use crate::{
|
||||
};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
pub mod chaos_injector;
|
||||
|
||||
// For operations that should be quick, like attaching a new tenant
|
||||
const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5);
|
||||
|
||||
|
||||
71
storage_controller/src/service/chaos_injector.rs
Normal file
71
storage_controller/src/service/chaos_injector.rs
Normal file
@@ -0,0 +1,71 @@
|
||||
use std::{sync::Arc, time::Duration};
|
||||
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::thread_rng;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
use super::Service;
|
||||
|
||||
pub struct ChaosInjector {
|
||||
service: Arc<Service>,
|
||||
interval: Duration,
|
||||
}
|
||||
|
||||
impl ChaosInjector {
|
||||
pub fn new(service: Arc<Service>, interval: Duration) -> Self {
|
||||
Self { service, interval }
|
||||
}
|
||||
|
||||
pub async fn run(&mut self, cancel: CancellationToken) {
|
||||
let mut interval = tokio::time::interval(self.interval);
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = interval.tick() => {}
|
||||
_ = cancel.cancelled() => {
|
||||
tracing::info!("Shutting down");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
self.inject_chaos().await;
|
||||
|
||||
tracing::info!("Chaos iteration...");
|
||||
}
|
||||
}
|
||||
|
||||
async fn inject_chaos(&mut self) {
|
||||
// Pick some shards to interfere with
|
||||
let batch_size = 128;
|
||||
let mut inner = self.service.inner.write().unwrap();
|
||||
let (nodes, tenants, scheduler) = inner.parts_mut();
|
||||
let tenant_ids = tenants.keys().cloned().collect::<Vec<_>>();
|
||||
let victims = tenant_ids.choose_multiple(&mut thread_rng(), batch_size);
|
||||
|
||||
for victim in victims {
|
||||
let shard = tenants
|
||||
.get_mut(victim)
|
||||
.expect("Held lock between choosing ID and this get");
|
||||
|
||||
// Pick a secondary to promote
|
||||
let Some(new_location) = shard
|
||||
.intent
|
||||
.get_secondary()
|
||||
.choose(&mut thread_rng())
|
||||
.cloned()
|
||||
else {
|
||||
tracing::info!("Skipping shard {victim}: no secondary location, can't migrate");
|
||||
continue;
|
||||
};
|
||||
|
||||
let Some(old_location) = *shard.intent.get_attached() else {
|
||||
tracing::info!("Skipping shard {victim}: currently has no attached location");
|
||||
continue;
|
||||
};
|
||||
|
||||
shard.intent.demote_attached(scheduler, old_location);
|
||||
shard.intent.promote_attached(scheduler, new_location);
|
||||
self.service.maybe_reconcile_shard(shard, nodes);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -10,6 +10,7 @@ aws-smithy-async.workspace = true
|
||||
either.workspace = true
|
||||
tokio-rustls.workspace = true
|
||||
anyhow.workspace = true
|
||||
git-version.workspace = true
|
||||
hex.workspace = true
|
||||
humantime.workspace = true
|
||||
thiserror.workspace = true
|
||||
|
||||
@@ -40,6 +40,11 @@ impl TimelineAnalysis {
|
||||
garbage_keys: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Whether a timeline is healthy.
|
||||
pub(crate) fn is_healthy(&self) -> bool {
|
||||
self.errors.is_empty() && self.warnings.is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn branch_cleanup_and_check_errors(
|
||||
@@ -87,7 +92,7 @@ pub(crate) async fn branch_cleanup_and_check_errors(
|
||||
.push(format!("index_part.json version: {}", index_part.version()))
|
||||
}
|
||||
|
||||
let mut newest_versions = IndexPart::KNOWN_VERSIONS.iter().rev().take(2);
|
||||
let mut newest_versions = IndexPart::KNOWN_VERSIONS.iter().rev().take(3);
|
||||
if !newest_versions.any(|ip| ip == &index_part.version()) {
|
||||
info!(
|
||||
"index_part.json version is not latest: {}",
|
||||
@@ -167,8 +172,11 @@ pub(crate) async fn branch_cleanup_and_check_errors(
|
||||
}
|
||||
}
|
||||
BlobDataParseResult::Relic => {}
|
||||
BlobDataParseResult::Incorrect(parse_errors) => result.errors.extend(
|
||||
parse_errors
|
||||
BlobDataParseResult::Incorrect {
|
||||
errors,
|
||||
s3_layers: _,
|
||||
} => result.errors.extend(
|
||||
errors
|
||||
.into_iter()
|
||||
.map(|error| format!("parse error: {error}")),
|
||||
),
|
||||
@@ -295,7 +303,10 @@ pub(crate) enum BlobDataParseResult {
|
||||
},
|
||||
/// The remains of a deleted Timeline (i.e. an initdb archive only)
|
||||
Relic,
|
||||
Incorrect(Vec<String>),
|
||||
Incorrect {
|
||||
errors: Vec<String>,
|
||||
s3_layers: HashSet<(LayerName, Generation)>,
|
||||
},
|
||||
}
|
||||
|
||||
pub(crate) fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generation), String> {
|
||||
@@ -438,7 +449,7 @@ pub(crate) async fn list_timeline_blobs(
|
||||
}
|
||||
|
||||
Ok(S3TimelineBlobData {
|
||||
blob_data: BlobDataParseResult::Incorrect(errors),
|
||||
blob_data: BlobDataParseResult::Incorrect { errors, s3_layers },
|
||||
unused_index_keys: index_part_keys,
|
||||
unknown_keys,
|
||||
})
|
||||
|
||||
@@ -19,8 +19,8 @@ use utils::id::TenantId;
|
||||
|
||||
use crate::{
|
||||
cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData},
|
||||
init_remote, init_remote_generic, list_objects_with_retries,
|
||||
metadata_stream::{stream_tenant_timelines, stream_tenants},
|
||||
init_remote_generic, list_objects_with_retries_generic,
|
||||
metadata_stream::{stream_tenant_timelines_generic, stream_tenants_generic},
|
||||
BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth,
|
||||
};
|
||||
|
||||
@@ -153,7 +153,7 @@ async fn find_garbage_inner(
|
||||
node_kind: NodeKind,
|
||||
) -> anyhow::Result<GarbageList> {
|
||||
// Construct clients for S3 and for Console API
|
||||
let (s3_client, target) = init_remote(bucket_config.clone(), node_kind).await?;
|
||||
let (remote_client, target) = init_remote_generic(bucket_config.clone(), node_kind).await?;
|
||||
let cloud_admin_api_client = Arc::new(CloudAdminApiClient::new(console_config));
|
||||
|
||||
// Build a set of console-known tenants, for quickly eliminating known-active tenants without having
|
||||
@@ -179,7 +179,7 @@ async fn find_garbage_inner(
|
||||
|
||||
// Enumerate Tenants in S3, and check if each one exists in Console
|
||||
tracing::info!("Finding all tenants in bucket {}...", bucket_config.bucket);
|
||||
let tenants = stream_tenants(&s3_client, &target);
|
||||
let tenants = stream_tenants_generic(&remote_client, &target);
|
||||
let tenants_checked = tenants.map_ok(|t| {
|
||||
let api_client = cloud_admin_api_client.clone();
|
||||
let console_cache = console_cache.clone();
|
||||
@@ -237,25 +237,26 @@ async fn find_garbage_inner(
|
||||
// Special case: If it's missing in console, check for known bugs that would enable us to conclusively
|
||||
// identify it as purge-able anyway
|
||||
if console_result.is_none() {
|
||||
let timelines = stream_tenant_timelines(&s3_client, &target, tenant_shard_id)
|
||||
.await?
|
||||
.collect::<Vec<_>>()
|
||||
.await;
|
||||
let timelines =
|
||||
stream_tenant_timelines_generic(&remote_client, &target, tenant_shard_id)
|
||||
.await?
|
||||
.collect::<Vec<_>>()
|
||||
.await;
|
||||
if timelines.is_empty() {
|
||||
// No timelines, but a heatmap: the deletion bug where we deleted everything but heatmaps
|
||||
let tenant_objects = list_objects_with_retries(
|
||||
&s3_client,
|
||||
let tenant_objects = list_objects_with_retries_generic(
|
||||
&remote_client,
|
||||
ListingMode::WithDelimiter,
|
||||
&target.tenant_root(&tenant_shard_id),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
let object = tenant_objects.contents.as_ref().unwrap().first().unwrap();
|
||||
if object.key.as_ref().unwrap().ends_with("heatmap-v1.json") {
|
||||
let object = tenant_objects.keys.first().unwrap();
|
||||
if object.key.get_path().as_str().ends_with("heatmap-v1.json") {
|
||||
tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)");
|
||||
garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id));
|
||||
continue;
|
||||
} else {
|
||||
tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key.as_ref().unwrap());
|
||||
tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key);
|
||||
}
|
||||
} else {
|
||||
// A console-unknown tenant with timelines: check if these timelines only contain initdb.tar.zst, from the initial
|
||||
@@ -264,24 +265,18 @@ async fn find_garbage_inner(
|
||||
|
||||
for timeline_r in timelines {
|
||||
let timeline = timeline_r?;
|
||||
let timeline_objects = list_objects_with_retries(
|
||||
&s3_client,
|
||||
let timeline_objects = list_objects_with_retries_generic(
|
||||
&remote_client,
|
||||
ListingMode::WithDelimiter,
|
||||
&target.timeline_root(&timeline),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
if timeline_objects
|
||||
.common_prefixes
|
||||
.as_ref()
|
||||
.map(|v| v.len())
|
||||
.unwrap_or(0)
|
||||
> 0
|
||||
{
|
||||
if !timeline_objects.prefixes.is_empty() {
|
||||
// Sub-paths? Unexpected
|
||||
any_non_initdb = true;
|
||||
} else {
|
||||
let object = timeline_objects.contents.as_ref().unwrap().first().unwrap();
|
||||
if object.key.as_ref().unwrap().ends_with("initdb.tar.zst") {
|
||||
let object = timeline_objects.keys.first().unwrap();
|
||||
if object.key.get_path().as_str().ends_with("initdb.tar.zst") {
|
||||
tracing::info!("Timeline {timeline} contains only initdb.tar.zst");
|
||||
} else {
|
||||
any_non_initdb = true;
|
||||
@@ -336,7 +331,8 @@ async fn find_garbage_inner(
|
||||
|
||||
// Construct a stream of all timelines within active tenants
|
||||
let active_tenants = tokio_stream::iter(active_tenants.iter().map(Ok));
|
||||
let timelines = active_tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, *t));
|
||||
let timelines =
|
||||
active_tenants.map_ok(|t| stream_tenant_timelines_generic(&remote_client, &target, *t));
|
||||
let timelines = timelines.try_buffer_unordered(S3_CONCURRENCY);
|
||||
let timelines = timelines.try_flatten();
|
||||
|
||||
|
||||
@@ -16,6 +16,7 @@ use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use aws_config::retry::{RetryConfigBuilder, RetryMode};
|
||||
use aws_sdk_s3::config::Region;
|
||||
use aws_sdk_s3::error::DisplayErrorContext;
|
||||
use aws_sdk_s3::Client;
|
||||
@@ -32,6 +33,7 @@ use remote_storage::{
|
||||
};
|
||||
use reqwest::Url;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use storage_controller_client::control_api;
|
||||
use tokio::io::AsyncReadExt;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::error;
|
||||
@@ -255,6 +257,12 @@ pub struct ControllerClientConfig {
|
||||
pub controller_jwt: String,
|
||||
}
|
||||
|
||||
impl ControllerClientConfig {
|
||||
pub fn build_client(self) -> control_api::Client {
|
||||
control_api::Client::new(self.controller_api, Some(self.controller_jwt))
|
||||
}
|
||||
}
|
||||
|
||||
pub struct ConsoleConfig {
|
||||
pub token: String,
|
||||
pub base_url: Url,
|
||||
@@ -307,8 +315,15 @@ pub fn init_logging(file_name: &str) -> Option<WorkerGuard> {
|
||||
}
|
||||
|
||||
async fn init_s3_client(bucket_region: Region) -> Client {
|
||||
let mut retry_config_builder = RetryConfigBuilder::new();
|
||||
|
||||
retry_config_builder
|
||||
.set_max_attempts(Some(3))
|
||||
.set_mode(Some(RetryMode::Adaptive));
|
||||
|
||||
let config = aws_config::defaults(aws_config::BehaviorVersion::v2024_03_28())
|
||||
.region(bucket_region)
|
||||
.retry_config(retry_config_builder.build())
|
||||
.load()
|
||||
.await;
|
||||
Client::new(&config)
|
||||
@@ -420,6 +435,7 @@ async fn list_objects_with_retries(
|
||||
Err(anyhow!("unreachable unless MAX_RETRIES==0"))
|
||||
}
|
||||
|
||||
/// Listing possibly large amounts of keys in a streaming fashion.
|
||||
fn stream_objects_with_retries<'a>(
|
||||
storage_client: &'a GenericRemoteStorage,
|
||||
listing_mode: ListingMode,
|
||||
@@ -458,6 +474,45 @@ fn stream_objects_with_retries<'a>(
|
||||
}
|
||||
}
|
||||
|
||||
/// If you want to list a bounded amount of prefixes or keys. For larger numbers of keys/prefixes,
|
||||
/// use [`stream_objects_with_retries`] instead.
|
||||
async fn list_objects_with_retries_generic(
|
||||
remote_client: &GenericRemoteStorage,
|
||||
listing_mode: ListingMode,
|
||||
s3_target: &S3Target,
|
||||
) -> anyhow::Result<Listing> {
|
||||
let cancel = CancellationToken::new();
|
||||
let prefix_str = &s3_target
|
||||
.prefix_in_bucket
|
||||
.strip_prefix("/")
|
||||
.unwrap_or(&s3_target.prefix_in_bucket);
|
||||
let prefix = RemotePath::from_string(prefix_str)?;
|
||||
for trial in 0..MAX_RETRIES {
|
||||
match remote_client
|
||||
.list(Some(&prefix), listing_mode, None, &cancel)
|
||||
.await
|
||||
{
|
||||
Ok(response) => return Ok(response),
|
||||
Err(e) => {
|
||||
if trial == MAX_RETRIES - 1 {
|
||||
return Err(e)
|
||||
.with_context(|| format!("Failed to list objects {MAX_RETRIES} times"));
|
||||
}
|
||||
error!(
|
||||
"list_objects_v2 query failed: bucket_name={}, prefix={}, delimiter={}, error={}",
|
||||
s3_target.bucket_name,
|
||||
s3_target.prefix_in_bucket,
|
||||
s3_target.delimiter,
|
||||
DisplayErrorContext(e),
|
||||
);
|
||||
let backoff_time = 1 << trial.max(5);
|
||||
tokio::time::sleep(Duration::from_secs(backoff_time)).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
panic!("MAX_RETRIES is not allowed to be 0");
|
||||
}
|
||||
|
||||
async fn download_object_with_retries(
|
||||
s3_client: &Client,
|
||||
bucket_name: &str,
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
use anyhow::{anyhow, bail};
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::controller_api::{MetadataHealthUpdateRequest, MetadataHealthUpdateResponse};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use reqwest::Url;
|
||||
use reqwest::{Method, Url};
|
||||
use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
|
||||
use storage_scrubber::pageserver_physical_gc::GcMode;
|
||||
use storage_scrubber::scan_pageserver_metadata::scan_metadata;
|
||||
@@ -16,6 +17,11 @@ use storage_scrubber::{
|
||||
use clap::{Parser, Subcommand};
|
||||
use utils::id::TenantId;
|
||||
|
||||
use utils::{project_build_tag, project_git_version};
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
project_build_tag!(BUILD_TAG);
|
||||
|
||||
#[derive(Parser)]
|
||||
#[command(author, version, about, long_about = None)]
|
||||
#[command(arg_required_else_help(true))]
|
||||
@@ -61,6 +67,8 @@ enum Command {
|
||||
json: bool,
|
||||
#[arg(long = "tenant-id", num_args = 0..)]
|
||||
tenant_ids: Vec<TenantShardId>,
|
||||
#[arg(long = "post", default_value_t = false)]
|
||||
post_to_storage_controller: bool,
|
||||
#[arg(long, default_value = None)]
|
||||
/// For safekeeper node_kind only, points to db with debug dump
|
||||
dump_db_connstr: Option<String>,
|
||||
@@ -98,6 +106,8 @@ enum Command {
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let cli = Cli::parse();
|
||||
|
||||
tracing::info!("version: {}, build_tag {}", GIT_VERSION, BUILD_TAG);
|
||||
|
||||
let bucket_config = BucketConfig::from_env()?;
|
||||
|
||||
let command_log_name = match &cli.command {
|
||||
@@ -116,11 +126,20 @@ async fn main() -> anyhow::Result<()> {
|
||||
chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S")
|
||||
));
|
||||
|
||||
let controller_client_conf = cli.controller_api.map(|controller_api| {
|
||||
ControllerClientConfig {
|
||||
controller_api,
|
||||
// Default to no key: this is a convenience when working in a development environment
|
||||
controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()),
|
||||
}
|
||||
});
|
||||
|
||||
match cli.command {
|
||||
Command::ScanMetadata {
|
||||
json,
|
||||
tenant_ids,
|
||||
node_kind,
|
||||
post_to_storage_controller,
|
||||
dump_db_connstr,
|
||||
dump_db_table,
|
||||
} => {
|
||||
@@ -159,6 +178,9 @@ async fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
Ok(())
|
||||
} else {
|
||||
if controller_client_conf.is_none() && post_to_storage_controller {
|
||||
return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run"));
|
||||
}
|
||||
match scan_metadata(bucket_config.clone(), tenant_ids).await {
|
||||
Err(e) => {
|
||||
tracing::error!("Failed: {e}");
|
||||
@@ -170,22 +192,37 @@ async fn main() -> anyhow::Result<()> {
|
||||
} else {
|
||||
println!("{}", summary.summary_string());
|
||||
}
|
||||
|
||||
if post_to_storage_controller {
|
||||
if let Some(conf) = controller_client_conf {
|
||||
let controller_client = conf.build_client();
|
||||
let body = summary.build_health_update_request();
|
||||
controller_client
|
||||
.dispatch::<MetadataHealthUpdateRequest, MetadataHealthUpdateResponse>(
|
||||
Method::POST,
|
||||
"control/v1/metadata_health/update".to_string(),
|
||||
Some(body),
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
|
||||
if summary.is_fatal() {
|
||||
Err(anyhow::anyhow!("Fatal scrub errors detected"))
|
||||
tracing::error!("Fatal scrub errors detected");
|
||||
} else if summary.is_empty() {
|
||||
// Strictly speaking an empty bucket is a valid bucket, but if someone ran the
|
||||
// scrubber they were likely expecting to scan something, and if we see no timelines
|
||||
// at all then it's likely due to some configuration issues like a bad prefix
|
||||
Err(anyhow::anyhow!(
|
||||
tracing::error!(
|
||||
"No timelines found in bucket {} prefix {}",
|
||||
bucket_config.bucket,
|
||||
bucket_config
|
||||
.prefix_in_bucket
|
||||
.unwrap_or("<none>".to_string())
|
||||
))
|
||||
} else {
|
||||
Ok(())
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -217,14 +254,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
min_age,
|
||||
mode,
|
||||
} => {
|
||||
let controller_client_conf = cli.controller_api.map(|controller_api| {
|
||||
ControllerClientConfig {
|
||||
controller_api,
|
||||
// Default to no key: this is a convenience when working in a development environment
|
||||
controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()),
|
||||
}
|
||||
});
|
||||
|
||||
match (&controller_client_conf, mode) {
|
||||
(Some(_), _) => {
|
||||
// Any mode may run when controller API is set
|
||||
|
||||
@@ -4,7 +4,7 @@ use anyhow::{anyhow, Context};
|
||||
use async_stream::{stream, try_stream};
|
||||
use aws_sdk_s3::{types::ObjectIdentifier, Client};
|
||||
use futures::StreamExt;
|
||||
use remote_storage::{GenericRemoteStorage, ListingMode};
|
||||
use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePath};
|
||||
use tokio_stream::Stream;
|
||||
|
||||
use crate::{
|
||||
@@ -189,6 +189,63 @@ pub async fn stream_tenant_timelines<'a>(
|
||||
})
|
||||
}
|
||||
|
||||
/// Given a `TenantShardId`, output a stream of the timelines within that tenant, discovered
|
||||
/// using a listing. The listing is done before the stream is built, so that this
|
||||
/// function can be used to generate concurrency on a stream using buffer_unordered.
|
||||
pub async fn stream_tenant_timelines_generic<'a>(
|
||||
remote_client: &'a GenericRemoteStorage,
|
||||
target: &'a RootTarget,
|
||||
tenant: TenantShardId,
|
||||
) -> anyhow::Result<impl Stream<Item = Result<TenantShardTimelineId, anyhow::Error>> + 'a> {
|
||||
let mut timeline_ids: Vec<Result<TimelineId, anyhow::Error>> = Vec::new();
|
||||
let timelines_target = target.timelines_root(&tenant);
|
||||
|
||||
let mut objects_stream = std::pin::pin!(stream_objects_with_retries(
|
||||
remote_client,
|
||||
ListingMode::WithDelimiter,
|
||||
&timelines_target
|
||||
));
|
||||
loop {
|
||||
tracing::debug!("Listing in {tenant}");
|
||||
let fetch_response = match objects_stream.next().await {
|
||||
None => break,
|
||||
Some(Err(e)) => {
|
||||
timeline_ids.push(Err(e));
|
||||
break;
|
||||
}
|
||||
Some(Ok(r)) => r,
|
||||
};
|
||||
|
||||
let new_entry_ids = fetch_response
|
||||
.prefixes
|
||||
.iter()
|
||||
.filter_map(|prefix| -> Option<&str> {
|
||||
prefix
|
||||
.get_path()
|
||||
.as_str()
|
||||
.strip_prefix(&timelines_target.prefix_in_bucket)?
|
||||
.strip_suffix('/')
|
||||
})
|
||||
.map(|entry_id_str| {
|
||||
entry_id_str
|
||||
.parse::<TimelineId>()
|
||||
.with_context(|| format!("Incorrect entry id str: {entry_id_str}"))
|
||||
});
|
||||
|
||||
for i in new_entry_ids {
|
||||
timeline_ids.push(i);
|
||||
}
|
||||
}
|
||||
|
||||
tracing::debug!("Yielding for {}", tenant);
|
||||
Ok(stream! {
|
||||
for i in timeline_ids {
|
||||
let id = i?;
|
||||
yield Ok(TenantShardTimelineId::new(tenant, id));
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn stream_listing<'a>(
|
||||
s3_client: &'a Client,
|
||||
target: &'a S3Target,
|
||||
@@ -219,3 +276,33 @@ pub(crate) fn stream_listing<'a>(
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn stream_listing_generic<'a>(
|
||||
remote_client: &'a GenericRemoteStorage,
|
||||
target: &'a S3Target,
|
||||
) -> impl Stream<Item = anyhow::Result<(RemotePath, Option<ListingObject>)>> + 'a {
|
||||
let listing_mode = if target.delimiter.is_empty() {
|
||||
ListingMode::NoDelimiter
|
||||
} else {
|
||||
ListingMode::WithDelimiter
|
||||
};
|
||||
try_stream! {
|
||||
let mut objects_stream = std::pin::pin!(stream_objects_with_retries(
|
||||
remote_client,
|
||||
listing_mode,
|
||||
target,
|
||||
));
|
||||
while let Some(list) = objects_stream.next().await {
|
||||
let list = list?;
|
||||
if target.delimiter.is_empty() {
|
||||
for key in list.keys {
|
||||
yield (key.key.clone(), Some(key));
|
||||
}
|
||||
} else {
|
||||
for key in list.prefixes {
|
||||
yield (key, None);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -389,10 +389,13 @@ async fn gc_ancestor(
|
||||
// Post-deletion tenant location: don't try and GC it.
|
||||
continue;
|
||||
}
|
||||
BlobDataParseResult::Incorrect(reasons) => {
|
||||
BlobDataParseResult::Incorrect {
|
||||
errors,
|
||||
s3_layers: _, // TODO(yuchen): could still check references to these s3 layers?
|
||||
} => {
|
||||
// Our primary purpose isn't to report on bad data, but log this rather than skipping silently
|
||||
tracing::warn!(
|
||||
"Skipping ancestor GC for timeline {ttid}, bad metadata: {reasons:?}"
|
||||
"Skipping ancestor GC for timeline {ttid}, bad metadata: {errors:?}"
|
||||
);
|
||||
continue;
|
||||
}
|
||||
@@ -518,9 +521,12 @@ pub async fn pageserver_physical_gc(
|
||||
// Post-deletion tenant location: don't try and GC it.
|
||||
return Ok(summary);
|
||||
}
|
||||
BlobDataParseResult::Incorrect(reasons) => {
|
||||
BlobDataParseResult::Incorrect {
|
||||
errors,
|
||||
s3_layers: _,
|
||||
} => {
|
||||
// Our primary purpose isn't to report on bad data, but log this rather than skipping silently
|
||||
tracing::warn!("Skipping timeline {ttid}, bad metadata: {reasons:?}");
|
||||
tracing::warn!("Skipping timeline {ttid}, bad metadata: {errors:?}");
|
||||
return Ok(summary);
|
||||
}
|
||||
};
|
||||
@@ -567,13 +573,7 @@ pub async fn pageserver_physical_gc(
|
||||
}
|
||||
|
||||
// Execute cross-shard GC, using the accumulator's full view of all the shards built in the per-shard GC
|
||||
let Some(controller_client) = controller_client_conf.as_ref().map(|c| {
|
||||
let ControllerClientConfig {
|
||||
controller_api,
|
||||
controller_jwt,
|
||||
} = c;
|
||||
control_api::Client::new(controller_api.clone(), Some(controller_jwt.clone()))
|
||||
}) else {
|
||||
let Some(controller_client) = controller_client_conf.map(|c| c.build_client()) else {
|
||||
tracing::info!("Skipping ancestor layer GC, because no `--controller-api` was specified");
|
||||
return Ok(summary);
|
||||
};
|
||||
|
||||
@@ -9,12 +9,13 @@ use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimeline
|
||||
use aws_sdk_s3::Client;
|
||||
use futures_util::{StreamExt, TryStreamExt};
|
||||
use pageserver::tenant::remote_timeline_client::remote_layer_path;
|
||||
use pageserver_api::controller_api::MetadataHealthUpdateRequest;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use serde::Serialize;
|
||||
use utils::id::TenantId;
|
||||
use utils::shard::ShardCount;
|
||||
|
||||
#[derive(Serialize)]
|
||||
#[derive(Serialize, Default)]
|
||||
pub struct MetadataSummary {
|
||||
tenant_count: usize,
|
||||
timeline_count: usize,
|
||||
@@ -23,19 +24,16 @@ pub struct MetadataSummary {
|
||||
with_warnings: HashSet<TenantShardTimelineId>,
|
||||
with_orphans: HashSet<TenantShardTimelineId>,
|
||||
indices_by_version: HashMap<usize, usize>,
|
||||
|
||||
#[serde(skip)]
|
||||
pub(crate) healthy_tenant_shards: HashSet<TenantShardId>,
|
||||
#[serde(skip)]
|
||||
pub(crate) unhealthy_tenant_shards: HashSet<TenantShardId>,
|
||||
}
|
||||
|
||||
impl MetadataSummary {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
tenant_count: 0,
|
||||
timeline_count: 0,
|
||||
timeline_shard_count: 0,
|
||||
with_errors: HashSet::new(),
|
||||
with_warnings: HashSet::new(),
|
||||
with_orphans: HashSet::new(),
|
||||
indices_by_version: HashMap::new(),
|
||||
}
|
||||
Self::default()
|
||||
}
|
||||
|
||||
fn update_data(&mut self, data: &S3TimelineBlobData) {
|
||||
@@ -54,6 +52,13 @@ impl MetadataSummary {
|
||||
}
|
||||
|
||||
fn update_analysis(&mut self, id: &TenantShardTimelineId, analysis: &TimelineAnalysis) {
|
||||
if analysis.is_healthy() {
|
||||
self.healthy_tenant_shards.insert(id.tenant_shard_id);
|
||||
} else {
|
||||
self.healthy_tenant_shards.remove(&id.tenant_shard_id);
|
||||
self.unhealthy_tenant_shards.insert(id.tenant_shard_id);
|
||||
}
|
||||
|
||||
if !analysis.errors.is_empty() {
|
||||
self.with_errors.insert(*id);
|
||||
}
|
||||
@@ -101,6 +106,13 @@ Index versions: {version_summary}
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.timeline_shard_count == 0
|
||||
}
|
||||
|
||||
pub fn build_health_update_request(&self) -> MetadataHealthUpdateRequest {
|
||||
MetadataHealthUpdateRequest {
|
||||
healthy_tenant_shards: self.healthy_tenant_shards.clone(),
|
||||
unhealthy_tenant_shards: self.unhealthy_tenant_shards.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics.
|
||||
@@ -278,13 +290,21 @@ pub async fn scan_metadata(
|
||||
}
|
||||
}
|
||||
|
||||
if let BlobDataParseResult::Parsed {
|
||||
index_part: _index_part,
|
||||
index_part_generation: _index_part_generation,
|
||||
s3_layers,
|
||||
} = &data.blob_data
|
||||
{
|
||||
tenant_objects.push(ttid, s3_layers.clone());
|
||||
match &data.blob_data {
|
||||
BlobDataParseResult::Parsed {
|
||||
index_part: _index_part,
|
||||
index_part_generation: _index_part_generation,
|
||||
s3_layers,
|
||||
} => {
|
||||
tenant_objects.push(ttid, s3_layers.clone());
|
||||
}
|
||||
BlobDataParseResult::Relic => (),
|
||||
BlobDataParseResult::Incorrect {
|
||||
errors: _,
|
||||
s3_layers,
|
||||
} => {
|
||||
tenant_objects.push(ttid, s3_layers.clone());
|
||||
}
|
||||
}
|
||||
tenant_timeline_results.push((ttid, data));
|
||||
}
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
use std::{collections::HashSet, str::FromStr, sync::Arc};
|
||||
|
||||
use aws_sdk_s3::Client;
|
||||
use futures::stream::{StreamExt, TryStreamExt};
|
||||
use once_cell::sync::OnceCell;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use postgres_ffi::{XLogFileName, PG_TLI};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use serde::Serialize;
|
||||
use tokio_postgres::types::PgLsn;
|
||||
use tracing::{error, info, trace};
|
||||
@@ -14,8 +14,9 @@ use utils::{
|
||||
};
|
||||
|
||||
use crate::{
|
||||
cloud_admin_api::CloudAdminApiClient, init_remote, metadata_stream::stream_listing,
|
||||
BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId,
|
||||
cloud_admin_api::CloudAdminApiClient, init_remote_generic,
|
||||
metadata_stream::stream_listing_generic, BucketConfig, ConsoleConfig, NodeKind, RootTarget,
|
||||
TenantShardTimelineId,
|
||||
};
|
||||
|
||||
/// Generally we should ask safekeepers, but so far we use everywhere default 16MB.
|
||||
@@ -106,7 +107,7 @@ pub async fn scan_safekeeper_metadata(
|
||||
let timelines = client.query(&query, &[]).await?;
|
||||
info!("loaded {} timelines", timelines.len());
|
||||
|
||||
let (s3_client, target) = init_remote(bucket_config, NodeKind::Safekeeper).await?;
|
||||
let (remote_client, target) = init_remote_generic(bucket_config, NodeKind::Safekeeper).await?;
|
||||
let console_config = ConsoleConfig::from_env()?;
|
||||
let cloud_admin_api_client = CloudAdminApiClient::new(console_config);
|
||||
|
||||
@@ -119,7 +120,7 @@ pub async fn scan_safekeeper_metadata(
|
||||
let backup_lsn: Lsn = Lsn(u64::from(backup_lsn_pg));
|
||||
let ttid = TenantTimelineId::new(tenant_id, timeline_id);
|
||||
check_timeline(
|
||||
&s3_client,
|
||||
&remote_client,
|
||||
&target,
|
||||
&cloud_admin_api_client,
|
||||
ttid,
|
||||
@@ -156,7 +157,7 @@ struct TimelineCheckResult {
|
||||
/// errors are logged to stderr; returns Ok(true) if timeline is consistent,
|
||||
/// Ok(false) if not, Err if failed to check.
|
||||
async fn check_timeline(
|
||||
s3_client: &Client,
|
||||
remote_client: &GenericRemoteStorage,
|
||||
root: &RootTarget,
|
||||
api_client: &CloudAdminApiClient,
|
||||
ttid: TenantTimelineId,
|
||||
@@ -187,12 +188,13 @@ async fn check_timeline(
|
||||
// we need files, so unset it.
|
||||
timeline_dir_target.delimiter = String::new();
|
||||
|
||||
let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target));
|
||||
let mut stream = std::pin::pin!(stream_listing_generic(remote_client, &timeline_dir_target));
|
||||
while let Some(obj) = stream.next().await {
|
||||
let obj = obj?;
|
||||
let key = obj.key();
|
||||
let (key, _obj) = obj?;
|
||||
|
||||
let seg_name = key
|
||||
.get_path()
|
||||
.as_str()
|
||||
.strip_prefix(&timeline_dir_target.prefix_in_bucket)
|
||||
.expect("failed to extract segment name");
|
||||
expected_segfiles.remove(seg_name);
|
||||
|
||||
@@ -269,7 +269,7 @@ impl SnapshotDownloader {
|
||||
.context("Downloading timeline")?;
|
||||
}
|
||||
BlobDataParseResult::Relic => {}
|
||||
BlobDataParseResult::Incorrect(_) => {
|
||||
BlobDataParseResult::Incorrect { .. } => {
|
||||
tracing::error!("Bad metadata in timeline {ttid}");
|
||||
}
|
||||
};
|
||||
|
||||
@@ -150,6 +150,7 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
|
||||
"pageserver_pitr_history_size",
|
||||
"pageserver_layer_bytes",
|
||||
"pageserver_layer_count",
|
||||
"pageserver_visible_physical_size",
|
||||
"pageserver_storage_operations_seconds_count_total",
|
||||
"pageserver_storage_operations_seconds_sum_total",
|
||||
"pageserver_evictions_total",
|
||||
|
||||
@@ -285,9 +285,9 @@ class NeonApiEndpoint:
|
||||
self.project_id = project_id
|
||||
eps = neon_api.get_endpoints(project_id)["endpoints"]
|
||||
self.endpoint_id = eps[0]["id"]
|
||||
self.connstr = neon_api.get_connection_uri(project_id, endpoint_id=self.endpoint_id)[
|
||||
"uri"
|
||||
]
|
||||
self.connstr = neon_api.get_connection_uri(
|
||||
project_id, endpoint_id=self.endpoint_id, pooled=False
|
||||
)["uri"]
|
||||
pw = self.connstr.split("@")[0].split(":")[-1]
|
||||
self.pgbench_env = {
|
||||
"PGHOST": eps[0]["host"],
|
||||
|
||||
@@ -978,7 +978,10 @@ class NeonEnvBuilder:
|
||||
and self.enable_scrub_on_exit
|
||||
):
|
||||
try:
|
||||
self.env.storage_scrubber.scan_metadata()
|
||||
healthy, _ = self.env.storage_scrubber.scan_metadata()
|
||||
if not healthy:
|
||||
e = Exception("Remote storage metadata corrupted")
|
||||
cleanup_error = e
|
||||
except Exception as e:
|
||||
log.error(f"Error during remote storage scrub: {e}")
|
||||
cleanup_error = e
|
||||
@@ -1943,11 +1946,15 @@ class NeonCli(AbstractNeonCli):
|
||||
remote_ext_config: Optional[str] = None,
|
||||
pageserver_id: Optional[int] = None,
|
||||
allow_multiple=False,
|
||||
basebackup_request_tries: Optional[int] = None,
|
||||
) -> "subprocess.CompletedProcess[str]":
|
||||
args = [
|
||||
"endpoint",
|
||||
"start",
|
||||
]
|
||||
extra_env_vars = {}
|
||||
if basebackup_request_tries is not None:
|
||||
extra_env_vars["NEON_COMPUTE_TESTING_BASEBACKUP_TRIES"] = str(basebackup_request_tries)
|
||||
if remote_ext_config is not None:
|
||||
args.extend(["--remote-ext-config", remote_ext_config])
|
||||
|
||||
@@ -1960,7 +1967,7 @@ class NeonCli(AbstractNeonCli):
|
||||
if allow_multiple:
|
||||
args.extend(["--allow-multiple"])
|
||||
|
||||
res = self.raw_cli(args)
|
||||
res = self.raw_cli(args, extra_env_vars)
|
||||
res.check_returncode()
|
||||
return res
|
||||
|
||||
@@ -3812,6 +3819,7 @@ class Endpoint(PgProtocol, LogUtils):
|
||||
pageserver_id: Optional[int] = None,
|
||||
safekeepers: Optional[List[int]] = None,
|
||||
allow_multiple: bool = False,
|
||||
basebackup_request_tries: Optional[int] = None,
|
||||
) -> "Endpoint":
|
||||
"""
|
||||
Start the Postgres instance.
|
||||
@@ -3833,6 +3841,7 @@ class Endpoint(PgProtocol, LogUtils):
|
||||
remote_ext_config=remote_ext_config,
|
||||
pageserver_id=pageserver_id,
|
||||
allow_multiple=allow_multiple,
|
||||
basebackup_request_tries=basebackup_request_tries,
|
||||
)
|
||||
self._running.release(1)
|
||||
|
||||
@@ -3979,6 +3988,7 @@ class Endpoint(PgProtocol, LogUtils):
|
||||
remote_ext_config: Optional[str] = None,
|
||||
pageserver_id: Optional[int] = None,
|
||||
allow_multiple=False,
|
||||
basebackup_request_tries: Optional[int] = None,
|
||||
) -> "Endpoint":
|
||||
"""
|
||||
Create an endpoint, apply config, and start Postgres.
|
||||
@@ -3999,6 +4009,7 @@ class Endpoint(PgProtocol, LogUtils):
|
||||
remote_ext_config=remote_ext_config,
|
||||
pageserver_id=pageserver_id,
|
||||
allow_multiple=allow_multiple,
|
||||
basebackup_request_tries=basebackup_request_tries,
|
||||
)
|
||||
|
||||
log.info(f"Postgres startup took {time.time() - started_at} seconds")
|
||||
@@ -4042,6 +4053,7 @@ class EndpointFactory:
|
||||
config_lines: Optional[List[str]] = None,
|
||||
remote_ext_config: Optional[str] = None,
|
||||
pageserver_id: Optional[int] = None,
|
||||
basebackup_request_tries: Optional[int] = None,
|
||||
) -> Endpoint:
|
||||
ep = Endpoint(
|
||||
self.env,
|
||||
@@ -4060,6 +4072,7 @@ class EndpointFactory:
|
||||
lsn=lsn,
|
||||
remote_ext_config=remote_ext_config,
|
||||
pageserver_id=pageserver_id,
|
||||
basebackup_request_tries=basebackup_request_tries,
|
||||
)
|
||||
|
||||
def create(
|
||||
@@ -4401,13 +4414,19 @@ class StorageScrubber:
|
||||
assert stdout is not None
|
||||
return stdout
|
||||
|
||||
def scan_metadata(self) -> Any:
|
||||
stdout = self.scrubber_cli(
|
||||
["scan-metadata", "--node-kind", "pageserver", "--json"], timeout=30
|
||||
)
|
||||
def scan_metadata(self, post_to_storage_controller: bool = False) -> Tuple[bool, Any]:
|
||||
"""
|
||||
Returns the health status and the metadata summary.
|
||||
"""
|
||||
args = ["scan-metadata", "--node-kind", "pageserver", "--json"]
|
||||
if post_to_storage_controller:
|
||||
args.append("--post")
|
||||
stdout = self.scrubber_cli(args, timeout=30)
|
||||
|
||||
try:
|
||||
return json.loads(stdout)
|
||||
summary = json.loads(stdout)
|
||||
healthy = not summary["with_errors"] and not summary["with_warnings"]
|
||||
return healthy, summary
|
||||
except:
|
||||
log.error("Failed to decode JSON output from `scan-metadata`. Dumping stdout:")
|
||||
log.error(stdout)
|
||||
@@ -4528,6 +4547,13 @@ def test_output_dir(
|
||||
|
||||
yield test_dir
|
||||
|
||||
# Allure artifacts creation might involve the creation of `.tar.zst` archives,
|
||||
# which aren't going to be used if Allure results collection is not enabled
|
||||
# (i.e. --alluredir is not set).
|
||||
# Skip `allure_attach_from_dir` in this case
|
||||
if not request.config.getoption("--alluredir"):
|
||||
return
|
||||
|
||||
preserve_database_files = False
|
||||
for k, v in request.node.user_properties:
|
||||
# NB: the neon_env_builder fixture uses this fixture (test_output_dir).
|
||||
|
||||
@@ -61,6 +61,7 @@ class HistoricLayerInfo:
|
||||
remote: bool
|
||||
# None for image layers, true if pageserver thinks this is an L0 delta layer
|
||||
l0: Optional[bool]
|
||||
visible: bool
|
||||
|
||||
@classmethod
|
||||
def from_json(cls, d: Dict[str, Any]) -> HistoricLayerInfo:
|
||||
@@ -79,6 +80,7 @@ class HistoricLayerInfo:
|
||||
lsn_end=d.get("lsn_end"),
|
||||
remote=d["remote"],
|
||||
l0=l0_ness,
|
||||
visible=d["access_stats"]["visible"],
|
||||
)
|
||||
|
||||
|
||||
@@ -556,6 +558,22 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
|
||||
assert isinstance(res_json, dict)
|
||||
return res_json
|
||||
|
||||
def timeline_block_gc(self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId):
|
||||
res = self.post(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/block_gc",
|
||||
)
|
||||
log.info(f"Got GC request response code: {res.status_code}")
|
||||
self.verbose_error(res)
|
||||
|
||||
def timeline_unblock_gc(
|
||||
self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
|
||||
):
|
||||
res = self.post(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/unblock_gc",
|
||||
)
|
||||
log.info(f"Got GC request response code: {res.status_code}")
|
||||
self.verbose_error(res)
|
||||
|
||||
def timeline_compact(
|
||||
self,
|
||||
tenant_id: Union[TenantId, TenantShardId],
|
||||
@@ -663,6 +681,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
|
||||
force_image_layer_creation=False,
|
||||
wait_until_uploaded=False,
|
||||
compact: Optional[bool] = None,
|
||||
**kwargs,
|
||||
):
|
||||
self.is_testing_enabled_or_skip()
|
||||
query = {}
|
||||
@@ -680,6 +699,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
|
||||
res = self.put(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint",
|
||||
params=query,
|
||||
**kwargs,
|
||||
)
|
||||
log.info(f"Got checkpoint request response code: {res.status_code}")
|
||||
self.verbose_error(res)
|
||||
|
||||
@@ -389,7 +389,10 @@ WaitUntilRet = TypeVar("WaitUntilRet")
|
||||
|
||||
|
||||
def wait_until(
|
||||
number_of_iterations: int, interval: float, func: Callable[[], WaitUntilRet]
|
||||
number_of_iterations: int,
|
||||
interval: float,
|
||||
func: Callable[[], WaitUntilRet],
|
||||
show_intermediate_error=False,
|
||||
) -> WaitUntilRet:
|
||||
"""
|
||||
Wait until 'func' returns successfully, without exception. Returns the
|
||||
@@ -402,6 +405,8 @@ def wait_until(
|
||||
except Exception as e:
|
||||
log.info("waiting for %s iteration %s failed", func, i + 1)
|
||||
last_exception = e
|
||||
if show_intermediate_error:
|
||||
log.info(e)
|
||||
time.sleep(interval)
|
||||
continue
|
||||
return res
|
||||
|
||||
22
test_runner/logical_repl/README.md
Normal file
22
test_runner/logical_repl/README.md
Normal file
@@ -0,0 +1,22 @@
|
||||
# Logical replication tests
|
||||
|
||||
## Clickhouse
|
||||
|
||||
```bash
|
||||
export BENCHMARK_CONNSTR=postgres://user:pass@ep-abc-xyz-123.us-east-2.aws.neon.build/neondb
|
||||
|
||||
docker compose -f clickhouse/docker-compose.yml up -d
|
||||
pytest -m remote_cluster -k test_clickhouse
|
||||
docker compose -f clickhouse/docker-compose.yml down
|
||||
```
|
||||
|
||||
## Debezium
|
||||
|
||||
```bash
|
||||
export BENCHMARK_CONNSTR=postgres://user:pass@ep-abc-xyz-123.us-east-2.aws.neon.build/neondb
|
||||
|
||||
docker compose -f debezium/docker-compose.yml up -d
|
||||
pytest -m remote_cluster -k test_debezium
|
||||
docker compose -f debezium/docker-compose.yml down
|
||||
|
||||
```
|
||||
9
test_runner/logical_repl/clickhouse/docker-compose.yml
Normal file
9
test_runner/logical_repl/clickhouse/docker-compose.yml
Normal file
@@ -0,0 +1,9 @@
|
||||
services:
|
||||
clickhouse:
|
||||
image: clickhouse/clickhouse-server
|
||||
user: "101:101"
|
||||
container_name: clickhouse
|
||||
hostname: clickhouse
|
||||
ports:
|
||||
- 127.0.0.1:8123:8123
|
||||
- 127.0.0.1:9000:9000
|
||||
24
test_runner/logical_repl/debezium/docker-compose.yml
Normal file
24
test_runner/logical_repl/debezium/docker-compose.yml
Normal file
@@ -0,0 +1,24 @@
|
||||
services:
|
||||
zookeeper:
|
||||
image: quay.io/debezium/zookeeper:2.7
|
||||
kafka:
|
||||
image: quay.io/debezium/kafka:2.7
|
||||
environment:
|
||||
ZOOKEEPER_CONNECT: "zookeeper:2181"
|
||||
KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092
|
||||
KAFKA_BROKER_ID: 1
|
||||
KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
|
||||
KAFKA_JMX_PORT: 9991
|
||||
ports:
|
||||
- 127.0.0.1:9092:9092
|
||||
debezium:
|
||||
image: quay.io/debezium/connect:2.7
|
||||
environment:
|
||||
BOOTSTRAP_SERVERS: kafka:9092
|
||||
GROUP_ID: 1
|
||||
CONFIG_STORAGE_TOPIC: debezium-config
|
||||
OFFSET_STORAGE_TOPIC: debezium-offset
|
||||
STATUS_STORAGE_TOPIC: debezium-status
|
||||
DEBEZIUM_CONFIG_CONNECTOR_CLASS: io.debezium.connector.postgresql.PostgresConnector
|
||||
ports:
|
||||
- 127.0.0.1:8083:8083
|
||||
@@ -1,8 +1,9 @@
|
||||
"""
|
||||
Test the logical replication in Neon with the different consumers
|
||||
Test the logical replication in Neon with ClickHouse as a consumer
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import os
|
||||
import time
|
||||
|
||||
import clickhouse_connect
|
||||
@@ -39,22 +40,15 @@ def test_clickhouse(remote_pg: RemotePostgres):
|
||||
"""
|
||||
Test the logical replication having ClickHouse as a client
|
||||
"""
|
||||
clickhouse_host = "clickhouse" if ("CI" in os.environ) else "127.0.0.1"
|
||||
conn_options = remote_pg.conn_options()
|
||||
for _ in range(5):
|
||||
try:
|
||||
conn = psycopg2.connect(remote_pg.connstr())
|
||||
except psycopg2.OperationalError as perr:
|
||||
log.debug(perr)
|
||||
time.sleep(1)
|
||||
else:
|
||||
break
|
||||
raise TimeoutError
|
||||
conn = psycopg2.connect(remote_pg.connstr())
|
||||
cur = conn.cursor()
|
||||
cur.execute("DROP TABLE IF EXISTS table1")
|
||||
cur.execute("CREATE TABLE table1 (id integer primary key, column1 varchar(10));")
|
||||
cur.execute("INSERT INTO table1 (id, column1) VALUES (1, 'abc'), (2, 'def');")
|
||||
conn.commit()
|
||||
client = clickhouse_connect.get_client(host="clickhouse")
|
||||
client = clickhouse_connect.get_client(host=clickhouse_host)
|
||||
client.command("SET allow_experimental_database_materialized_postgresql=1")
|
||||
client.command(
|
||||
"CREATE DATABASE db1_postgres ENGINE = "
|
||||
189
test_runner/logical_repl/test_debezium.py
Normal file
189
test_runner/logical_repl/test_debezium.py
Normal file
@@ -0,0 +1,189 @@
|
||||
"""
|
||||
Test the logical replication in Neon with Debezium as a consumer
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
|
||||
import psycopg2
|
||||
import pytest
|
||||
import requests
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import RemotePostgres
|
||||
from fixtures.utils import wait_until
|
||||
from kafka import KafkaConsumer
|
||||
|
||||
|
||||
class DebeziumAPI:
|
||||
"""
|
||||
The class for Debezium API calls
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self.__host = "debezium" if ("CI" in os.environ) else "127.0.0.1"
|
||||
self.__base_url = f"http://{self.__host}:8083"
|
||||
self.__connectors_url = f"{self.__base_url}/connectors"
|
||||
|
||||
def __request(self, method, addurl="", **kwargs):
|
||||
return requests.request(
|
||||
method,
|
||||
self.__connectors_url + addurl,
|
||||
headers={"Accept": "application/json", "Content-type": "application/json"},
|
||||
timeout=60,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def create_pg_connector(self, remote_pg: RemotePostgres, dbz_conn_name: str):
|
||||
"""
|
||||
Create a Postgres connector in debezium
|
||||
"""
|
||||
conn_options = remote_pg.conn_options()
|
||||
payload = {
|
||||
"name": dbz_conn_name,
|
||||
"config": {
|
||||
"connector.class": "io.debezium.connector.postgresql.PostgresConnector",
|
||||
"tasks.max": "1",
|
||||
"database.hostname": conn_options["host"],
|
||||
"database.port": "5432",
|
||||
"database.user": conn_options["user"],
|
||||
"database.password": conn_options["password"],
|
||||
"database.dbname": conn_options["dbname"],
|
||||
"plugin.name": "pgoutput",
|
||||
"topic.prefix": "dbserver1",
|
||||
"schema.include.list": "inventory",
|
||||
},
|
||||
}
|
||||
return self.__request("POST", json=payload)
|
||||
|
||||
def list_connectors(self):
|
||||
"""
|
||||
Returns a list of all connectors existent in Debezium.
|
||||
"""
|
||||
resp = self.__request("GET")
|
||||
assert resp.ok
|
||||
return json.loads(resp.text)
|
||||
|
||||
def del_connector(self, connector):
|
||||
"""
|
||||
Deletes the specified connector
|
||||
"""
|
||||
return self.__request("DELETE", f"/{connector}")
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def debezium(remote_pg: RemotePostgres):
|
||||
"""
|
||||
Prepare the Debezium API handler, connection
|
||||
"""
|
||||
conn = psycopg2.connect(remote_pg.connstr())
|
||||
cur = conn.cursor()
|
||||
cur.execute("DROP SCHEMA IF EXISTS inventory CASCADE")
|
||||
cur.execute("CREATE SCHEMA inventory")
|
||||
cur.execute(
|
||||
"CREATE TABLE inventory.customers ("
|
||||
"id SERIAL NOT NULL PRIMARY KEY,"
|
||||
"first_name character varying(255) NOT NULL,"
|
||||
"last_name character varying(255) NOT NULL,"
|
||||
"email character varying(255) NOT NULL)"
|
||||
)
|
||||
conn.commit()
|
||||
dbz = DebeziumAPI()
|
||||
assert len(dbz.list_connectors()) == 0
|
||||
dbz_conn_name = "inventory-connector"
|
||||
resp = dbz.create_pg_connector(remote_pg, dbz_conn_name)
|
||||
log.debug("%s %s %s", resp.status_code, resp.ok, resp.text)
|
||||
assert resp.status_code == 201
|
||||
assert len(dbz.list_connectors()) == 1
|
||||
consumer = KafkaConsumer(
|
||||
"dbserver1.inventory.customers",
|
||||
bootstrap_servers=["kafka:9092"],
|
||||
auto_offset_reset="earliest",
|
||||
enable_auto_commit=False,
|
||||
)
|
||||
yield conn, consumer
|
||||
resp = dbz.del_connector(dbz_conn_name)
|
||||
assert resp.status_code == 204
|
||||
|
||||
|
||||
def get_kafka_msg(consumer, ts_ms, before=None, after=None) -> None:
|
||||
"""
|
||||
Gets the message from Kafka and checks its validity
|
||||
Arguments:
|
||||
consumer: the consumer object
|
||||
ts_ms: timestamp in milliseconds of the change of db, the corresponding message must have
|
||||
the later timestamp
|
||||
before: a dictionary, if not None, the before field from the kafka message must
|
||||
have the same values for the same keys
|
||||
after: a dictionary, if not None, the after field from the kafka message must
|
||||
have the same values for the same keys
|
||||
"""
|
||||
msg = consumer.poll()
|
||||
assert msg, "Empty message"
|
||||
for val in msg.values():
|
||||
r = json.loads(val[-1].value)
|
||||
log.info(r["payload"])
|
||||
assert ts_ms < r["payload"]["ts_ms"], "Incorrect timestamp"
|
||||
for param, pname in ((before, "before"), (after, "after")):
|
||||
if param is not None:
|
||||
for k, v in param.items():
|
||||
assert r["payload"][pname][k] == v, f"{pname} mismatches"
|
||||
|
||||
|
||||
@pytest.mark.remote_cluster
|
||||
def test_debezium(debezium):
|
||||
"""
|
||||
Test the logical replication having Debezium as a subscriber
|
||||
"""
|
||||
conn, consumer = debezium
|
||||
cur = conn.cursor()
|
||||
ts_ms = time.time() * 1000
|
||||
log.info("Insert 1 ts_ms: %s", ts_ms)
|
||||
cur.execute(
|
||||
"insert into inventory.customers (first_name, last_name, email) "
|
||||
"values ('John', 'Dow','johndow@example.com')"
|
||||
)
|
||||
conn.commit()
|
||||
wait_until(
|
||||
100,
|
||||
0.5,
|
||||
lambda: get_kafka_msg(
|
||||
consumer,
|
||||
ts_ms,
|
||||
after={"first_name": "John", "last_name": "Dow", "email": "johndow@example.com"},
|
||||
),
|
||||
show_intermediate_error=True,
|
||||
)
|
||||
ts_ms = time.time() * 1000
|
||||
log.info("Insert 2 ts_ms: %s", ts_ms)
|
||||
cur.execute(
|
||||
"insert into inventory.customers (first_name, last_name, email) "
|
||||
"values ('Alex', 'Row','alexrow@example.com')"
|
||||
)
|
||||
conn.commit()
|
||||
wait_until(
|
||||
100,
|
||||
0.5,
|
||||
lambda: get_kafka_msg(
|
||||
consumer,
|
||||
ts_ms,
|
||||
after={"first_name": "Alex", "last_name": "Row", "email": "alexrow@example.com"},
|
||||
),
|
||||
show_intermediate_error=True,
|
||||
)
|
||||
ts_ms = time.time() * 1000
|
||||
log.info("Update ts_ms: %s", ts_ms)
|
||||
cur.execute("update inventory.customers set first_name = 'Alexander' where id = 2")
|
||||
conn.commit()
|
||||
wait_until(
|
||||
100,
|
||||
0.5,
|
||||
lambda: get_kafka_msg(
|
||||
consumer,
|
||||
ts_ms,
|
||||
after={"first_name": "Alexander"},
|
||||
),
|
||||
show_intermediate_error=True,
|
||||
)
|
||||
time.sleep(3)
|
||||
cur.execute("select 1")
|
||||
@@ -6,21 +6,8 @@ from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder
|
||||
|
||||
|
||||
@pytest.mark.timeout(10000)
|
||||
def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker):
|
||||
"""
|
||||
Test that GC is able to collect all old layers even if them are forming
|
||||
"stairs" and there are not three delta layers since last image layer.
|
||||
|
||||
Information about image layers needed to collect old layers should
|
||||
be propagated by GC to compaction task which should take in in account
|
||||
when make a decision which new image layers needs to be created.
|
||||
|
||||
NB: this test demonstrates the problem. The source tree contained the
|
||||
`gc_feedback` mechanism for about 9 months, but, there were problems
|
||||
with it and it wasn't enabled at runtime.
|
||||
This PR removed the code: https://github.com/neondatabase/neon/pull/6863
|
||||
"""
|
||||
def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, mode: str):
|
||||
assert mode == "normal" or mode == "with_snapshots"
|
||||
env = neon_env_builder.init_start()
|
||||
client = env.pageserver.http_client()
|
||||
|
||||
@@ -74,6 +61,9 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma
|
||||
|
||||
physical_size = client.timeline_detail(tenant_id, timeline_id)["current_physical_size"]
|
||||
log.info(f"Physical storage size {physical_size}")
|
||||
if mode == "with_snapshots":
|
||||
if step == n_steps / 2:
|
||||
env.neon_cli.create_branch("child")
|
||||
|
||||
max_num_of_deltas_above_image = 0
|
||||
max_total_num_of_deltas = 0
|
||||
@@ -149,3 +139,37 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma
|
||||
log.info(f"Writing layer map to {layer_map_path}")
|
||||
with layer_map_path.open("w") as f:
|
||||
f.write(json.dumps(client.timeline_layer_map_info(tenant_id, timeline_id)))
|
||||
|
||||
|
||||
@pytest.mark.timeout(10000)
|
||||
def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker):
|
||||
"""
|
||||
Test that GC is able to collect all old layers even if them are forming
|
||||
"stairs" and there are not three delta layers since last image layer.
|
||||
|
||||
Information about image layers needed to collect old layers should
|
||||
be propagated by GC to compaction task which should take in in account
|
||||
when make a decision which new image layers needs to be created.
|
||||
|
||||
NB: this test demonstrates the problem. The source tree contained the
|
||||
`gc_feedback` mechanism for about 9 months, but, there were problems
|
||||
with it and it wasn't enabled at runtime.
|
||||
This PR removed the code: https://github.com/neondatabase/neon/pull/6863
|
||||
|
||||
And the bottom-most GC-compaction epic resolves the problem.
|
||||
https://github.com/neondatabase/neon/issues/8002
|
||||
"""
|
||||
gc_feedback_impl(neon_env_builder, zenbenchmark, "normal")
|
||||
|
||||
|
||||
@pytest.mark.timeout(10000)
|
||||
def test_gc_feedback_with_snapshots(
|
||||
neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker
|
||||
):
|
||||
"""
|
||||
Compared with `test_gc_feedback`, we create a branch without written data (=snapshot) in the middle
|
||||
of the benchmark, and the bottom-most compaction should collect as much garbage as possible below the GC
|
||||
horizon. Ideally, there should be images (in an image layer) covering the full range at the branch point,
|
||||
and images covering the full key range (in a delta layer) at the GC horizon.
|
||||
"""
|
||||
gc_feedback_impl(neon_env_builder, zenbenchmark, "with_snapshots")
|
||||
|
||||
@@ -100,24 +100,32 @@ def test_subscriber_lag(
|
||||
pub_connstr = benchmark_project_pub.connstr
|
||||
sub_connstr = benchmark_project_sub.connstr
|
||||
|
||||
pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env)
|
||||
pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env)
|
||||
if benchmark_project_pub.is_new:
|
||||
pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env)
|
||||
if benchmark_project_sub.is_new:
|
||||
pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env)
|
||||
|
||||
pub_conn = psycopg2.connect(pub_connstr)
|
||||
sub_conn = psycopg2.connect(sub_connstr)
|
||||
pub_conn.autocommit = True
|
||||
sub_conn.autocommit = True
|
||||
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
|
||||
if benchmark_project_pub.is_new:
|
||||
pub_cur.execute("create publication pub1 for table pgbench_accounts, pgbench_history")
|
||||
pub_cur.execute("SELECT 1 FROM pg_catalog.pg_publication WHERE pubname = 'pub1'")
|
||||
pub_exists = len(pub_cur.fetchall()) != 0
|
||||
|
||||
if benchmark_project_sub.is_new:
|
||||
if not pub_exists:
|
||||
pub_cur.execute("CREATE PUBLICATION pub1 FOR TABLE pgbench_accounts, pgbench_history")
|
||||
|
||||
sub_cur.execute("SELECT 1 FROM pg_catalog.pg_subscription WHERE subname = 'sub1'")
|
||||
sub_exists = len(sub_cur.fetchall()) != 0
|
||||
if not sub_exists:
|
||||
sub_cur.execute("truncate table pgbench_accounts")
|
||||
sub_cur.execute("truncate table pgbench_history")
|
||||
|
||||
sub_cur.execute(f"create subscription sub1 connection '{pub_connstr}' publication pub1")
|
||||
sub_cur.execute(f"CREATE SUBSCRIPTION sub1 CONNECTION '{pub_connstr}' PUBLICATION pub1")
|
||||
|
||||
initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur)
|
||||
|
||||
pub_conn.close()
|
||||
sub_conn.close()
|
||||
|
||||
@@ -195,10 +203,15 @@ def test_publisher_restart(
|
||||
pub_conn.autocommit = True
|
||||
sub_conn.autocommit = True
|
||||
with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
|
||||
if benchmark_project_pub.is_new:
|
||||
pub_cur.execute("SELECT 1 FROM pg_catalog.pg_publication WHERE pubname = 'pub1'")
|
||||
pub_exists = len(pub_cur.fetchall()) != 0
|
||||
|
||||
if not pub_exists:
|
||||
pub_cur.execute("create publication pub1 for table pgbench_accounts, pgbench_history")
|
||||
|
||||
if benchmark_project_sub.is_new:
|
||||
sub_cur.execute("SELECT 1 FROM pg_catalog.pg_subscription WHERE subname = 'sub1'")
|
||||
sub_exists = len(sub_cur.fetchall()) != 0
|
||||
if not sub_exists:
|
||||
sub_cur.execute("truncate table pgbench_accounts")
|
||||
sub_cur.execute("truncate table pgbench_history")
|
||||
|
||||
|
||||
@@ -217,7 +217,11 @@ def test_storage_controller_many_tenants(
|
||||
# A reconciler operation: migrate a shard.
|
||||
shard_number = rng.randint(0, shard_count - 1)
|
||||
tenant_shard_id = TenantShardId(tenant_id, shard_number, shard_count)
|
||||
dest_ps_id = rng.choice([ps.id for ps in env.pageservers])
|
||||
|
||||
# Migrate it to its secondary location
|
||||
desc = env.storage_controller.tenant_describe(tenant_id)
|
||||
dest_ps_id = desc["shards"][shard_number]["node_secondary"][0]
|
||||
|
||||
f = executor.submit(
|
||||
env.storage_controller.tenant_shard_migrate, tenant_shard_id, dest_ps_id
|
||||
)
|
||||
@@ -231,7 +235,11 @@ def test_storage_controller_many_tenants(
|
||||
for f in futs:
|
||||
f.result()
|
||||
|
||||
# Consistency check is safe here: all the previous operations waited for reconcile before completing
|
||||
# Some of the operations above (notably migrations) might leave the controller in a state where it has
|
||||
# some work to do, for example optimizing shard placement after we do a random migration. Wait for the system
|
||||
# to reach a quiescent state before doing following checks.
|
||||
env.storage_controller.reconcile_until_idle()
|
||||
|
||||
env.storage_controller.consistency_check()
|
||||
check_memory()
|
||||
|
||||
|
||||
@@ -18,7 +18,6 @@ from fixtures.pageserver.utils import wait_until_tenant_active
|
||||
from fixtures.utils import query_scalar
|
||||
from performance.test_perf_pgbench import get_scales_matrix
|
||||
from requests import RequestException
|
||||
from requests.exceptions import RetryError
|
||||
|
||||
|
||||
# Test branch creation
|
||||
@@ -151,7 +150,7 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE
|
||||
env.pageserver.allowed_errors.extend(
|
||||
[
|
||||
".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*",
|
||||
".*page_service_conn_main.*: query handler for 'basebackup .* is not active, state: Loading",
|
||||
".*page_service_conn_main.*: query handler for 'basebackup .* ERROR: Not found: Timeline",
|
||||
]
|
||||
)
|
||||
ps_http = env.pageserver.http_client()
|
||||
@@ -176,10 +175,12 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE
|
||||
|
||||
env.neon_cli.map_branch(initial_branch, env.initial_tenant, env.initial_timeline)
|
||||
|
||||
with pytest.raises(RuntimeError, match="is not active, state: Loading"):
|
||||
env.endpoints.create_start(initial_branch, tenant_id=env.initial_tenant)
|
||||
with pytest.raises(RuntimeError, match="ERROR: Not found: Timeline"):
|
||||
env.endpoints.create_start(
|
||||
initial_branch, tenant_id=env.initial_tenant, basebackup_request_tries=2
|
||||
)
|
||||
ps_http.configure_failpoints(("before-upload-index-pausable", "off"))
|
||||
finally:
|
||||
# FIXME: paused uploads bother shutdown
|
||||
env.pageserver.stop(immediate=True)
|
||||
|
||||
t.join()
|
||||
@@ -193,8 +194,11 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder
|
||||
env = neon_env_builder.init_configs()
|
||||
env.start()
|
||||
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*"
|
||||
env.pageserver.allowed_errors.extend(
|
||||
[
|
||||
".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*",
|
||||
".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: .*Cannot branch off the timeline that's not present in pageserver.*",
|
||||
]
|
||||
)
|
||||
ps_http = env.pageserver.http_client()
|
||||
|
||||
@@ -216,7 +220,10 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder
|
||||
|
||||
branch_id = TimelineId.generate()
|
||||
|
||||
with pytest.raises(RetryError, match="too many 503 error responses"):
|
||||
with pytest.raises(
|
||||
PageserverApiException,
|
||||
match="Cannot branch off the timeline that's not present in pageserver",
|
||||
):
|
||||
ps_http.timeline_create(
|
||||
env.pg_version,
|
||||
env.initial_tenant,
|
||||
|
||||
@@ -3,18 +3,15 @@ import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
|
||||
import pytest
|
||||
import toml
|
||||
from fixtures.common_types import Lsn
|
||||
from fixtures.common_types import Lsn, TenantId, TimelineId
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnv,
|
||||
NeonEnvBuilder,
|
||||
PgBin,
|
||||
)
|
||||
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin
|
||||
from fixtures.pageserver.http import PageserverApiException
|
||||
from fixtures.pageserver.utils import (
|
||||
timeline_delete_wait_completed,
|
||||
@@ -22,7 +19,8 @@ from fixtures.pageserver.utils import (
|
||||
wait_for_upload,
|
||||
)
|
||||
from fixtures.pg_version import PgVersion
|
||||
from fixtures.remote_storage import RemoteStorageKind
|
||||
from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage
|
||||
from fixtures.workload import Workload
|
||||
|
||||
#
|
||||
# A test suite that help to prevent unintentionally breaking backward or forward compatibility between Neon releases.
|
||||
@@ -409,3 +407,132 @@ def dump_differs(
|
||||
break
|
||||
|
||||
return differs
|
||||
|
||||
|
||||
@dataclass
|
||||
class HistoricDataSet:
|
||||
name: str
|
||||
tenant_id: TenantId
|
||||
pg_version: PgVersion
|
||||
url: str
|
||||
|
||||
def __str__(self):
|
||||
return self.name
|
||||
|
||||
|
||||
HISTORIC_DATA_SETS = [
|
||||
# From before we enabled image layer compression.
|
||||
# - IndexPart::LATEST_VERSION 7
|
||||
# - STORAGE_FORMAT_VERSION 3
|
||||
HistoricDataSet(
|
||||
"2024-07-18",
|
||||
TenantId("17bf64a53509714687664b3a84e9b3ba"),
|
||||
PgVersion.V16,
|
||||
"https://neon-github-public-dev.s3.eu-central-1.amazonaws.com/compatibility-data-snapshots/2024-07-18-pgv16.tar.zst",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dataset", HISTORIC_DATA_SETS)
|
||||
@pytest.mark.xdist_group("compatibility")
|
||||
def test_historic_storage_formats(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
test_output_dir: Path,
|
||||
pg_version: PgVersion,
|
||||
dataset: HistoricDataSet,
|
||||
):
|
||||
"""
|
||||
This test is like test_backward_compatibility, but it looks back further to examples of our storage format from long ago.
|
||||
"""
|
||||
|
||||
ARTIFACT_CACHE_DIR = "./artifact_cache"
|
||||
|
||||
import tarfile
|
||||
from contextlib import closing
|
||||
|
||||
import requests
|
||||
import zstandard
|
||||
|
||||
artifact_unpack_path = ARTIFACT_CACHE_DIR / Path("unpacked") / Path(dataset.name)
|
||||
|
||||
# Note: we assume that when running across a matrix of PG versions, the matrix includes all the versions needed by
|
||||
# HISTORIC_DATA_SETS. If we ever remove a PG version from the matrix, then historic datasets built using that version
|
||||
# will no longer be covered by this test.
|
||||
if pg_version != dataset.pg_version:
|
||||
pytest.skip(f"Dataset {dataset} is for different PG version, skipping")
|
||||
|
||||
with closing(requests.get(dataset.url, stream=True)) as r:
|
||||
unzstd = zstandard.ZstdDecompressor()
|
||||
with unzstd.stream_reader(r.raw) as stream:
|
||||
with tarfile.open(mode="r|", fileobj=stream) as tf:
|
||||
tf.extractall(artifact_unpack_path)
|
||||
|
||||
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
|
||||
neon_env_builder.pg_version = dataset.pg_version
|
||||
env = neon_env_builder.init_configs()
|
||||
env.start()
|
||||
assert isinstance(env.pageserver_remote_storage, S3Storage)
|
||||
|
||||
# Link artifact data into test's remote storage. We don't want the whole repo dir, just the remote storage part: we are not testing
|
||||
# compat of local disk data across releases (test_backward_compat does that), we're testing really long-lived data in S3 like layer files and indices.
|
||||
#
|
||||
# The code generating the snapshot uses local_fs, but this test uses S3Storage, so we are copying a tree of files into a bucket. We use
|
||||
# S3Storage so that the scrubber can run (the scrubber doesn't speak local_fs)
|
||||
artifact_pageserver_path = (
|
||||
artifact_unpack_path / Path("repo") / Path("local_fs_remote_storage") / Path("pageserver")
|
||||
)
|
||||
for root, _dirs, files in os.walk(artifact_pageserver_path):
|
||||
for file in files:
|
||||
local_path = os.path.join(root, file)
|
||||
remote_key = (
|
||||
env.pageserver_remote_storage.prefix_in_bucket
|
||||
+ str(local_path)[len(str(artifact_pageserver_path)) :]
|
||||
)
|
||||
log.info(f"Uploading {local_path} -> {remote_key}")
|
||||
env.pageserver_remote_storage.client.upload_file(
|
||||
local_path, env.pageserver_remote_storage.bucket_name, remote_key
|
||||
)
|
||||
|
||||
# Check the scrubber handles this old data correctly (can read it and doesn't consider it corrupt)
|
||||
#
|
||||
# Do this _before_ importing to the pageserver, as that import may start writing immediately
|
||||
healthy, metadata_summary = env.storage_scrubber.scan_metadata()
|
||||
assert healthy
|
||||
assert metadata_summary["tenant_count"] >= 1
|
||||
assert metadata_summary["timeline_count"] >= 1
|
||||
|
||||
env.neon_cli.import_tenant(dataset.tenant_id)
|
||||
|
||||
# Discover timelines
|
||||
timelines = env.pageserver.http_client().timeline_list(dataset.tenant_id)
|
||||
# All our artifacts should contain at least one timeline
|
||||
assert len(timelines) > 0
|
||||
|
||||
# TODO: ensure that the snapshots we're importing contain a sensible variety of content, at the very
|
||||
# least they should include a mixture of deltas and image layers. Preferably they should also
|
||||
# contain some "exotic" stuff like aux files from logical replication.
|
||||
|
||||
# Check we can start an endpoint and read the SQL that the artifact is meant to contain
|
||||
reference_sql_dump = artifact_unpack_path / Path("dump.sql")
|
||||
ep = env.endpoints.create_start("main", tenant_id=dataset.tenant_id)
|
||||
pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
|
||||
pg_bin.run_capture(
|
||||
["pg_dumpall", f"--dbname={ep.connstr()}", f"--file={test_output_dir / 'dump.sql'}"]
|
||||
)
|
||||
assert not dump_differs(
|
||||
reference_sql_dump,
|
||||
test_output_dir / "dump.sql",
|
||||
test_output_dir / "dump.filediff",
|
||||
)
|
||||
ep.stop()
|
||||
|
||||
# Check we can also do writes to the database
|
||||
existing_timeline_id = TimelineId(timelines[0]["timeline_id"])
|
||||
workload = Workload(env, dataset.tenant_id, existing_timeline_id)
|
||||
workload.init()
|
||||
workload.write_rows(100)
|
||||
|
||||
# Check that compaction works
|
||||
env.pageserver.http_client().timeline_compact(
|
||||
dataset.tenant_id, existing_timeline_id, force_image_layer_creation=True
|
||||
)
|
||||
|
||||
@@ -214,12 +214,11 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
# Having written a mixture of generation-aware and legacy index_part.json,
|
||||
# ensure the scrubber handles the situation as expected.
|
||||
metadata_summary = env.storage_scrubber.scan_metadata()
|
||||
healthy, metadata_summary = env.storage_scrubber.scan_metadata()
|
||||
assert metadata_summary["tenant_count"] == 1 # Scrubber should have seen our timeline
|
||||
assert metadata_summary["timeline_count"] == 1
|
||||
assert metadata_summary["timeline_shard_count"] == 1
|
||||
assert not metadata_summary["with_errors"]
|
||||
assert not metadata_summary["with_warnings"]
|
||||
assert healthy
|
||||
|
||||
|
||||
def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
@@ -2,10 +2,11 @@ import json
|
||||
import os
|
||||
import random
|
||||
import time
|
||||
from typing import Any, Dict, Optional
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional, Union
|
||||
|
||||
import pytest
|
||||
from fixtures.common_types import TenantId, TimelineId
|
||||
from fixtures.common_types import TenantId, TenantShardId, TimelineId
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver
|
||||
from fixtures.pageserver.common_types import parse_layer_file_name
|
||||
@@ -437,6 +438,35 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder):
|
||||
validate_heatmap(heatmap_second)
|
||||
|
||||
|
||||
def list_elegible_layers(
|
||||
pageserver, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
|
||||
) -> list[Path]:
|
||||
"""
|
||||
The subset of layer filenames that are elegible for secondary download: at time of writing this
|
||||
is all resident layers which are also visible.
|
||||
"""
|
||||
candidates = pageserver.list_layers(tenant_id, timeline_id)
|
||||
|
||||
layer_map = pageserver.http_client().layer_map_info(tenant_id, timeline_id)
|
||||
|
||||
# Map of layer filenames to their visibility the "layer name" is not the same as the filename: add suffix to resolve one to the other
|
||||
visible_map = dict(
|
||||
(f"{layer.layer_file_name}-v1-00000001", layer.visible)
|
||||
for layer in layer_map.historic_layers
|
||||
)
|
||||
|
||||
def is_visible(layer_file_name):
|
||||
try:
|
||||
return visible_map[str(layer_file_name)]
|
||||
except KeyError:
|
||||
# Unexpected: tests should call this when pageservers are in a quiet state such that the layer map
|
||||
# matches what's on disk.
|
||||
log.warn(f"Lookup {layer_file_name} from {list(visible_map.keys())}")
|
||||
raise
|
||||
|
||||
return list(c for c in candidates if is_visible(c))
|
||||
|
||||
|
||||
def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
Test the overall data flow in secondary mode:
|
||||
@@ -491,7 +521,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
ps_secondary.http_client().tenant_secondary_download(tenant_id)
|
||||
|
||||
assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers(
|
||||
assert list_elegible_layers(ps_attached, tenant_id, timeline_id) == ps_secondary.list_layers(
|
||||
tenant_id, timeline_id
|
||||
)
|
||||
|
||||
@@ -509,9 +539,9 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
|
||||
ps_secondary.http_client().tenant_secondary_download(tenant_id)
|
||||
|
||||
try:
|
||||
assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers(
|
||||
tenant_id, timeline_id
|
||||
)
|
||||
assert list_elegible_layers(
|
||||
ps_attached, tenant_id, timeline_id
|
||||
) == ps_secondary.list_layers(tenant_id, timeline_id)
|
||||
except:
|
||||
# Do a full listing of the secondary location on errors, to help debug of
|
||||
# https://github.com/neondatabase/neon/issues/6966
|
||||
@@ -532,8 +562,8 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
|
||||
# ==================================================================
|
||||
try:
|
||||
log.info("Evicting a layer...")
|
||||
layer_to_evict = ps_attached.list_layers(tenant_id, timeline_id)[0]
|
||||
some_other_layer = ps_attached.list_layers(tenant_id, timeline_id)[1]
|
||||
layer_to_evict = list_elegible_layers(ps_attached, tenant_id, timeline_id)[0]
|
||||
some_other_layer = list_elegible_layers(ps_attached, tenant_id, timeline_id)[1]
|
||||
log.info(f"Victim layer: {layer_to_evict.name}")
|
||||
ps_attached.http_client().evict_layer(
|
||||
tenant_id, timeline_id, layer_name=layer_to_evict.name
|
||||
@@ -551,9 +581,9 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
|
||||
ps_secondary.http_client().tenant_secondary_download(tenant_id)
|
||||
|
||||
assert layer_to_evict not in ps_attached.list_layers(tenant_id, timeline_id)
|
||||
assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers(
|
||||
tenant_id, timeline_id
|
||||
)
|
||||
assert list_elegible_layers(
|
||||
ps_attached, tenant_id, timeline_id
|
||||
) == ps_secondary.list_layers(tenant_id, timeline_id)
|
||||
except:
|
||||
# On assertion failures, log some details to help with debugging
|
||||
heatmap = env.pageserver_remote_storage.heatmap_content(tenant_id)
|
||||
@@ -563,7 +593,8 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
|
||||
# Scrub the remote storage
|
||||
# ========================
|
||||
# This confirms that the scrubber isn't upset by the presence of the heatmap
|
||||
env.storage_scrubber.scan_metadata()
|
||||
healthy, _ = env.storage_scrubber.scan_metadata()
|
||||
assert healthy
|
||||
|
||||
# Detach secondary and delete tenant
|
||||
# ===================================
|
||||
|
||||
@@ -12,7 +12,6 @@ from fixtures.neon_fixtures import (
|
||||
NeonEnvBuilder,
|
||||
wait_for_last_flush_lsn,
|
||||
)
|
||||
from fixtures.pageserver.common_types import parse_layer_file_name
|
||||
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
|
||||
from fixtures.pageserver.utils import (
|
||||
timeline_delete_wait_completed,
|
||||
@@ -313,6 +312,7 @@ def test_remote_storage_upload_queue_retries(
|
||||
|
||||
def churn_while_failpoints_active(result):
|
||||
overwrite_data_and_wait_for_it_to_arrive_at_pageserver("c")
|
||||
# this call will wait for the failpoints to be turned off
|
||||
client.timeline_checkpoint(tenant_id, timeline_id)
|
||||
client.timeline_compact(tenant_id, timeline_id)
|
||||
overwrite_data_and_wait_for_it_to_arrive_at_pageserver("d")
|
||||
@@ -332,8 +332,8 @@ def test_remote_storage_upload_queue_retries(
|
||||
# Exponential back-off in upload queue, so, gracious timeouts.
|
||||
|
||||
wait_until(30, 1, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0))
|
||||
wait_until(30, 1, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 2))
|
||||
wait_until(30, 1, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="delete"), 0))
|
||||
wait_until(30, 1, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 1))
|
||||
wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0))
|
||||
|
||||
# unblock churn operations
|
||||
configure_storage_sync_failpoints("off")
|
||||
@@ -769,11 +769,11 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv
|
||||
create_thread.join()
|
||||
|
||||
|
||||
def test_compaction_waits_for_upload(
|
||||
def test_paused_upload_stalls_checkpoint(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
):
|
||||
"""
|
||||
This test forces a race between upload and compaction.
|
||||
This test checks that checkpoints block on uploads to remote storage.
|
||||
"""
|
||||
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
|
||||
|
||||
@@ -788,6 +788,10 @@ def test_compaction_waits_for_upload(
|
||||
}
|
||||
)
|
||||
|
||||
env.pageserver.allowed_errors.append(
|
||||
f".*PUT.* path=/v1/tenant/{env.initial_tenant}/timeline.* request was dropped before completing"
|
||||
)
|
||||
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.initial_timeline
|
||||
|
||||
@@ -808,76 +812,9 @@ def test_compaction_waits_for_upload(
|
||||
endpoint.safe_psql("CREATE TABLE foo AS SELECT x FROM generate_series(1, 10000) g(x)")
|
||||
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
|
||||
|
||||
client.timeline_checkpoint(tenant_id, timeline_id)
|
||||
deltas_at_first = len(client.layer_map_info(tenant_id, timeline_id).delta_layers())
|
||||
assert (
|
||||
deltas_at_first == 2
|
||||
), "are you fixing #5863? just add one more checkpoint after 'CREATE TABLE bar ...' statement."
|
||||
|
||||
endpoint.safe_psql("CREATE TABLE bar AS SELECT x FROM generate_series(1, 10000) g(x)")
|
||||
endpoint.safe_psql("UPDATE foo SET x = 0 WHERE x = 1")
|
||||
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
|
||||
|
||||
layers_before_last_checkpoint = client.layer_map_info(tenant_id, timeline_id).historic_by_name()
|
||||
upload_stuck_layers = layers_before_last_checkpoint - layers_at_creation.historic_by_name()
|
||||
|
||||
assert len(upload_stuck_layers) > 0
|
||||
|
||||
for name in upload_stuck_layers:
|
||||
assert env.pageserver.layer_exists(
|
||||
tenant_id, timeline_id, parse_layer_file_name(name)
|
||||
), "while uploads are stuck the layers should be present on disk"
|
||||
|
||||
# now this will do the L0 => L1 compaction and want to remove
|
||||
# upload_stuck_layers and the original initdb L0
|
||||
client.timeline_checkpoint(tenant_id, timeline_id)
|
||||
|
||||
# as uploads are paused, the upload_stuck_layers should still be with us
|
||||
for name in upload_stuck_layers:
|
||||
assert env.pageserver.layer_exists(
|
||||
tenant_id, timeline_id, parse_layer_file_name(name)
|
||||
), "uploads are stuck still over compaction"
|
||||
|
||||
compacted_layers = client.layer_map_info(tenant_id, timeline_id).historic_by_name()
|
||||
overlap = compacted_layers.intersection(upload_stuck_layers)
|
||||
assert len(overlap) == 0, "none of the L0's should remain after L0 => L1 compaction"
|
||||
assert (
|
||||
len(compacted_layers) == 1
|
||||
), "there should be one L1 after L0 => L1 compaction (without #5863 being fixed)"
|
||||
|
||||
def layer_deletes_completed():
|
||||
m = client.get_metric_value("pageserver_layer_completed_deletes_total")
|
||||
if m is None:
|
||||
return 0
|
||||
return int(m)
|
||||
|
||||
# if initdb created an initial delta layer, it might already be gc'd
|
||||
# because it was uploaded before the failpoint was enabled. however, the
|
||||
# deletion is not guaranteed to be complete.
|
||||
assert layer_deletes_completed() <= 1
|
||||
|
||||
client.configure_failpoints(("before-upload-layer-pausable", "off"))
|
||||
|
||||
# Ensure that this actually terminates
|
||||
wait_upload_queue_empty(client, tenant_id, timeline_id)
|
||||
|
||||
def until_layer_deletes_completed():
|
||||
deletes = layer_deletes_completed()
|
||||
log.info(f"layer_deletes: {deletes}")
|
||||
# ensure that initdb delta layer AND the previously stuck are now deleted
|
||||
assert deletes >= len(upload_stuck_layers) + 1
|
||||
|
||||
wait_until(10, 1, until_layer_deletes_completed)
|
||||
|
||||
for name in upload_stuck_layers:
|
||||
assert not env.pageserver.layer_exists(
|
||||
tenant_id, timeline_id, parse_layer_file_name(name)
|
||||
), "l0 should now be removed because of L0 => L1 compaction and completed uploads"
|
||||
|
||||
# We should not have hit the error handling path in uploads where a uploaded file is gone
|
||||
assert not env.pageserver.log_contains(
|
||||
"File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more."
|
||||
)
|
||||
with pytest.raises(ReadTimeout):
|
||||
client.timeline_checkpoint(tenant_id, timeline_id, timeout=5)
|
||||
client.configure_failpoints(("before-upload-layer-pausable", "off"))
|
||||
|
||||
|
||||
def wait_upload_queue_empty(
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user