Compare commits

..

1 Commits

Author SHA1 Message Date
Dmitry Rodionov
daac088c5e add plumber tool 2023-08-18 19:33:45 +03:00
105 changed files with 2937 additions and 6589 deletions

View File

@@ -145,11 +145,7 @@ runs:
if [ "${RERUN_FLAKY}" == "true" ]; then
mkdir -p $TEST_OUTPUT
poetry run ./scripts/flaky_tests.py "${TEST_RESULT_CONNSTR}" \
--days 7 \
--output "$TEST_OUTPUT/flaky.json" \
--pg-version "${DEFAULT_PG_VERSION}" \
--build-type "${BUILD_TYPE}"
poetry run ./scripts/flaky_tests.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/flaky.json"
EXTRA_PARAMS="--flaky-tests-json $TEST_OUTPUT/flaky.json $EXTRA_PARAMS"
fi

View File

@@ -737,6 +737,34 @@ jobs:
--destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
--cleanup
# Due to a kaniko bug, we can't use cache for extensions image, thus it takes about the same amount of time as compute-node image to build (~10 min)
# During the transition period we need to have extensions in both places (in S3 and in compute-node image),
# so we won't build extension twice, but extract them from compute-node.
#
# For now we use extensions image only for new custom extensitons
- name: Kaniko build extensions only
run: |
# Kaniko is suposed to clean up after itself if --cleanup flag is set, but it doesn't.
# Despite some fixes were made in https://github.com/GoogleContainerTools/kaniko/pull/2504 (in kaniko v1.11.0),
# it still fails with error:
# error building image: could not save file: copying file: symlink postgres /kaniko/1/usr/local/pgsql/bin/postmaster: file exists
#
# Ref https://github.com/GoogleContainerTools/kaniko/issues/1406
find /kaniko -maxdepth 1 -mindepth 1 -type d -regex "/kaniko/[0-9]*" -exec rm -rv {} \;
/kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true \
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
--context . \
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} \
--build-arg PG_VERSION=${{ matrix.version }} \
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} \
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com \
--dockerfile Dockerfile.compute-node \
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
--destination neondatabase/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
--cleanup \
--target postgres-extensions
# Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
- name: Cleanup ECR folder
run: rm -rf ~/.ecr
@@ -752,7 +780,7 @@ jobs:
run:
shell: sh -eu {0}
env:
VM_BUILDER_VERSION: v0.17.5
VM_BUILDER_VERSION: v0.15.4
steps:
- name: Checkout
@@ -773,11 +801,7 @@ jobs:
- name: Build vm image
run: |
./vm-builder \
-enable-file-cache \
-cgroup-uid=postgres \
-src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
-dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
./vm-builder -enable-file-cache -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
- name: Pushing vm-compute-node image
run: |
@@ -858,8 +882,10 @@ jobs:
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:${{needs.tag.outputs.build-tag}} latest
- name: Push images to production ECR
if: |
@@ -870,8 +896,10 @@ jobs:
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:latest
- name: Configure Docker Hub login
run: |
@@ -893,56 +921,65 @@ jobs:
crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/extensions-v14:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/extensions-v15:${{needs.tag.outputs.build-tag}} latest
- name: Cleanup ECR folder
run: rm -rf ~/.ecr
build-private-extensions:
runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
options: --init
needs: [ tag ]
upload-postgres-extensions-to-s3:
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
runs-on: ${{ github.ref_name == 'release' && fromJSON('["self-hosted", "prod", "x64"]') || fromJSON('["self-hosted", "gen3", "small"]') }}
needs: [ tag, promote-images ]
strategy:
fail-fast: false
matrix:
version: [ v14, v15 ]
env:
EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }}
S3_BUCKETS: ${{ github.ref_name == 'release' && vars.S3_EXTENSIONS_BUCKETS_PROD || vars.S3_EXTENSIONS_BUCKETS_DEV }}
steps:
- name: Set PR's status to pending and request a remote CI test
- name: Pull postgres-extensions image
run: |
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
REMOTE_REPO="${{ github.repository_owner }}/build-custom-extensions"
docker pull ${EXTENSIONS_IMAGE}
curl -f -X POST \
https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
-H "Accept: application/vnd.github.v3+json" \
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
--data \
"{
\"state\": \"pending\",
\"context\": \"build-and-upload-extensions\",
\"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
}"
- name: Create postgres-extensions container
id: create-container
run: |
EID=$(docker create ${EXTENSIONS_IMAGE} true)
echo "EID=${EID}" >> $GITHUB_OUTPUT
curl -f -X POST \
https://api.github.com/repos/$REMOTE_REPO/actions/workflows/build_and_upload_extensions.yml/dispatches \
-H "Accept: application/vnd.github.v3+json" \
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
--data \
"{
\"ref\": \"main\",
\"inputs\": {
\"ci_job_name\": \"build-and-upload-extensions\",
\"commit_hash\": \"$COMMIT_SHA\",
\"remote_repo\": \"${{ github.repository }}\",
\"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
\"remote_branch_name\": \"${{ github.ref_name }}\"
}
}"
- name: Extract postgres-extensions from container
run: |
rm -rf ./extensions-to-upload # Just in case
mkdir -p extensions-to-upload
docker cp ${{ steps.create-container.outputs.EID }}:/extensions/ ./extensions-to-upload/
docker cp ${{ steps.create-container.outputs.EID }}:/ext_index.json ./extensions-to-upload/
- name: Upload postgres-extensions to S3
run: |
for BUCKET in $(echo ${S3_BUCKETS:-[]} | jq --raw-output '.[]'); do
aws s3 cp --recursive --only-show-errors ./extensions-to-upload s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }}
done
- name: Cleanup
if: ${{ always() && steps.create-container.outputs.EID }}
run: |
docker rm ${{ steps.create-container.outputs.EID }} || true
deploy:
runs-on: [ self-hosted, gen3, small ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
needs: [ promote-images, tag, regress-tests ]
needs: [ upload-postgres-extensions-to-s3, promote-images, tag, regress-tests ]
if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
steps:
- name: Fix git ownership

View File

@@ -1,12 +1,11 @@
/compute_tools/ @neondatabase/control-plane @neondatabase/compute
/compute_tools/ @neondatabase/control-plane
/control_plane/ @neondatabase/compute @neondatabase/storage
/libs/pageserver_api/ @neondatabase/compute @neondatabase/storage
/libs/postgres_ffi/ @neondatabase/compute
/libs/remote_storage/ @neondatabase/storage
/libs/safekeeper_api/ @neondatabase/safekeepers
/libs/vm_monitor/ @neondatabase/autoscaling @neondatabase/compute
/pageserver/ @neondatabase/compute @neondatabase/storage
/libs/postgres_ffi/ @neondatabase/compute
/libs/remote_storage/ @neondatabase/storage
/libs/safekeeper_api/ @neondatabase/safekeepers
/pageserver/ @neondatabase/compute @neondatabase/storage
/pgxn/ @neondatabase/compute
/proxy/ @neondatabase/proxy
/proxy/ @neondatabase/control-plane
/safekeeper/ @neondatabase/safekeepers
/vendor/ @neondatabase/compute

246
Cargo.lock generated
View File

@@ -190,7 +190,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.28",
"syn 2.0.16",
]
[[package]]
@@ -201,7 +201,7 @@ checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.28",
"syn 2.0.16",
]
[[package]]
@@ -553,13 +553,12 @@ dependencies = [
[[package]]
name = "axum"
version = "0.6.20"
version = "0.6.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b829e4e32b91e643de6eafe82b1d90675f5874230191a4ffbc1b336dec4d6bf"
checksum = "f8175979259124331c1d7bf6586ee7e0da434155e4b2d48ec2c8386281d8df39"
dependencies = [
"async-trait",
"axum-core",
"base64 0.21.1",
"bitflags",
"bytes",
"futures-util",
@@ -574,13 +573,7 @@ dependencies = [
"pin-project-lite",
"rustversion",
"serde",
"serde_json",
"serde_path_to_error",
"serde_urlencoded",
"sha1",
"sync_wrapper",
"tokio",
"tokio-tungstenite 0.20.0",
"tower",
"tower-layer",
"tower-service",
@@ -680,7 +673,7 @@ dependencies = [
"regex",
"rustc-hash",
"shlex",
"syn 2.0.28",
"syn 2.0.16",
"which",
]
@@ -772,19 +765,6 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "cgroups-rs"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fb3af90c8d48ad5f432d8afb521b5b40c2a2fce46dd60e05912de51c47fba64"
dependencies = [
"libc",
"log",
"nix 0.25.1",
"regex",
"thiserror",
]
[[package]]
name = "chrono"
version = "0.4.24"
@@ -869,7 +849,7 @@ dependencies = [
"heck",
"proc-macro2",
"quote",
"syn 2.0.28",
"syn 2.0.16",
]
[[package]]
@@ -927,7 +907,6 @@ version = "0.1.0"
dependencies = [
"anyhow",
"async-compression",
"cfg-if",
"chrono",
"clap",
"compute_api",
@@ -946,7 +925,6 @@ dependencies = [
"tar",
"tokio",
"tokio-postgres",
"tokio-util",
"toml_edit",
"tracing",
"tracing-opentelemetry",
@@ -954,7 +932,6 @@ dependencies = [
"tracing-utils",
"url",
"utils",
"vm_monitor",
"workspace_hack",
"zstd",
]
@@ -1001,7 +978,7 @@ dependencies = [
"comfy-table",
"compute_api",
"git-version",
"nix 0.26.2",
"nix",
"once_cell",
"pageserver_api",
"postgres",
@@ -1207,7 +1184,7 @@ dependencies = [
"proc-macro2",
"quote",
"strsim",
"syn 2.0.28",
"syn 2.0.16",
]
[[package]]
@@ -1218,7 +1195,7 @@ checksum = "29a358ff9f12ec09c3e61fef9b5a9902623a695a46a917b07f269bff1445611a"
dependencies = [
"darling_core",
"quote",
"syn 2.0.28",
"syn 2.0.16",
]
[[package]]
@@ -1283,7 +1260,7 @@ checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.28",
"syn 2.0.16",
]
[[package]]
@@ -1339,7 +1316,7 @@ dependencies = [
"darling",
"proc-macro2",
"quote",
"syn 2.0.28",
"syn 2.0.16",
]
[[package]]
@@ -1535,7 +1512,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.28",
"syn 2.0.16",
]
[[package]]
@@ -1886,8 +1863,8 @@ dependencies = [
"hyper",
"pin-project",
"tokio",
"tokio-tungstenite 0.18.0",
"tungstenite 0.18.0",
"tokio-tungstenite",
"tungstenite",
]
[[package]]
@@ -1951,19 +1928,6 @@ dependencies = [
"libc",
]
[[package]]
name = "inotify"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fdd168d97690d0b8c412d6b6c10360277f4d7ee495c5d0d5d5fe0854923255cc"
dependencies = [
"bitflags",
"futures-core",
"inotify-sys",
"libc",
"tokio",
]
[[package]]
name = "inotify-sys"
version = "0.1.5"
@@ -2287,18 +2251,6 @@ dependencies = [
"tempfile",
]
[[package]]
name = "nix"
version = "0.25.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f346ff70e7dbfd675fe90590b92d59ef2de15a8779ae305ebcbfd3f0caf59be4"
dependencies = [
"autocfg",
"bitflags",
"cfg-if",
"libc",
]
[[package]]
name = "nix"
version = "0.26.2"
@@ -2333,7 +2285,7 @@ dependencies = [
"crossbeam-channel",
"filetime",
"fsevent-sys",
"inotify 0.9.6",
"inotify",
"kqueue",
"libc",
"mio",
@@ -2341,15 +2293,6 @@ dependencies = [
"windows-sys 0.45.0",
]
[[package]]
name = "ntapi"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8a3895c6391c39d7fe7ebc444a87eb2991b2a0bc718fdabd071eec617fc68e4"
dependencies = [
"winapi",
]
[[package]]
name = "num-bigint"
version = "0.4.3"
@@ -2443,7 +2386,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.28",
"syn 2.0.16",
]
[[package]]
@@ -2630,7 +2573,7 @@ dependencies = [
"hyper",
"itertools",
"metrics",
"nix 0.26.2",
"nix",
"num-traits",
"num_cpus",
"once_cell",
@@ -2653,7 +2596,6 @@ dependencies = [
"serde_json",
"serde_with",
"signal-hook",
"smallvec",
"storage_broker",
"strum",
"strum_macros",
@@ -2831,7 +2773,7 @@ checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.28",
"syn 2.0.16",
]
[[package]]
@@ -3028,7 +2970,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1"
dependencies = [
"proc-macro2",
"syn 2.0.28",
"syn 2.0.16",
]
[[package]]
@@ -3039,9 +2981,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"
[[package]]
name = "proc-macro2"
version = "1.0.66"
version = "1.0.64"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9"
checksum = "78803b62cbf1f46fde80d7c0e803111524b9877184cfe7c3033659490ac7a7da"
dependencies = [
"unicode-ident",
]
@@ -3203,9 +3145,9 @@ dependencies = [
[[package]]
name = "quote"
version = "1.0.32"
version = "1.0.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "50f3b39ccfb720540debaa0164757101c08ecb8d326b15358ce76a62c7e85965"
checksum = "8f4f29d145265ec1c483c7c654450edde0bfe043d3938d6972630663356d9500"
dependencies = [
"proc-macro2",
]
@@ -3627,9 +3569,9 @@ dependencies = [
[[package]]
name = "rustls-webpki"
version = "0.100.2"
version = "0.100.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e98ff011474fa39949b7e5c0428f9b4937eda7da7848bbb947786b7be0b27dab"
checksum = "d6207cd5ed3d8dca7816f8f3725513a34609c0c765bf652b8c3cb4cfd87db46b"
dependencies = [
"ring",
"untrusted",
@@ -3856,22 +3798,22 @@ dependencies = [
[[package]]
name = "serde"
version = "1.0.183"
version = "1.0.163"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32ac8da02677876d532745a130fc9d8e6edfa81a269b107c5b00829b91d8eb3c"
checksum = "2113ab51b87a539ae008b5c6c02dc020ffa39afd2d83cffcb3f4eb2722cebec2"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.183"
version = "1.0.163"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816"
checksum = "8c805777e3930c8883389c602315a24224bcc738b63905ef87cd1420353ea93e"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.28",
"syn 2.0.16",
]
[[package]]
@@ -3885,16 +3827,6 @@ dependencies = [
"serde",
]
[[package]]
name = "serde_path_to_error"
version = "0.1.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4beec8bce849d58d06238cb50db2e1c417cfeafa4c63f692b15c82b7c80f8335"
dependencies = [
"itoa",
"serde",
]
[[package]]
name = "serde_spanned"
version = "0.6.2"
@@ -3941,7 +3873,7 @@ dependencies = [
"darling",
"proc-macro2",
"quote",
"syn 2.0.28",
"syn 2.0.16",
]
[[package]]
@@ -4040,9 +3972,9 @@ dependencies = [
[[package]]
name = "smallvec"
version = "1.11.0"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9"
checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0"
[[package]]
name = "socket2"
@@ -4179,9 +4111,9 @@ dependencies = [
[[package]]
name = "syn"
version = "2.0.28"
version = "2.0.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04361975b3f5e348b2189d8dc55bc942f278b2d482a6a0365de5bdd62d351567"
checksum = "a6f671d4b5ffdb8eadec19c0ae67fe2639df8684bd7bc4b83d986b8db549cf01"
dependencies = [
"proc-macro2",
"quote",
@@ -4206,30 +4138,15 @@ dependencies = [
"unicode-xid",
]
[[package]]
name = "sysinfo"
version = "0.29.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "165d6d8539689e3d3bc8b98ac59541e1f21c7de7c85d60dc80e43ae0ed2113db"
dependencies = [
"cfg-if",
"core-foundation-sys",
"libc",
"ntapi",
"once_cell",
"rayon",
"winapi",
]
[[package]]
name = "tar"
version = "0.4.40"
version = "0.4.38"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b16afcea1f22891c49a00c751c7b63b2233284064f11a200fc624137c51e2ddb"
checksum = "4b55807c0344e1e6c04d7c965f5289c39a8d94ae23ed5c0b57aabac549f871c6"
dependencies = [
"filetime",
"libc",
"xattr",
"xattr 0.2.3",
]
[[package]]
@@ -4311,7 +4228,7 @@ checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.28",
"syn 2.0.16",
]
[[package]]
@@ -4426,7 +4343,7 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.28",
"syn 2.0.16",
]
[[package]]
@@ -4520,7 +4437,7 @@ dependencies = [
"redox_syscall 0.3.5",
"tokio",
"tokio-stream",
"xattr",
"xattr 1.0.0",
]
[[package]]
@@ -4532,19 +4449,7 @@ dependencies = [
"futures-util",
"log",
"tokio",
"tungstenite 0.18.0",
]
[[package]]
name = "tokio-tungstenite"
version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b2dbec703c26b00d74844519606ef15d09a7d6857860f84ad223dec002ddea2"
dependencies = [
"futures-util",
"log",
"tokio",
"tungstenite 0.20.0",
"tungstenite",
]
[[package]]
@@ -4736,7 +4641,7 @@ checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.28",
"syn 2.0.16",
]
[[package]]
@@ -4865,25 +4770,6 @@ dependencies = [
"utf-8",
]
[[package]]
name = "tungstenite"
version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e862a1c4128df0112ab625f55cd5c934bcb4312ba80b39ae4b4835a3fd58e649"
dependencies = [
"byteorder",
"bytes",
"data-encoding",
"http",
"httparse",
"log",
"rand",
"sha1",
"thiserror",
"url",
"utf-8",
]
[[package]]
name = "typenum"
version = "1.16.0"
@@ -5011,10 +4897,9 @@ dependencies = [
"hyper",
"jsonwebtoken",
"metrics",
"nix 0.26.2",
"nix",
"once_cell",
"pin-project-lite",
"postgres_connection",
"pq_proto",
"rand",
"regex",
@@ -5030,7 +4915,6 @@ dependencies = [
"thiserror",
"tokio",
"tokio-stream",
"tokio-util",
"tracing",
"tracing-error",
"tracing-subscriber",
@@ -5067,28 +4951,6 @@ version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
[[package]]
name = "vm_monitor"
version = "0.1.0"
dependencies = [
"anyhow",
"axum",
"cgroups-rs",
"clap",
"futures",
"inotify 0.10.2",
"serde",
"serde_json",
"sysinfo",
"tokio",
"tokio-postgres",
"tokio-stream",
"tokio-util",
"tracing",
"tracing-subscriber",
"workspace_hack",
]
[[package]]
name = "vsimd"
version = "0.8.0"
@@ -5159,7 +5021,7 @@ dependencies = [
"once_cell",
"proc-macro2",
"quote",
"syn 2.0.28",
"syn 2.0.16",
"wasm-bindgen-shared",
]
@@ -5193,7 +5055,7 @@ checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.28",
"syn 2.0.16",
"wasm-bindgen-backend",
"wasm-bindgen-shared",
]
@@ -5478,14 +5340,12 @@ name = "workspace_hack"
version = "0.1.0"
dependencies = [
"anyhow",
"axum",
"bytes",
"cc",
"chrono",
"clap",
"clap_builder",
"crossbeam-utils",
"digest",
"either",
"fail",
"futures",
@@ -5494,7 +5354,6 @@ dependencies = [
"futures-executor",
"futures-sink",
"futures-util",
"hyper",
"itertools",
"libc",
"log",
@@ -5513,10 +5372,9 @@ dependencies = [
"scopeguard",
"serde",
"serde_json",
"smallvec",
"socket2 0.4.9",
"syn 1.0.109",
"syn 2.0.28",
"syn 2.0.16",
"tokio",
"tokio-rustls 0.23.4",
"tokio-util",
@@ -5525,6 +5383,7 @@ dependencies = [
"tower",
"tracing",
"tracing-core",
"tracing-subscriber",
"url",
]
@@ -5545,6 +5404,15 @@ dependencies = [
"time",
]
[[package]]
name = "xattr"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d1526bbe5aaeb5eb06885f4d987bcdfa5e23187055de9b83fe00156a821fabc"
dependencies = [
"libc",
]
[[package]]
name = "xattr"
version = "1.0.0"

View File

@@ -23,7 +23,6 @@ members = [
"libs/remote_storage",
"libs/tracing-utils",
"libs/postgres_ffi/wal_craft",
"libs/vm_monitor",
]
[workspace.package]
@@ -42,14 +41,12 @@ aws-sdk-s3 = "0.27"
aws-smithy-http = "0.55"
aws-credential-types = "0.55"
aws-types = "0.55"
axum = { version = "0.6.20", features = ["ws"] }
base64 = "0.13.0"
bincode = "1.3"
bindgen = "0.65"
bstr = "1.0"
byteorder = "1.4"
bytes = "1.0"
cfg-if = "1.0.0"
chrono = { version = "0.4", default-features = false, features = ["clock"] }
clap = { version = "4.0", features = ["derive"] }
close_fds = "0.3.2"
@@ -77,7 +74,6 @@ humantime = "2.1"
humantime-serde = "1.1.1"
hyper = "0.14"
hyper-tungstenite = "0.9"
inotify = "0.10.2"
itertools = "0.10"
jsonwebtoken = "8"
libc = "0.2"
@@ -109,14 +105,12 @@ rustls = "0.20"
rustls-pemfile = "1"
rustls-split = "0.3"
scopeguard = "1.1"
sysinfo = "0.29.2"
sentry = { version = "0.30", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1"
serde_with = "2.0"
sha2 = "0.10.2"
signal-hook = "0.3"
smallvec = "1.11"
socket2 = "0.5"
strum = "0.24"
strum_macros = "0.24"
@@ -139,7 +133,7 @@ tonic = {version = "0.9", features = ["tls", "tls-roots"]}
tracing = "0.1"
tracing-error = "0.2.0"
tracing-opentelemetry = "0.19.0"
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter"] }
url = "2.2"
uuid = { version = "1.2", features = ["v4", "serde"] }
walkdir = "2.3.2"
@@ -175,7 +169,6 @@ storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main br
tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
utils = { version = "0.1", path = "./libs/utils/" }
vm_monitor = { version = "0.1", path = "./libs/vm_monitor/" }
## Common library dependency
workspace_hack = { version = "0.1", path = "./workspace_hack/" }

View File

@@ -211,8 +211,8 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
FROM build-deps AS vector-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.0.tar.gz -O pgvector.tar.gz && \
echo "d8aa3504b215467ca528525a6de12c3f85f9891b091ce0e5864dd8a9b757f77b pgvector.tar.gz" | sha256sum --check && \
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.4.4.tar.gz -O pgvector.tar.gz && \
echo "1cb70a63f8928e396474796c22a20be9f7285a8a013009deb8152445b61b72e6 pgvector.tar.gz" | sha256sum --check && \
mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -764,6 +764,29 @@ RUN rm -r /usr/local/pgsql/include
# if they were to be used by other libraries.
RUN rm /usr/local/pgsql/lib/lib*.a
#########################################################################################
#
# Extenstion only
#
#########################################################################################
FROM python:3.9-slim-bullseye AS generate-ext-index
ARG PG_VERSION
ARG BUILD_TAG
RUN apt update && apt install -y zstd
# copy the control files here
COPY --from=kq-imcx-pg-build /extensions/ /extensions/
COPY --from=pg-anon-pg-build /extensions/ /extensions/
COPY --from=postgis-build /extensions/ /extensions/
COPY scripts/combine_control_files.py ./combine_control_files.py
RUN python3 ./combine_control_files.py ${PG_VERSION} ${BUILD_TAG} --public_extensions="anon,postgis"
FROM scratch AS postgres-extensions
# After the transition this layer will include all extensitons.
# As for now, it's only a couple for testing purposses
COPY --from=generate-ext-index /extensions/*.tar.zst /extensions/
COPY --from=generate-ext-index /ext_index.json /ext_index.json
#########################################################################################
#
# Final layer

View File

@@ -8,7 +8,6 @@ license.workspace = true
anyhow.workspace = true
async-compression.workspace = true
chrono.workspace = true
cfg-if.workspace = true
clap.workspace = true
flate2.workspace = true
futures.workspace = true
@@ -24,7 +23,6 @@ tar.workspace = true
reqwest = { workspace = true, features = ["json"] }
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
tokio-postgres.workspace = true
tokio-util.workspace = true
tracing.workspace = true
tracing-opentelemetry.workspace = true
tracing-subscriber.workspace = true
@@ -36,5 +34,4 @@ utils.workspace = true
workspace_hack.workspace = true
toml_edit.workspace = true
remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
zstd = "0.12.4"

View File

@@ -19,10 +19,9 @@ Also `compute_ctl` spawns two separate service threads:
- `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
last activity requests.
If `AUTOSCALING` environment variable is set, `compute_ctl` will start the
`vm-monitor` located in [`neon/libs/vm_monitor`]. For VM compute nodes,
`vm-monitor` communicates with the VM autoscaling system. It coordinates
downscaling and requests immediate upscaling under resource pressure.
If the `vm-informant` binary is present at `/bin/vm-informant`, it will also be started. For VM
compute nodes, `vm-informant` communicates with the VM autoscaling system. It coordinates
downscaling and (eventually) will request immediate upscaling under resource pressure.
Usage example:
```sh

View File

@@ -20,10 +20,9 @@
//! - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
//! last activity requests.
//!
//! If `AUTOSCALING` environment variable is set, `compute_ctl` will start the
//! `vm-monitor` located in [`neon/libs/vm_monitor`]. For VM compute nodes,
//! `vm-monitor` communicates with the VM autoscaling system. It coordinates
//! downscaling and requests immediate upscaling under resource pressure.
//! If the `vm-informant` binary is present at `/bin/vm-informant`, it will also be started. For VM
//! compute nodes, `vm-informant` communicates with the VM autoscaling system. It coordinates
//! downscaling and (eventually) will request immediate upscaling under resource pressure.
//!
//! Usage example:
//! ```sh
@@ -36,6 +35,7 @@
//!
use std::collections::HashMap;
use std::fs::File;
use std::panic;
use std::path::Path;
use std::process::exit;
use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock};
@@ -271,57 +271,6 @@ fn main() -> Result<()> {
}
};
// Start the vm-monitor if directed to. The vm-monitor only runs on linux
// because it requires cgroups.
cfg_if::cfg_if! {
if #[cfg(target_os = "linux")] {
use std::env;
use tokio_util::sync::CancellationToken;
use tracing::warn;
let vm_monitor_addr = matches.get_one::<String>("vm-monitor-addr");
let file_cache_connstr = matches.get_one::<String>("filecache-connstr");
let cgroup = matches.get_one::<String>("cgroup");
let file_cache_on_disk = matches.get_flag("file-cache-on-disk");
// Only make a runtime if we need to.
// Note: it seems like you can make a runtime in an inner scope and
// if you start a task in it it won't be dropped. However, make it
// in the outermost scope just to be safe.
let rt = match (env::var_os("AUTOSCALING"), vm_monitor_addr) {
(None, None) => None,
(None, Some(_)) => {
warn!("--vm-monitor-addr option set but AUTOSCALING env var not present");
None
}
(Some(_), None) => {
panic!("AUTOSCALING env var present but --vm-monitor-addr option not set")
}
(Some(_), Some(_)) => Some(
tokio::runtime::Builder::new_multi_thread()
.worker_threads(4)
.enable_all()
.build()
.expect("failed to create tokio runtime for monitor"),
),
};
// This token is used internally by the monitor to clean up all threads
let token = CancellationToken::new();
let vm_monitor = &rt.as_ref().map(|rt| {
rt.spawn(vm_monitor::start(
Box::leak(Box::new(vm_monitor::Args {
cgroup: cgroup.cloned(),
pgconnstr: file_cache_connstr.cloned(),
addr: vm_monitor_addr.cloned().unwrap(),
file_cache_on_disk,
})),
token.clone(),
))
});
}
}
// Wait for the child Postgres process forever. In this state Ctrl+C will
// propagate to Postgres and it will be shut down as well.
if let Some(mut pg) = pg {
@@ -335,24 +284,6 @@ fn main() -> Result<()> {
exit_code = ecode.code()
}
// Terminate the vm_monitor so it releases the file watcher on
// /sys/fs/cgroup/neon-postgres.
// Note: the vm-monitor only runs on linux because it requires cgroups.
cfg_if::cfg_if! {
if #[cfg(target_os = "linux")] {
if let Some(handle) = vm_monitor {
// Kills all threads spawned by the monitor
token.cancel();
// Kills the actual task running the monitor
handle.abort();
// If handle is some, rt must have been used to produce it, and
// hence is also some
rt.unwrap().shutdown_timeout(Duration::from_secs(2));
}
}
}
// Maybe sync safekeepers again, to speed up next startup
let compute_state = compute.state.lock().unwrap().clone();
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
@@ -462,34 +393,6 @@ fn cli() -> clap::Command {
.long("remote-ext-config")
.value_name("REMOTE_EXT_CONFIG"),
)
// TODO(fprasx): we currently have default arguments because the cloud PR
// to pass them in hasn't been merged yet. We should get rid of them once
// the PR is merged.
.arg(
Arg::new("vm-monitor-addr")
.long("vm-monitor-addr")
.default_value("0.0.0.0:10301")
.value_name("VM_MONITOR_ADDR"),
)
.arg(
Arg::new("cgroup")
.long("cgroup")
.default_value("neon-postgres")
.value_name("CGROUP"),
)
.arg(
Arg::new("filecache-connstr")
.long("filecache-connstr")
.default_value(
"host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable",
)
.value_name("FILECACHE_CONNSTR"),
)
.arg(
Arg::new("file-cache-on-disk")
.long("file-cache-on-disk")
.action(clap::ArgAction::SetTrue),
)
}
#[test]

View File

@@ -1,5 +1,4 @@
use std::collections::HashMap;
use std::env;
use std::fs;
use std::io::BufRead;
use std::os::unix::fs::PermissionsExt;
@@ -176,27 +175,6 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
}
}
/// If we are a VM, returns a [`Command`] that will run in the `neon-postgres`
/// cgroup. Otherwise returns the default `Command::new(cmd)`
///
/// This function should be used to start postgres, as it will start it in the
/// neon-postgres cgroup if we are a VM. This allows autoscaling to control
/// postgres' resource usage. The cgroup will exist in VMs because vm-builder
/// creates it during the sysinit phase of its inittab.
fn maybe_cgexec(cmd: &str) -> Command {
// The cplane sets this env var for autoscaling computes.
// use `var_os` so we don't have to worry about the variable being valid
// unicode. Should never be an concern . . . but just in case
if env::var_os("AUTOSCALING").is_some() {
let mut command = Command::new("cgexec");
command.args(["-g", "memory:neon-postgres"]);
command.arg(cmd);
command
} else {
Command::new(cmd)
}
}
/// Create special neon_superuser role, that's a slightly nerfed version of a real superuser
/// that we give to customers
fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
@@ -473,7 +451,7 @@ impl ComputeNode {
pub fn sync_safekeepers(&self, storage_auth_token: Option<String>) -> Result<Lsn> {
let start_time = Utc::now();
let sync_handle = maybe_cgexec(&self.pgbin)
let sync_handle = Command::new(&self.pgbin)
.args(["--sync-safekeepers"])
.env("PGDATA", &self.pgdata) // we cannot use -D in this mode
.envs(if let Some(storage_auth_token) = &storage_auth_token {
@@ -608,7 +586,7 @@ impl ComputeNode {
// Start postgres
info!("starting postgres");
let mut pg = maybe_cgexec(&self.pgbin)
let mut pg = Command::new(&self.pgbin)
.args(["-D", pgdata])
.spawn()
.expect("cannot start postgres process");
@@ -636,7 +614,7 @@ impl ComputeNode {
let pgdata_path = Path::new(&self.pgdata);
// Run postgres as a child process.
let mut pg = maybe_cgexec(&self.pgbin)
let mut pg = Command::new(&self.pgbin)
.args(["-D", &self.pgdata])
.envs(if let Some(storage_auth_token) = &storage_auth_token {
vec![("NEON_AUTH_TOKEN", storage_auth_token)]

View File

@@ -4,12 +4,7 @@
# to your expectations and requirements.
# Root options
targets = [
{ triple = "x86_64-unknown-linux-gnu" },
{ triple = "aarch64-unknown-linux-gnu" },
{ triple = "aarch64-apple-darwin" },
{ triple = "x86_64-apple-darwin" },
]
targets = []
all-features = false
no-default-features = false
feature-depth = 1
@@ -23,7 +18,7 @@ vulnerability = "deny"
unmaintained = "warn"
yanked = "warn"
notice = "warn"
ignore = ["RUSTSEC-2023-0052"]
ignore = []
# This section is considered when running `cargo deny check licenses`
# More documentation for the licenses section can be found here:

View File

@@ -1,957 +0,0 @@
# Pageserver: split-brain safety for remote storage through generation numbers
## Summary
A scheme of logical "generation numbers" for tenant attachment to pageservers is proposed, along with
changes to the remote storage format to include these generation numbers in S3 keys.
Using the control plane as the issuer of these generation numbers enables strong anti-split-brain
properties in the pageserver cluster without implementing a consensus mechanism directly
in the pageservers.
## Motivation
Currently, the pageserver's remote storage format does not provide a mechanism for addressing
split brain conditions that may happen when replacing a node or when migrating
a tenant from one pageserver to another.
From a remote storage perspective, a split brain condition occurs whenever two nodes both think
they have the same tenant attached, and both can write to S3. This can happen in the case of a
network partition, pathologically long delays (e.g. suspended VM), or software bugs.
In the current deployment model, control plane guarantees that a tenant is attached to one
pageserver at a time, thereby ruling out split-brain conditions resulting from dual
attachment (however, there is always the risk of a control plane bug). This control
plane guarantee prevents robust response to failures, as if a pageserver is unresponsive
we may not detach from it. The mechanism in this RFC fixes this, by making it safe to
attach to a new, different pageserver even if an unresponsive pageserver may be running.
Futher, lack of safety during split-brain conditions blocks two important features where occasional
split-brain conditions are part of the design assumptions:
- seamless tenant migration ([RFC PR](https://github.com/neondatabase/neon/pull/5029))
- automatic pageserver instance failure handling (aka "failover") (RFC TBD)
### Prior art
- 020-pageserver-s3-coordination.md
- 023-the-state-of-pageserver-tenant-relocation.md
- 026-pageserver-s3-mvcc.md
This RFC has broad similarities to the proposal to implement a MVCC scheme in
S3 object names, but this RFC avoids a general purpose transaction scheme in
favour of more specialized "generations" that work like a transaction ID that
always has the same lifetime as a pageserver process or tenant attachment, whichever
is shorter.
## Requirements
- Accommodate storage backends with no atomic or fencing capability (i.e. work within
S3's limitation that there are no atomics and clients can't be fenced)
- Don't depend on any STONITH or node fencing in the compute layer (i.e. we will not
assume that we can reliably kill and EC2 instance and have it die)
- Scoped per-tenant, not per-pageserver; for _seamless tenant migration_, we need
per-tenant granularity, and for _failover_, we likely want to spread the workload
of the failed pageserver instance to a number of peers, rather than monolithically
moving the entire workload to another machine.
We do not rule out the latter case, but should not constrain ourselves to it.
## Design Tenets
These are not requirements, but are ideas that guide the following design:
- Avoid implementing another consensus system: we already have a strongly consistent
database in the control plane that can do atomic operations where needed, and we also
have a Paxos implementation in the safekeeper.
- Avoiding locking in to specific models of how failover will work (e.g. do not assume that
all the tenants on a pageserver will fail over as a unit).
- Be strictly correct when it comes to data integrity. Occasional failures of availability
are tolerable, occasional data loss is not.
## Non Goals
The changes in this RFC intentionally isolate the design decision of how to define
logical generations numbers and object storage format in a way that is somewhat flexible with
respect to how actual orchestration of failover works.
This RFC intentionally does not cover:
- Failure detection
- Orchestration of failover
- Standby modes to keep data ready for fast migration
- Intentional multi-writer operation on tenants (multi-writer scenarios are assumed to be transient split-brain situations).
- Sharding.
The interaction between this RFC and those features is discussed in [Appendix B](#appendix-b-interoperability-with-other-features)
## Impacted Components
pageserver, control plane, safekeeper (optional)
## Implementation Part 1: Correctness
### Summary
- A per-tenant **generation number** is introduced to uniquely identifying tenant attachments to pageserver processes.
- This generation number increments each time the control plane modifies a tenant (`Project`)'s assigned pageserver, or when the assigned pageserver restarts.
- the control plane is the authority for generation numbers: only it may
increment a generation number.
- **Object keys are suffixed** with the generation number
- **Safety for multiply-attached tenants** is provided by the
generation number in the object key: the competing pageservers will not
try to write to the same keys.
- **Safety in split brain for multiple nodes running with
the same node ID** is provided by the pageserver calling out to the control plane
on startup, to re-attach and thereby increment the generations of any attached tenants
- **Safety for deletions** is achieved by deferring the DELETE from S3 to a point in time where the deleting node has validated with control plane that no attachment with a higher generation has a reference to the to-be-DELETEd key.
- **The control plane is used to issue generation numbers** to avoid the need for
a built-in consensus system in the pageserver, although this could in principle
be changed without changing the storage format.
### Generation numbers
A generation number is associated with each tenant in the control plane,
and each time the attachment status of the tenant changes, this is incremented.
Changes in attachment status include:
- Attaching the tenant to a different pageserver
- A pageserver restarting, and "re-attaching" its tenants on startup
These increments of attachment generation provide invariants we need to avoid
split-brain issues in storage:
- If two pageservers have the same tenant attached, the attachments are guaranteed to have different generation numbers, because the generation would increment
while attaching the second one.
- If there are multiple pageservers running with the same node ID, all the attachments on all pageservers are guaranteed to have different generation numbers, because the generation would increment
when the second node started and re-attached its tenants.
As long as the infrastructure does not transparently replace an underlying
physical machine, we are totally safe. See the later [unsafe case](#unsafe-case-on-badly-behaved-infrastructure) section for details.
### Object Key Changes
#### Generation suffix
All object keys (layer objects and index objects) will contain the attachment
generation as a [suffix](#why-a-generation-suffix-rather-than-prefix).
This suffix is the primary mechanism for protecting against split-brain situations, and
enabling safe multi-attachment of tenants:
- Two pageservers running with the same node ID (e.g. after a failure, where there is
some rogue pageserver still running) will not try to write to the same objects, because at startup they will have re-attached tenants and thereby incremented
generation numbers.
- Multiple attachments (to different pageservers) of the same tenant will not try to write to the same objects, as each attachment would have a distinct generation.
The generation is appended in hex format (8 byte string representing
u32), to all our existing key names. A u32's range limit would permit
27 restarts _per second_ over a 5 year system lifetime: orders of magnitude more than
is realistic.
The exact meaning of the generation suffix can evolve over time if necessary, for
example if we chose to implement a failover mechanism internally to the pageservers
rather than going via the control plane. The storage format just sees it as a number,
with the only semantic property being that the highest numbered index is the latest.
#### Index changes
Since object keys now include a generation suffix, the index of these keys must also be updated. IndexPart currently stores keys and LSNs sufficient to reconstruct key names: this would be extended to store the generation as well.
This will increase the size of the file, but only modestly: layers are already encoded as
their string-ized form, so the overhead is about 10 bytes per layer. This will be less if/when
the index storage format is migrated to a binary format from JSON.
#### Visibility
_This section doesn't describe code changes, but extends on the consequences of the
object key changes given above_
##### Visibility of objects to pageservers
Pageservers can of course list objects in S3 at any time, but in practice their
visible set is based on the contents of their LayerMap, which is initialized
from the `index_part.json.???` that they load.
Starting with the `index_part` from the most recent previous generation
(see [loading index_part](#finding-the-remote-indices-for-timelines)), a pageserver
initially has visibility of all the objects that were referenced in the loaded index.
These objects are guaranteed to remain visible until the current generation is
superseded, via pageservers in older generations avoiding deletions (see [deletion](#deletion)).
The "most recent previous generation" is _not_ necessarily the most recent
in terms of walltime, it is the one that is readable at the time a new generation
starts. Consider the following sequence of a tenant being re-attached to different
pageserver nodes:
- Create + attach on PS1 in generation 1
- PS1 Do some work, write out index_part.json-0001
- Attach to PS2 in generation 2
- Read index_part.json-0001
- PS2 starts doing some work...
- Attach to PS3 in generation 3
- Read index_part.json-0001
- **...PS2 finishes its work: now it writes index_part.json-0002**
- PS3 writes out index_part.json-0003
In the above sequence, the ancestry of indices is:
```
0001 -> 0002
|
-> 0003
```
This is not an issue for safety: if the 0002 references some object that is
not in 0001, then 0003 simply does not see it, and will re-do whatever
work was required (e.g. ingesting WAL or doing compaction). Objects referenced
by only the 0002 index will never be read by future attachment generations, and
will eventually be cleaned up by a scrub (see [scrubbing](#cleaning-up-orphan-objects-scrubbing)).
##### Visibility of LSNs to clients
Because index_part.json is now written with a generation suffix, which data
is visible depends on which generation the reader is operating in:
- If one was passively reading from S3 from outside of a pageserver, the
visibility of data would depend on which index_part.json-<generation> file
one had chosen to read from.
- If two pageservers have the same tenant attached, they may have different
data visible as they're independently replaying the WAL, and maintaining
independent LayerMaps that are written to independent index_part.json files.
Data does not have to be remotely committed to be visible.
- For a pageserver writing with a stale generation, historic LSNs
remain readable until another pageserver (with a higher generation suffix)
decides to execute GC deletions. At this point, we may think of the stale
attachment's generation as having logically ended: during its existence
the generation had a consistent view of the world.
- For a newly attached pageserver, its highest visible LSN may appears to
go backwards with respect to an earlier attachment, if that earlier
attachment had not uploaded all data to S3 before the new attachment.
### Deletion
#### Generation number validation
While writes are de-conflicted by writers always using their own generation number in the key,
deletions are slightly more challenging: if a pageserver A is isolated, and the true active node is
pageserver B, then it is dangerous for A to do any object deletions, even of objects that it wrote
itself, because pageserver's B metadata might reference those objects.
We solve this by inserting a "generation validation" step between the write of a remote index
that un-links a particular object from the index, and the actual deletion of the object, such
that deletions strictly obey the following ordering:
1. Write out index_part.json: this guarantees that any subsequent reader of the metadata will
not try and read the object we unlinked.
2. Call out to control plane to validate that the generation which we use for our attachment is still the latest.
3. If step 2 passes, it is safe to delete the object. Why? The check-in with control plane
together with our visibility rules guarantees that any later generation
will use either the exact `index_part.json` that we uploaded in step 1, or a successor
of it; not an earlier one. In both cases, the `index_part.json` doesn't reference the
key we are deleting anymore, so, the key is invisible to any later attachment generation.
Hence it's safe to delete it.
Note that at step 2 we are only confirming that deletions of objects _no longer referenced
by the specific `index_part.json` written in step 1_ are safe. If we were attempting other deletions concurrently,
these would need their own generation validation step.
If step 2 fails, we may leak the object. This is safe, but has a cost: see [scrubbing](#cleaning-up-orphan-objects-scrubbing). We may avoid this entirely outside of node
failures, if we do proper flushing of deletions on clean shutdown and clean migration.
To avoid doing a huge number of control plane requests to perform generation validation,
validation of many tenants will be done in a single request, and deletions will be queued up
prior to validation: see [Persistent deletion queue](#persistent-deletion-queue) for more.
#### `remote_consistent_lsn` updates
Remote objects are not the only kind of deletion the pageserver does: it also indirectly deletes
WAL data, by feeding back remote_consistent_lsn to safekeepers, as a signal to the safekeepers that
they may drop data below this LSN.
For the same reasons that deletion of objects must be guarded by an attachment generation number
validation step, updates to `remote_consistent_lsn` are subject to the same rules, using
an ordering as follows:
1. upload the index_part that covers data up to LSN `L0` to S3
2. Call out to control plane to validate that the generation which we use for our attachment is still the latest.
3. advance the `remote_consistent_lsn` that we advertise to the safekeepers to `L0`
If step 2 fails, then the `remote_consistent_lsn` advertised
to safekeepers will not advance again until a pageserver
with the latest generation is ready to do so.
**Note:** at step 3 we are not advertising the _latest_ remote_consistent_lsn, we are
advertising the value in the index_part that we uploaded in step 1. This provides
a strong ordering guarantee.
Internally to the pageserver, each timeline will have two remote_consistent_lsn values: the one that
reflects its latest write to remote storage, and the one that reflects the most
recent validation of generation number. It is only the latter value that may
be advertised to the outside world (i.e. to the safekeeper).
The control plane remains unaware of `remote_consistent_lsn`: it only has to validate
the freshness of generation numbers, thereby granting the pageserver permission to
share the information with the safekeeper.
For convenience, in subsequent sections and RFCs we will use "deletion" to mean both deletion
of objects in S3, and updates to the `remote_consistent_lsn`, as updates to the remote consistent
LSN are de-facto deletions done via the safekeeper, and both kinds of deletion are subject to
the same generation validation requirement.
### Pageserver attach/startup changes
#### Attachment
Calls to `/v1/tenant/{tenant_id}/attach` are augmented with an additional
`generation` field in the body.
The pageserver does not persist this: a generation is only good for the lifetime
of a process.
#### Finding the remote indices for timelines
Because index files are now suffixed with generation numbers, the pageserver
cannot always GET the remote index in one request, because it can't always
know a-priori what the latest remote index is.
Typically, the most recent generation to write an index would be our own
generation minus 1. However, this might not be the case: the previous
node might have started and acquired a generation number, and then crashed
before writing out a remote index.
In the general case and as a fallback, the pageserver may list all the `index_part.json`
files for a timeline, sort them by generation, and pick the highest that is `<=`
its current generation for this attachment. The tenant should never load an index
with an attachment generation _newer_ than its own.
These two rules combined ensure that objects written by later generations are never visible to earlier generations.
Note that if a given attachment picks an index part from an earlier generation (say n-2), but crashes & restarts before it writes its own generation's index part, next time it tries to pick an index part there may be an index part from generation n-1.
It would pick the n-1 index part in that case, because it's sorted higher than the previous one from generation n-2.
So, above rules guarantee no determinism in selecting the index part.
are allowed to be attached with stale attachment generations during a multiply-attached
phase in a migration, and in this instance if the old location's pageserver restarts,
it should not try and load the newer generation's index.
To summarize, on starting a timeline, the pageserver will:
1. Issue a GET for index_part.json-<my generation - 1>
2. If 1 failed, issue a ListObjectsv2 request for index_part.json\* and
pick the newest.
One could optimize this further by using the control plane to record specifically
which generation most recently wrote an index_part.json, if necessary, to increase
the probability of finding the index_part.json in one GET. One could also improve
the chances by having pageservers proactively write out index_part.json after they
get a new generation ID.
#### Re-attachment on startup
On startup, the pageserver will call out to an new control plane `/re-attach`
API (see [Generation API](#generation-api)). This returns a list of
tenants that should be attached to the pageserver, and their generation numbers, which
the control plane will increment before returning.
The pageserver should still scan its local disk on startup, but should _delete_
any local content for tenants not indicated in the `/re-attach` response: their
absence is an implicit detach operation.
**Note** if a tenant is omitted from the re-attach response, its local disk content
will be deleted. This will change in subsequent work, when the control plane gains
the concept of a secondary/standby location: a node with local content may revert
to this status and retain some local content.
#### Cleaning up previous generations' remote indices
Deletion of old indices is not necessary for correctness, although it is necessary
to avoid the ListObjects fallback in the previous section becoming ever more expensive.
Once the new attachment has written out its index_part.json, it may asynchronously clean up historic index_part.json
objects that were found.
We may choose to implement this deletion either as an explicit step after we
write out index_part for the first time in a pageserver's lifetime, or for
simplicity just do it periodically as part of the background scrub (see [scrubbing](#cleaning-up-orphan-objects-scrubbing));
### Control Plane Changes
#### Store generations for attaching tenants
- The `Project` table must store the generation number for use when
attaching the tenant to a new pageserver.
- The `/v1/tenant/:tenant_id/attach` pageserver API will require the generation number,
which the control plane can supply by simply incrementing the `Project`'s
generation number each time the tenant is attached to a different server: the same database
transaction that changes the assigned pageserver should also change the generation number.
#### Generation API
This section describes an API that could be provided directly by the control plane,
or built as a separate microservice. In earlier parts of the RFC, when we
discuss the control plane providing generation numbers, we are referring to this API.
The API endpoints used by the pageserver to acquire and validate generation
numbers are quite simple, and only require access to some persistent and
linerizable storage (such as a database).
Building this into the control plane is proposed as a least-effort option to exploit existing infrastructure and implement generation number issuance in the same transaction that mandates it (i.e., the transaction that updates the `Project` assignment to another pageserver).
However, this is not mandatory: this "Generation Number Issuer" could
be built as a microservice. In practice, we will write such a miniature service
anyway, to enable E2E pageserver/compute testing without control plane.
The endpoints required by pageservers are:
##### `/re-attach`
- Request: `{node_id: <u32>}`
- Response:
- 200 `{tenants: [{id: <TenantId>, gen: <u32>}]}`
- 404: unknown node_id
- (Future: 429: flapping detected, perhaps nodes are fighting for the same node ID,
or perhaps this node was in a retry loop)
- (On unknown tenants, omit tenant from `tenants` array)
- Server behavior: query database for which tenants should be attached to this pageserver.
- for each tenant that should be attached, increment the attachment generation and
include the new generation in the response
- Client behavior:
- for all tenants in the response, activate with the new generation number
- for any local disk content _not_ referenced in the response, act as if we
had been asked to detach it (i.e. delete local files)
**Note** the `node_id` in this request will change in future if we move to ephemeral
node IDs, to be replaced with some correlation ID that helps the control plane realize
if a process is running with the same storage as a previous pageserver process (e.g.
we might use EC instance ID, or we might just write some UUID to the disk the first
time we use it)
##### `/validate`
- Request: `{'tenants': [{tenant: <tenant id>, attach_gen: <gen>}, ...]}'`
- Response:
- 200 `{'tenants': [{tenant: <tenant id>, status: <bool>}...]}`
- (On unknown tenants, omit tenant from `tenants` array)
- Purpose: enable the pageserver to discover for the given attachments whether they are still the latest.
- Server behavior: this is a read-only operation: simply compare the generations in the request with
the generations known to the server, and set status to `true` if they match.
- Client behavior: clients must not do deletions within a tenant's remote data until they have
received a response indicating the generation they hold for the attachment is current.
#### Use of `/load` and `/ignore` APIs
Because the pageserver will be changed to only attach tenants on startup
based on the control plane's response to a `/re-attach` request, the load/ignore
APIs no longer make sense in their current form.
The `/load` API becomes functionally equivalent to attach, and will be removed:
any location that used `/load` before should just attach instead.
The `/ignore` API is equivalent to detaching, but without deleting local files.
### Timeline/Branch creation & deletion
All of the previous arguments for safety have described operations within
a timeline, where we may describe a sequence that includes updates to
index_part.json, and where reads and writes are coming from a postgres
endpoint (writes via the safekeeper).
Creating or destroying timeline is a bit different, because writes
are coming from the control plane.
We must be safe against scenarios such as:
- A tenant is attached to pageserver B while pageserver A is
in the middle of servicing an RPC from the control plane to
create or delete a tenant.
- A pageserver A has been sent a timeline creation request
but becomes unresponsive. The tenant is attached to a
different pageserver B, and the timeline creation request
is sent there too.
#### Timeline Creation
If some very slow node tries to do a timeline creation _after_
a more recent generation node has already created the timeline
and written some data into it, that must not cause harm. This
is provided in timeline creations by the way all the objects
within the timeline's remote path include a generation suffix:
a slow node in an old generation that attempts to "create" a timeline
that already exists will just emit an index_part.json with
an old generation suffix.
Timeline IDs are never reused, so we don't have
to worry about the case of create/delete/create cycles. If they
were re-used during a disaster recovery "un-delete" of a timeline,
that special case can be handled by calling out to all available pageservers
to check that they return 404 for the timeline, and to flush their
deletion queues in case they had any deletions pending from the
timeline.
The above makes it safe for control plane to change the assignment of
tenant to pageserver in control plane while a timeline creation is ongoing.
The reason is that the creation request against the new assigned pageserver
uses a new generation number. However, care must be taken by control plane
to ensure that a "timeline creation successul" response from some pageserver
is checked for the pageserver's generation for that timeline's tenant still being the latest.
If it is not the latest, the response does not constitute a successful timeline creation.
It is acceptable to discard such responses, the scrubber will clean up the S3 state.
It is better to issue a timelien deletion request to the stale attachment.
#### Timeline Deletion
Tenant/timeline deletion operations are exempt from generation validation
on deletes, and therefore don't have to go through the same deletion
queue as GC/compaction layer deletions. This is because once a
delete is issued by the control plane, it is a promise that the
control plane will keep trying until the deletion is done, so even stale
pageservers are permitted to go ahead and delete the objects.
The implications of this for control plane are:
- During timeline/tenant deletion, the control plane must wait for the deletion to
be truly complete (status 404) and also handle the case where the pageserver
becomes unavailable, either by waiting for a replacement with the same node_id,
or by *re-attaching the tenant elsewhere.
- The control plane must persist its intent to delete
a timeline/tenant before issuing any RPCs, and then once it starts, it must
keep retrying until the tenant/timeline is gone. This is already handled
by using a persistent `Operation` record that is retried indefinitely.
Timeline deletion may result in a special kind of object leak, where
the latest generation attachment completes a deletion (including erasing
all objects in the timeline path), but some slow/partitioned node is
writing into the timeline path with a stale generation number. This would
not be caught by any per-timeline scrubbing (see [scrubbing](#cleaning-up-orphan-objects-scrubbing)), since scrubbing happens on the
attached pageserver, and once the timeline is deleted it isn't attached anywhere.
This scenario should be pretty rare, and the control plane can make it even
rarer by ensuring that if a tenant is in a multi-attached state (e.g. during
migration), we wait for that to complete before processing the deletion. Beyond
that, we may implement some other top-level scrub of timelines in
an external tool, to identify any tenant/timeline paths that are not found
in the control plane database.
#### Examples
- Deletion, node restarts partway through:
- By the time we returned 202, we have written a remote delete marker
- Any subsequent incarnation of the same node_id will see the remote
delete marker and continue to process the deletion
- If the original pageserver is lost permanently and no replacement
with the same node_id is available, then the control plane must recover
by re-attaching the tenant to a different node.
- Creation, node becomes unresponsive partway through.
- Control plane will see HTTP request timeout, keep re-issuing
request to whoever is the latest attachment point for the tenant
until it succeeds.
- Stale nodes may be trying to execute timeline creation: they will
write out index_part.json files with
stale attachment generation: these will be eventually cleaned up
by the same mechanism as other old indices.
### Unsafe case on badly behaved infrastructure
This section is only relevant if running on a different environment
than EC2 machines with ephemeral disks.
If we ever run pageservers on infrastructure that might transparently restart
a pageserver while leaving an old process running (e.g. a VM gets rescheduled
without the old one being fenced), then there is a risk of corruption, when
the control plane attaches the tenant, as follows:
- If the control plane sends an `/attach` request to node A, then node A dies
and is replaced, and the control plane's retries the request without
incrementing that attachment ID, then it could end up with two physical nodes
both using the same generation number.
- This is not an issue when using EC2 instances with ephemeral storage, as long
as the control plane never re-uses a node ID, but it would need re-examining
if running on different infrastructure.
- To robustly protect against this class of issue, we would either:
- add a "node generation" to distinguish between different processes holding the
same node_id.
- or, dispense with static node_id entirely and issue an ephemeral ID to each
pageserver process when it starts.
## Implementation Part 2: Optimizations
### Persistent deletion queue
Between writing our a new index_part.json that doesn't reference an object,
and executing the deletion, an object passes through a window where it is
only referenced in memory, and could be leaked if the pageserver is stopped
uncleanly. That introduces conflicting incentives: on the one hand, we would
like to delay and batch deletions to
1. minimize the cost of the mandatory validations calls to control plane, and
2. minimize cost for DeleteObjects requests.
On the other hand we would also like to minimize leakage by executing
deletions promptly.
To resolve this, we may make the deletion queue persistent
and then executing these in the background at a later time.
_Note: The deletion queue's reason for existence is optimization rather than correctness,
so there is a lot of flexibility in exactly how the it should work,
as long as it obeys the rule to validate generations before executing deletions,
so the following details are not essential to the overall RFC._
#### Scope
The deletion queue will be global per pageserver, not per-tenant. There
are several reasons for this choice:
- Use the queue as a central point to coalesce validation requests to the
control plane: this avoids individual `Timeline` objects ever touching
the control plane API, and avoids them having to know the rules about
validating deletions. This separation of concerns will avoid burdening
the already many-LoC `Timeline` type with even more responsibility.
- Decouple the deletion queue from Tenant attachment lifetime: we may
"hibernate" an inactive tenant by tearing down its `Tenant`/`Timeline`
objects in the pageserver, without having to wait for deletions to be done.
- Amortize the cost of I/O for the persistent queue, instead of having many
tiny queues.
- Coalesce deletions into a smaller number of larger DeleteObjects calls
Because of the cost of doing I/O for persistence, and the desire to coalesce
generation validation requests across tenants, and coalesce deletions into
larger DeleteObjects requests, there will be one deletion queue per pageserver
rather than one per tenant. This has the added benefit that when deactivating
a tenant, we do not have to drain their deletion queue: deletions can proceed
for a tenant whose main `Tenant` object has been torn down.
#### Flow of deletion
The flow of a deletion is becomes:
1. Need for deletion of an object (=> layer file) is identified.
2. Unlink the object from all the places that reference it (=> `index_part.json`).
3. Enqueue the deletion to a persistent queue.
Each entry is `tenant_id, attachment_generation, S3 key`.
4. Validate & execute in batches:
4.1 For a batch of entries, call into control plane.
4.2 For the subset of entries that passed validation, execute a `DeleteObjects` S3 DELETE request for their S3 keys.
As outlined in the Part 1 on correctness, it is critical that deletions are only
executed once the key is not referenced anywhere in S3.
This property is obviously upheld by the scheme above.
#### We Accept Object Leakage In Acceptable Circumcstances
If we crash in the flow above between (2) and (3), we lose track of unreferenced object.
Further, enqueuing a single to the persistent queue may not be durable immediately to amortize cost of flush to disk.
This is acceptable for now, it can be caught by [the scrubber](#cleaning-up-orphan-objects-scrubbing).
There are various measures we can take to improve this in the future.
1. Cap amount of time until enqueued entry becomes durable (timeout for flush-to-tisk)
2. Proactively flush:
- On graceful shutdown, as we anticipate that some or
all of our attachments may be re-assigned while we are offline.
- On tenant detach.
3. For each entry, keep track of whether it has passed (2).
Only admit entries to (4) one they have passed (2).
This requires re-writing / two queue entries (intent, commit) per deletion.
The important take-away with any of the above is that it's not
disastrous to leak objects in exceptional circumstances.
#### Operations that may skip the queue
Deletions of an entire timeline are [exempt](#Timeline-Deletion) from generation number validation. Once the
control plane sends the deletion request, there is no requirement to retain the readability
of any data within the timeline, and all objects within the timeline path may be deleted
at any time from the control plane's deletion request onwards.
Since deletions of smaller timelines won't have enough objects to compose a full sized
DeleteObjects request, it is still useful to send these through the last part of the
deletion pipeline to coalesce with other executing deletions: to enable this, the
deletion queue should expose two input channels: one for deletions that must be
processed in a generation-aware way, and a fast path for timeline deletions, where
that fast path may skip validation and the persistent queue.
### Cleaning up orphan objects (scrubbing)
An orphan object is any object which is no longer referenced by a running node or by metadata.
Examples of how orphan objects arise:
- A node PUTs a layer object, then crashes before it writes the
index_part.json that references that layer.
- A stale node carries on running for some time, and writes out an unbounded number of
objects while it believes itself to be the rightful writer for a tenant.
- A pageserver crashes between un-linking an object from the index, and persisting
the object to its deletion queue.
Orphan objects are functionally harmless, but have a small cost due to S3 capacity consumed. We
may clean them up at some time in the future, but doing a ListObjectsv2 operation and cross
referencing with the latest metadata to identify objects which are not referenced.
Scrubbing will be done only by an attached pageserver (not some third party process), and deletions requested during scrub will go through the same
validation as all other deletions: the attachment generation must be
fresh. This avoids the possibility of a stale pageserver incorrectly
thinking than an object written by a newer generation is stale, and deleting
it.
It is not strictly necessary that scrubbing be done by an attached
pageserver: it could also be done externally. However, an external
scrubber would still require the same validation procedure that
a pageserver's deletion queue performs, before actually erasing
objects.
## Operational impact
### Availability
Coordination of generation numbers via the control plane introduce a dependency for certain
operations:
1. Starting new pageservers (or activating pageservers after a restart)
2. Executing enqueued deletions
3. Advertising updated `remote_consistent_lsn` to enable WAL trimming
Item 1. would mean that some in-place restarts that previously would have resumed service even if the control plane were
unavailable, will now not resume service to users until the control plane is available. We could
avoid this by having a timeout on communication with the control plane, and after some timeout,
resume service with the previous generation numbers (assuming this was persisted to disk). However,
this is unlikely to be needed as the control plane is already an essential & highly available component. Also, having a node re-use an old generation number would complicate
reasoning about the system, as it would break the invariant that a generation number uniquely identifies
a tenant's attachment to a given pageserver _process_: it would merely identify the tenant's attachment
to the pageserver _machine_ or its _on-disk-state_.
Item 2. is a non-issue operationally: it's harmless to delay deletions, the only impact of objects pending deletion is
the S3 capacity cost.
Item 3. could be an issue if safekeepers are low on disk space and the control plane is unavailable for a long time. If this became an issue,
we could adjust the safekeeper to delete segments from local disk sooner, as soon as they're uploaded to S3, rather than waiting for
remote_consistent_lsn to advance.
For a managed service, the general approach should be to make sure we are monitoring & respond fast enough
that control plane outages are bounded in time.
There is also the fact that control plane runs in a single region.
The latency for distant regions is not a big concern for us because all request types added by this RFC are either infrequent or not in the way of the data path.
However, we lose region isolation for the operations listed above.
The ongoing work to split console and control will give us per-region control plane, and all operations in this RFC can be handled by these per-region control planes.
With that in mind, we accept the trade-offs outlined in this paragraph.
We will also implement an "escape hatch" config generation numbers, where in a major disaster outage,
we may manually run pageservers with a hand-selected generation number, so that we can bring them online
independently of a control plane.
### Rollout
Although there is coupling between components, we may deploy most of the new data plane components
independently of the control plane: initially they can just use a static generation number.
#### Phase 1
The pageserver is deployed with some special config to:
- Always act like everything is generation 1 and do not wait for a control plane issued generation on attach
- Skip the places in deletion and remote_consistent_lsn updates where we would call into control plane
#### Phase 2
The control plane changes are deployed: control plane will now track and increment generation numbers.
#### Phase 3
The pageserver is deployed with its control-plane-dependent changes enabled: it will now require
the control plane to service re-attach requests on startup, and handle generation
validation requests.
### On-disk backward compatibility
Backward compatibility with existing data is straightforward:
- When reading the index, we may assume that any layer whose metadata doesn't include
generations will have a path without generation suffix.
- When locating the index file on attachment, we may use the "fallback" listing path
and if there is only an index without generation suffix, that is the one we load.
It is not necessary to re-write existing layers: even new index files will be able
to represent generation-less layers.
### On-disk forward compatibility
We will do a two phase rollout, probably over multiple releases because we will naturally
have some of the read-side code ready before the overall functionality is ready:
1. Deploy pageservers which understand the new index format and generation suffixes
in keys, but do not write objects with generation numbers in the keys.
2. Deploy pageservers that write objects with generation numbers in the keys.
Old pageservers will be oblivious to generation numbers. That means that they can't
read objects with generation numbers in the name. This is why we must
first step must deploy the ability to read, before the second step
starts writing them.
# Frequently Asked Questions
## Why a generation _suffix_ rather than _prefix_?
The choice is motivated by object listing, since one can list by prefix but not
suffix.
In [finding remote indices](#finding-the-remote-indices-for-timelines), we rely
on being able to do a prefix listing for `<tenant>/<timeline>/index_part.json*`.
That relies on the prefix listing.
The converse case of using a generation prefix and listing by generation is
not needed: one could imagine listing by generation while scrubbing (so that
a particular generation's layers could be scrubbed), but this is not part
of normal operations, and the [scrubber](#cleaning-up-orphan-objects-scrubbing) probably won't work that way anyway.
## Wouldn't it be simpler to have a separate deletion queue per timeline?
Functionally speaking, we could. That's how RemoteTimelineClient currently works,
but this approach does not map well to a long-lived persistent queue with
generation validation.
Anything we do per-timeline generates tiny random I/O, on a pageserver with
tens of thousands of timelines operating: to be ready for high scale, we should:
- A) Amortize costs where we can (e.g. a shared deletion queue)
- B) Expect to put tenants into a quiescent state while they're not
busy: i.e. we shouldn't keep a tenant alive to service its deletion queue.
This was discussed in the [scope](#scope) part of the deletion queue section.
# Appendix A: Examples of use in high availability/failover
The generation numbers proposed in this RFC are adaptable to a variety of different
failover scenarios and models. The sections below sketch how they would work in practice.
### In-place restart of a pageserver
"In-place" here means that the restart is done before any other element in the system
has taken action in response to the node being down.
- After restart, the node issues a re-attach request to the control plane, and
receives new generation numbers for all its attached tenants.
- Tenants may be activated with the generation number in the re-attach response.
- If any of its attachments were in fact stale (i.e. had be reassigned to another
node while this node was offline), then
- the re-attach response will inform the tenant about this by not including
the tenant of this by _not_ incrementing the generation for that attachment.
- This will implicitly block deletions in the tenant, but as an optimization
the pageserver should also proactively stop doing S3 uploads when it notices this stale-generation state.
- The control plane is expected to eventually detach this tenant from the
pageserver.
If the control plane does not include a tenant in the re-attach response,
but there is still local state for the tenant in the filesystem, the pageserver
deletes the local state in response and does not load/active the tenant.
See the [earlier section on pageserver startup](#pageserver-attachstartup-changes) for details.
Control plane can use this mechanism to clean up a pageserver that has been
down for so long that all its tenants were migrated away before it came back
up again and asked for re-attach.
### Failure of a pageserver
In this context, read "failure" as the most ambiguous possible case, where
a pageserver is unavailable to clients and control plane, but may still be executing and talking
to S3.
#### Case A: re-attachment to other nodes
1. Let's say node 0 becomes unresponsive in a cluster of three nodes 0, 1, 2.
2. Some external mechanism notices that the node is unavailable and initiates
movement of all tenants attached to that node to a different node according
to some distribution rule.
In this example, it would mean incrementing the generation
of all tenants that were attached to node 0, as each tenant's assigned pageserver changes.
3. A tenant which is now attached to node 1 will _also_ still be attached to node
0, from the perspective of node 0. Node 0 will still be using its old generation,
node 1 will be using a newer generation.
4. S3 writes will continue from nodes 0 and 1: there will be an index_part.json-00000001
\_and\* an index_part.json-00000002. Objects written under the old suffix
after the new attachment was created do not matter from the rest of the system's
perspective: the endpoints are reading from the new attachment location. Objects
written by node 0 are just garbage that can be cleaned up at leisure. Node 0 will
not do any deletions because it can't synchronize with control plane, or if it could,
its deletion queue processing would get errors for the validation requests.
#### Case B: direct node replacement with same node_id and drive
This is the scenario we would experience if running pageservers in some dynamic
VM/container environment that would auto-replace a given node_id when it became
unresponsive, with the node's storage supplied by some network block device
that is attached to the replacement VM/container.
1. Let's say node 0 fails, and there may be some other peers but they aren't relevant.
2. Some external mechanism notices that the node is unavailable, and creates
a "new node 0" (Node 0b) which is a physically separate server. The original node 0
(Node 0a) may still be running, because we do not assume the environment fences nodes.
3. On startup, node 0b re-attaches and gets higher generation numbers for
all tenants.
4. S3 writes continue from nodes 0a and 0b, but the writes do not collide due to different
generation in the suffix, and the writes from node 0a are not visible to the rest
of the system because endpoints are reading only from node 0b.
# Appendix B: interoperability with other features
## Sharded Keyspace
The design in this RFC maps neatly to a sharded keyspace design where subsets of the key space
for a tenant are assigned to different pageservers:
- the "unit of work" for attachments becomes something like a TenantShard rather than a Tenant
- TenantShards get generation numbers just as Tenants do.
- Write workload (ingest, compaction) for a tenant is spread out across pageservers via
TenantShards, but each TenantShard still has exactly one valid writer at a time.
## Read replicas
_This section is about a passive reader of S3 pageserver state, not a postgres
read replica_
For historical reads to LSNs below the remote persistent LSN, any node may act as a reader at any
time: remote data is logically immutable data, and the use of deferred deletion in this RFC helps
mitigate the fact that remote data is not _physically_ immutable (i.e. the actual data for a given
page moves around as compaction happens).
A read replica needs to be aware of generations in remote data in order to read the latest
metadata (find the index_part.json with the latest suffix). It may either query this
from the control plane, or find it with ListObjectsv2 request
## Seamless migration
To make tenant migration totally seamless, we will probably want to intentionally double-attach
a tenant briefly, serving reads from the old node while waiting for the new node to be ready.
This RFC enables that double-attachment: two nodes may be attached at the same time, with the migration destination
having a higher generation number. The old node will be able to ingest and serve reads, but not
do any deletes. The new node's attachment must also avoid deleting layers that the old node may
still use. A new piece of state
will be needed for this in the control plane's definition of an attachment.
## Warm secondary locations
To enable faster tenant movement after a pageserver is lost, we will probably want to spend some
disk capacity on keeping standby locations populated with local disk data.
There's no conflict between this RFC and that: implementing warm secondary locations on a per-tenant basis
would be a separate change to the control plane to store standby location(s) for a tenant. Because
the standbys do not write to S3, they do not need to be assigned generation numbers. When a tenant is
re-attached to a standby location, that would increment the tenant attachment generation and this
would work the same as any other attachment change, but with a warm cache.
## Ephemeral node IDs
This RFC intentionally avoids changing anything fundamental about how pageservers are identified
and registered with the control plane, to avoid coupling the implementation of pageserver split
brain protection with more fundamental changes in the management of the pageservers.
Moving to ephemeral node IDs would provide an extra layer of
resilience in the system, as it would prevent the control plane
accidentally attaching to two physical nodes with the same
generation, if somehow there were two physical nodes with
the same node IDs (currently we rely on EC2 guarantees to
eliminate this scenario). With ephemeral node IDs, there would be
no possibility of that happening, no matter the behavior of
underlying infrastructure.
Nothing fundamental in the pageserver's handling of generations needs to change to handle ephemeral node IDs, since we hardly use the
`node_id` anywhere. The `/re-attach` API would be extended
to enable the pageserver to obtain its ephemeral ID, and provide
some correlation identifier (e.g. EC instance ID), to help the
control plane re-attach tenants to the same physical server that
previously had them attached.

View File

@@ -1,316 +0,0 @@
This is a copy from the [original Notion page](https://www.notion.so/neondatabase/Proposal-Pageserver-MVCC-S3-Storage-8a424c0c7ec5459e89d3e3f00e87657c?pvs=4), taken on 2023-08-16.
This is for archival mostly.
The RFC that we're likely to go with is https://github.com/neondatabase/neon/pull/4919.
---
# Proposal: Pageserver MVCC S3 Storage
tl;dr: this proposal enables Control Plane to attach a tenant to a new pageserver without being 100% certain that it has been detached from the old pageserver. This enables us to automate failover if a pageserver dies (no human in the loop).
# Problem Statement
The current Neon architecture requires the Control Plane to guarantee that a tenant is only attached to one pageserver at a time. If a tenant is attached to multiple pageservers simultaneously, the pageservers will overwrite each others changes in S3 for that tenant, resulting in data loss for that tenant.
The above imposes limitations on tenant relocation and future designs for high availability. For instance, Control Plane cannot relocate a tenant to another pageserver before it is 100% certain that the tenant is detached from the source pageserver. If the source pageserver is unresponsive, the tenant detach procedure cannot proceed, and Control Plane has no choice but to wait for either the source to become responsive again, or rely on a node failure detection mechanism to detect that the source pageserver is dead, and give permission to skip the detachment step. Either way, the tenant is unavailable for an extended period, and we have no means to improve it in the current architecture.
Note that there is no 100% correct node failure detection mechanism, and even techniques to accelerate failure detection, such as ********************************shoot-the-other-node-in-the-head,******************************** have their limits. So, we currently rely on humans as node failure detectors: they get alerted via PagerDuty, assess the situation under high stress, and make the decision. If they make the wrong call, or the apparent dead pageserver somehow resurrects later, well have data loss.
Also, by relying on humans, were [incurring needless unscalable toil](https://sre.google/sre-book/eliminating-toil/): as Neon grows, pageserver failures will become more and more frequent because our fleet grows. Each instance will need quick response time to minimize downtime for the affected tenants, which implies higher toil, higher resulting attrition, and/or higher personnel cost.
Lastly, there are foreseeable needs by operation and product such as zero-downtime relocation and automatic failover/HA. For such features, the ability to have a tenant purposefully or accidentally attached to more than one pageserver will greatly reduce risk of data loss, and improve availability.
# High-Level Idea
The core idea is to evolve the per-Tenant S3 state to an MVCC-like scheme, allowing multiple pageservers to operate on the same tenant S3 state without interference. To make changes to S3, pageservers acquire long-running transactions from Control Plane. After opening a transaction, Pageservers make PUTs directly against S3, but they keys include the transaction ID, so overwrites never happen. Periodically, pageservers talk back to Control Plane to commit their transaction. This is where Control Plane enforces strict linearizability, favoring availability over work-conservation: commit is only granted if no transaction started after the one thats requesting commit. Garbage collection is done through deadlists, and its simplified tremendously by above commit grant/reject policy.
Minimal changes are required for safekeepers to allow WAL for a single timeline be consumed by more than one pageserver without premature truncation.
**Above scheme makes it safe to attach tenants without a 100% correct node failure detection mechanism. Further, it makes it safe to interleave tenant-attachment to pageservers, unlocking new capabilities for (internal) product features:**
- **Fast, Zero-Toil Failover on Network Partitions or Instance Failure**: if a pageserver is not reachable (network partition, hardware failure, overload) we want to spread its attached tenants to new pageservers to restore availability, within the range of *seconds*. We cannot afford gracious timeouts to maximize the probability that the unreachable pageserver has ceased writing to S3. This proposal enables us to attach the tenants to the replacement pageservers, and redirect their computes, without having to wait for confirmation that the unreachable pageserver has ceased writing to S3.
- **************************************Zero-Downtime Relocation:************************************** we want to be able to relocate tenants to different pageservers with minimized availability or a latency impact. This proposal enables us to attach the relocating Tenant to the destination Pageserver before detaching it from the source Pageserver. This can help minimize downtime because we can wait for the destination to catch up on WAL processing before redirecting Computes.
# Design
The core idea is to evolve the per-Tenant S3 state to a per-tenant MVCC-like scheme.
To make S3 changes for a given tenant, Pageserver requests a transaction ID from control plane for that tenant. Without a transaction ID, Pageserver does not write to S3.
Once Pageserver received a transaction ID it is allowed to produce new objects and overwrite objects created in this transaction. Pageserver is not allowed to delete any objects; instead, it marks the object as deleted by appending the key to the transactions deadlist for later deletion. Commits of transactions are serialized through Control Plane: when Pageserver wants to commit a transaction, it sends an RPC to Control Plane. Control Plane responds with a commit grant or commit reject message. Commit grant means that the transactions changes are now visible to subsequent transactions. Commit reject means that the transactions changes are not and never will be visible to another Pageserver instance, and the rejected Pageserver is to cease further activity on that tenant.
## ****************************************************Commit grant/reject policy****************************************************
For the purposes of Pageserver, we want **linearizability** of a tenants S3 state. Since our transactions are scoped per tenant, it is sufficient for linearizability to grant commit if and only if no other transaction has been started since the commit-requesting transaction started.
For example, consider the case of a single tenant, attached to Pageserver A. Pageserver A has an open transaction but becomes unresponsive. Control Plane decides to relocate the tenant to another Pageserver B. It need *not* wait for A to be 100%-certainly down before B can start uploading to S3 for that tenant. Instead, B can start a new transaction right away, make progress, and get commit grants; What about A? The transaction is RejectPending in Control Plane until A eventually becomes responsive again, tries to commit, gets a rejection, acknowledges it, and thus its transaction becomes RejectAcknowledge. If A is definitively dead, operator can also force-transition from state RejectPending to RejectAcknowledged. But critically, Control Plane doesnt have for As transaction to become RejectAcknowledge before attaching the tenant to B.
```mermaid
sequenceDiagram
participant CP
participant A
participant S3
participant B
CP -->> A: attach tenant
activate A
A -->> CP: start txn
CP -->> A: txn=23, last_committed_txn=22
Note over CP,A: network partition
CP --x A: heartbeat
CP --x A: heartbeat
Note over CP: relocate tenant to avoid downtime
CP -->> B: attach tenant
activate B
B -->> CP: start txn
Note over CP: mark A's txn 23 as RejectPending
CP -->> B: txn=24, last-committed txn is 22
B -->> S3: PUT X.layer.24<br>PUT index_part.json.24 referencing X.layer.24
B -->> CP: request commit
CP -->> B: granted
B -->> CP: start txn
CP -->> B: txn=25, last_committed_txn=22
A -->> S3: PUT Y.layer.23 <br> PUT index_part.json.23 referencing Y.layer.23
A --x CP: request commit
A --x CP: request commit
Note over CP,A: partition is over
A -->> CP: request commit
Note over CP: most recently started txn is 25, not 23, reject
CP -->> A: reject
A -->> CP: acknowledge reject
Note over CP: mark A's txn 23 as RejectAcknowledged
deactivate A
B -->> S3: PUT 000-FFF_X-Y.layer.**************25**************<br>...
deactivate B
```
If a Pageserver gets a rejection to a commit request, it acknowledges rejection and cedes further S3 uploads for the tenant, until it receives a `/detach` request for the tenant (control plane has most likely attached the tenant to another pageserver in the meantime).
In practice, Control Plane will probably extend the commit grant/reject schema above, taking into account the pageserver to which it last attached the tenant. In the above example, Control Plane could remember that the pageserver that is supposed to host the tenant is pageserver B, and reject start-txn and commit requests from pageserver A. It would also use such requests from A as a signal that A is reachable again, and retry the `/detach` .
<aside>
💡 A commit failure causes the tenant to become effectively `Broken`. Pageserver should persist this locally so it doesnt bother ControlPlane for a new txn when Pageserver is restarted.
</aside>
## ********************Visibility********************
We mentioned earlier that once a transaction commits, its changes are visible to subsequent transactions. But how does a given transaction know where to look for the data? There is no longer a single `index_part.json` per timeline, or a single `timelines/:timeline_id` prefix to look for; theyre all multi-versioned, suffixed by the txn number.
The solution is: at transaction start, Pageserver receives the last-committed transaction ID from Control Plane (`last_committed_txn` in the diagram). last_commited_txn is the upper bound for what is visible for the current transaction. Control Plane keeps track of each open transactions last_committed_txn for purposes of garbage collection (see later paragraph).
Equipped with last_committed_txn, Pageserver then discovers
- the current index part of a timeline at `tenants/:tenant_id/timelines/:timeline_id/index_part.json.$last_committed_txn`. The `index_part.json.$last_committed_txn` has the exact same contents as the current architectures index_part.json, i.e. full list of layers.
- the list of existent timelines as part of the `attach` RPC from CP;
There is no other S3 state per tenant, so, thats all the visibility required.
An alternative to receiving the list of existent timelines from CP is to introduce a proper **********SetOfTimelines********** object in S3, and multi-version it just like above. For example, we could have a `tenants/:tenant_id/timelines.json.$txn` file that references `index_part.json.$last_committed_txn` . It can be added later if more separation between CP and PS is desired.
So, the only MVCCed object types in this proposal are LayerFile and IndexPart (=individual timeline), but not the SetOfTimelines in a given tenant. Is this a problem? For example, the Pageservers garbage collection code needs to know the full set of timelines of a tenant. Otherwise itll make incorrect decisions. What if Pageserver A knows about timelines {R,S}, but another Pageserver B created an additional branch T, so, its set of timelines is {R,S,T}. Both pageservers will run GC code, and so, PS A may decide to delete a layer thats still needed for branch T. Not a problem with this propsoal, because the effect of GC (i.e., layer deletion) is properly MVCCed.
## Longevity Of Transactions & Availability
Pageserver depends on Control Plane to start a new transaction. If ControlPlane is down, no new transactions can be started.
Pageservers commit transactions based on a maximum amount of uncommitted changes that have accumulated in S3. A lower maximum increases dependence and load on ControlPlane which decreases availability. A higher maximum risks losing more work in the event of failover; the work will have to be re-done in a new transaction on the new node.
Pageservers are persist the open txn id in local storage, so that they can resume the transaction after restart, without dependence on Control Plane.
## **Operations**
********PUTs:********
- **layer files**
- current architecture: layer files are supposed to be write-once, but actually, there are edge-cases where we PUT the same layer file name twice; namely if we PUT the file to S3 but crash before uploading the index part that references it; then detach + attach, and re-run compaction, which is non-deterministic.
- this proposal: with transactions, we can now upload layers and index_part.json concurrently, just need to make sure layer file upload is done before we request txn commit.
- **index part** upload: `index_part.json.$txn` may be created and subsequently overwritten multiple times in a transaction; it is an availability/work-loss trade-off how often to request a commit from CP.
**************DELETEs**************: for deletion, we maintain a deadlist per transaction. It is located at `tenants/:tenant_id/deadlist/deadlist.json.$txn`. It is PUT once before the pageserver requests requests commit, and not changed after sending request to commit. An object created in the current txn need not (but can) be on the deadlist — it can be DELETEd immediately because its not visible to other transactions. An example use case would be an L0 layer that gets compacted within one transaction; or, if we ever start MVCCing the set of timelines of a tenant, a short-lived branch that is created & destroyed within one transaction.
<aside>
**Deadlist Invariant:** if a an object is on a deadlist of transaction T, it is not referenced from anywhere else in the full state visible to T or any later started transaction > T.
</aside>
### Rationale For Deadlist.json
Given that this proposal only MVCCs layers and indexparts, one may ask why the deadlist isnt part of indexpart. The reason is to not lose generality: the deadlist is just a list of keys; it is not necessary to understand the data format of the versioned object to process the deadlist. This is important for garbage collection / vacuuming, which well come to in the next section.
## Garbage Collection / Vacuuming
After a transaction has reached reject-acknowledged state, Control Plane initiates a garbage collection procedure for the aborted transaction.
Control Plane is in the unique position about transaction states. Here is a sketch of the exact transaction states and what Control Plane keeps track of.
```
struct Tenant {
...
txns: HashMap<TxnId, Transaction>,
// the most recently started txn's id; only most recently sarted can win
next_winner_txn: Option<TxnId>,
}
struct Transaction {
id: TxnId, // immutable
last_committed_txn: TxnId, // immutable; the most recent txn in state `Committed`
// when self was started
pageserver_id: PageserverId,
state: enum {
Open,
Committed,
RejectPending,
RejectAcknowledged, // invariant: we know all S3 activity has ceded
GarbageCollected,
}
}
```
Object creations & deletions by a rejected transaction have never been visible to other transactions. That is true for both RejectPending and RejectAcknowledged states. The difference is that, in RejectPending, the pageserver may still be uploading to S3, whereas in RejectAcknowledged, Control Plane can be certain that all S3 activity in the name of that transaction has ceded. So, once a transaction reaches state RejectAcknowledged state, it is safe to DELETE all objects created by that transaction, and discard the transactions deadlists.
A transaction T in state Committed has subsequent transactions that may or may not reference the objects it created. None of the subsequent transaction can reference the objects on Ts deadlist, though, as per the Deadlist Invariant (see previous section).
So, for garbage collection, we need to assess transactions in state Committed and RejectAcknowledged:
- Commited: delete objects on the deadlist.
- We dont need a LIST request here, the deadlist is sufficient. So, its really cheap.
- This is **not true MVCC garbage collection**; by deleting the objects on Committed transaction T s deadlist, we might delete data referenced by other transactions that were concurrent with T, i.e., they started while T was still open. However, the fact that T is committed means that the other transactions are RejectPending or RejectAcknowledged, so, they dont matter. Pageservers executing these doomed RejectPending transactions must handle 404 for GETs gracefully, e.g., by trying to commit txn so they observe the rejection theyre destined to get anyways. 404s for RejectAcknowledged is handled below.
- RejectAcknowledged: delete all objects created in that txn, and discard deadlists.
- 404s / object-already-deleted type messages must be expected because of Committed garbage collection (see above)
- How to get this list of objects created in a txn? Open but solvable design question; Ideas:
- **Brute force**: within tenant prefix, search for all keys ending in `.$txn` and delete them.
- **WAL for PUTs**: before a txn PUTs an object, it logs to S3, or some other equivalently durable storage, that its going to do it. If we log to S3, this means we have to do an additional WAL PUT per “readl” PUT.
- ******************************LIST with reorged S3 layout (preferred one right now):****************************** layout S3 key space such that `$txn` comes first, i.e., `tenants/:tenant_id/$txn/timelines/:timeline_id/*.json.$txn` . That way, when we need to GC a RejectAcknowledged txn, we just LIST the entire `tenants/:tenant_id/$txn` prefix and delete it. The cost of GC for RejectAcknowledged transactions is thus proportional to the number of objects created in that transaction.
## Branches
This proposal only MVCCs layer files and and index_part.json, but leaves the tenant object not-MVCCed. We argued earlier that its fine to ignore this for now, because
1. Control Plane can act as source-of-truth for the set of timelines, and
2. The only operation that makes decision based on “set of timelines” is GC, which in turn only does layer deletions, and layer deletions ***are*** properly MVCCed.
Now that weve introduced garbage collection, lets elaborate a little more on (2). Recall our example from earlier: Pageserver A knows about timelines {R,S}, but another Pageserver B created an additional branch T, so, its set of timelines is {R,S,T}. Both pageservers will run GC code, and so, PS A may decide to delete a layer thats still needed for branch T.
How does the MVCCing of layer files protect us here? If A decides to delete that layer, its just on As transactions deadlist, but still present in S3 and usable by B. If A commits first, B wont be able to commit and the layers in timeline T will be vacuumed. If B commits first, As deadlist is discarded and the layer continues to exist.
## Safekeeper Changes
We need to teach the safekeepers that there can be multiple pageservers requesting WAL for the same timeline, in order to prevent premature WAL truncation.
In the current architecture, the Safekeeper service currently assumes only one Pageserver and is allowed to prune WAL older than that Pageservers `remote_consistent_lsn`. Safekeeper currently learns the `remote_consistent_lsn` through the walreceiver protocol.
So, if we have a tenant attached to two pageservers at the same time, they will both try to stream WAL and the Safekeeper will get confused about which connections `remote_consistent_lsn` to use as a basis for WAL pruning.
What do we need to change to make it work? We need to make sure that the Safekeepers only prune WAL up to the `remote_consistent_lsn` of the last-committed transaction.
The straight-forward way to get it is to re-design WAL pruning as follows:
1. Pageserver reports remote_consistent_lsn as part of transaction commit to Control Plane.
2. Control Plane makes sure transaction state update is persisted.
3. Control Plane (asynchronous to transaction commit) reconciles with Safekeepers to ensure WAL pruning happens.
The above requires non-trivial changes, but, in the light of other planned projects such as restore-tenant-from-safekeeper-wal-backups, I think Control Plane will need to get involved in WAL pruning anyways.
# How This Proposal Unlocks Future Features
Let us revisit the example from the introduction where we were thinking about handling network partitions. Network partitions need to be solved first, because theyre unavoidable in distributed systems. We did that. Now lets see how we can solve actual product problems:
## **Fast, Zero-Toil Failover on Network Partitions or Instance Failure**
The “Problem Statement” section outlined the current architectures problems with regards to network partitions or instance failure: it requires a 100% correct node-dead detector to make decisions, which doesnt exist in reality. We rely instead on human toil: an oncall engineer has to inspect the situation and make a decision, which may be incorrect and in any case take time in the order of minutes, which means equivalent downtime for users.
With this proposal, automatic failover for pageservers is trivial:
If a pageserver is unresponsive from Control Planes / Computes perspective, Control Plane does the following:
- attach all tenants of the unresponsive pageserver to new pageservers
- switch over these tenants computes immediately;
At this point, availability is restored and user pain relieved.
Whats left is to somehow close the doomed transaction of the unresponsive pageserver, so that it beomes RejectAcknowledged, and GC can make progress. Since S3 is cheap, we can afford to wait a really long time here, especially if we put a soft bound on the amount of data a transaction may produce before it must commit. Procedure:
1. Ensure the unresponsive pageserver is taken out of rotation for new attachments. That probably should happen as part of the routine above.
2. Make a human operator investigate decide what to do (next morning, NO ONCALL ALERT):
1. Inspect the instance, investigate logs, understand root cause.
2. Try to re-establish connectivity between pageserver and Control Plane so that pageserver can retry commits, get rejected, ack rejection ⇒ enable GC.
3. Use below procedure to decomission pageserver.
### Decomissioning A Pageserver (Dead or Alive-but-Unrespsonive)
The solution, enabled by this proposal:
1. Ensure that pageservers S3 credentials are revoked so that it cannot make new uploads, which wouldnt be tracked anywhere.
2. Let enough time pass for the S3 credential revocation to propagate. Amazon doesnt give a guarantee here. As stated earlier, we can easily afford to wait here.
3. Mark all Open and RejectPending transactions of that pageserver as RejectAcknowledge.
Revocation of the S3 credentials is required so that, once we transition all the transactions of that pageserver to RejectAcknowledge, once garbage-collection pass is guaranteed to delete all objects that will ever exist for that pageserver. That way, we need not check *****GarbageCollected***** transactions every again.
## Workflow: Zero-Downtime Relocation
With zero-downtime relocation, the goal is to have the target pageserver warmed up, i.e., at the same `last_record_lsn` as the source pageserver, before switching over Computes from source to target pageserver.
With this proposal, it works like so:
1. Grant source pageserver its last open transaction. This one is doomed to be rejected later, unless the relocation fails.
2. Grant target pageserver its first open transaction.
3. Have target pageserver catch up on WAL, streaming from last-committed-txns remote_consistent_lsn onwards.
4. Once target pageserver reports `last_record_lsn` close enough to source pageserver, target pageserver requests commit.
5. Drain compute traffic from source to target pageserver. (Source can still answer requests until it tries to commit and gets reject, so, this will be quite smooth).
Note that as soon as we complete step (4), the source pageservers transaction is doomed to be rejected later. Conversely, if the target cant catch up fast enough, the source will make a transaction commit earlier. This will generally happen if there is a lot of write traffic coming in. The design space to make thing smooth here is large, but well explored in other areas of computing, e.g., VM live migration. We have all the important policy levers at hand, e.g.,
- delaying source commits if we see target making progress
- slowing down source consumption (need some signalling mechanism for it)
- slowing down compute wal generation
-
It doesnt really matter, whats important is that two pageservers can overlap.
# Additional Trade-Offs / Remarks Brought Up During Peer Review
This proposal was read by and discussed @Stas and @Dmitry Rodionov prior to publishing it with the broader team. (This does not mean they endorse this proposal!).
Issues that we discussed:
1. **Frequency of transactions:** If even idle tenants commit every 10min or so, thats quite a lot of load on Control Plane. Can we minimize it by Equating Transaction Commit Period to Attachment Period? I.e. start txn on attach, commit on detach?
1. Would be nice, but, if a tenant is attached for 1 month, then PS dies, we lose 1 month of work.
2. ⇒ my solution to this problem: Adjusted this proposal to make transaction commit frequency proportional to amount of uncommitted data.
1. Its ok to spend resources on active users, they pay us money to do it!
2. The amount of work per transaction is minimal.
1. In current Control Plane, its a small database transaction that is super unlikely to conflict with other transactions.
2. I have very little concerns about scalability of the commit workload on CP side because it's trivially horizontally scalable by sharding by tenant.
3. There's no super stringent availability requirement on control plane; if a txn can't commit because it can't reach the CP, PS can continue & retry in the background, speculating that it's CP downtime and not PS-partitioned-off scenario.
4. Without stringent availability requirement, there's flexibility for future changes to CP-side-implementation.
2. ************************************************Does this proposal address mirroring / no-performance-degradation failover ?************************************************
1. No it doesnt. It only provides the building block for attaching a tenant to a new pageserver without having to worry that the tenant is detached on the old pageserver.
2. A simple scheme to build no-performance-degradation failover on top of this proposal is to have an asynchronous read-only replica of a tenant on another pageserver in the same region.
3. Another more ambitious scheme to get no-performance-degradation would be [One-Pager: Layer File Spreading (Christian)](https://www.notion.so/One-Pager-Layer-File-Spreading-Christian-eb6b64182a214e11b3fceceee688d843?pvs=21); this proposal would be used in layer file spreading for risk-free automation of TenantLeader failover, which hasnt been addressed Ithere.
4. In any way, failover would restart from an older S3 state, and need to re-ingest WAL before being able to server recently written pages.
1. Is that a show-stopper? I think not.
2. Is it suboptimal? Absolutely: if a pageserver instance fails, all its tenants will be distributed among the remaining pageservers (OK), and all these tenants will ask the safekeepers for WAL at the same time (BAD). So, pageserver instance failure will cause a load spike in safekeepers.
1. Personally I think thats an OK trade-off to make.
2. There are countless options to avoid / mitigate the load spike. E.g., pro-actively streaming WAL to the standby read-only replica.
3. ********************************************Does this proposal allow multiple writers for a tenant?********************************************
1. In abstract terms, this proposal provides a linearized history for a given S3 prefix.
2. In concrete terms, this proposal provides a linearized history per tenant.
3. There can be multiple writers at a given time, but only one of them will win to become part of the linearized history.
4. ************************************************************************************Alternative ideas mentioned during meetings that should be turned into a written prospoal like this one:************************************************************************************
1. @Dmitry Rodionov : having linearized storage of index_part.json in some database that allows serializable transactions / atomic compare-and-swap PUT
2. @Dmitry Rodionov :
3. @Stas : something like this scheme, but somehow find a way to equate attachment duration with transaction duration, without losing work if pageserver dies months after attachment.

View File

@@ -31,8 +31,6 @@ fn lsn_invalid() -> Lsn {
#[serde_as]
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct SkTimelineInfo {
/// Term.
pub term: Option<u64>,
/// Term of the last entry.
pub last_log_term: Option<u64>,
/// LSN of the last record.
@@ -60,6 +58,4 @@ pub struct SkTimelineInfo {
/// A connection string to use for WAL receiving.
#[serde(default)]
pub safekeeper_connstr: Option<String>,
#[serde(default)]
pub http_connstr: Option<String>,
}

View File

@@ -26,7 +26,6 @@ serde_json.workspace = true
signal-hook.workspace = true
thiserror.workspace = true
tokio.workspace = true
tokio-util.workspace = true
tracing.workspace = true
tracing-error.workspace = true
tracing-subscriber = { workspace = true, features = ["json", "registry"] }
@@ -38,7 +37,6 @@ url.workspace = true
uuid.workspace = true
pq_proto.workspace = true
postgres_connection.workspace = true
metrics.workspace = true
workspace_hack.workspace = true

View File

@@ -1,31 +1,18 @@
use std::fmt::{Debug, Display};
use futures::Future;
use tokio_util::sync::CancellationToken;
pub const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
pub const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;
pub async fn exponential_backoff(
n: u32,
base_increment: f64,
max_seconds: f64,
cancel: &CancellationToken,
) {
pub async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) {
let backoff_duration_seconds =
exponential_backoff_duration_seconds(n, base_increment, max_seconds);
if backoff_duration_seconds > 0.0 {
tracing::info!(
"Backoff: waiting {backoff_duration_seconds} seconds before processing with the task",
);
drop(
tokio::time::timeout(
std::time::Duration::from_secs_f64(backoff_duration_seconds),
cancel.cancelled(),
)
.await,
)
tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await;
}
}
@@ -37,57 +24,28 @@ pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_sec
}
}
/// Configure cancellation for a retried operation: when to cancel (the token), and
/// what kind of error to return on cancellation
pub struct Cancel<E, CF>
where
E: Display + Debug + 'static,
CF: Fn() -> E,
{
token: CancellationToken,
on_cancel: CF,
}
impl<E, CF> Cancel<E, CF>
where
E: Display + Debug + 'static,
CF: Fn() -> E,
{
pub fn new(token: CancellationToken, on_cancel: CF) -> Self {
Self { token, on_cancel }
}
}
/// retries passed operation until one of the following conditions are met:
/// Encountered error is considered as permanent (non-retryable)
/// Retries have been exhausted.
/// `is_permanent` closure should be used to provide distinction between permanent/non-permanent errors
/// When attempts cross `warn_threshold` function starts to emit log warnings.
/// `description` argument is added to log messages. Its value should identify the `op` is doing
/// `cancel` argument is required: any time we are looping on retry, we should be using a CancellationToken
/// to drop out promptly on shutdown.
pub async fn retry<T, O, F, E, CF>(
pub async fn retry<T, O, F, E>(
mut op: O,
is_permanent: impl Fn(&E) -> bool,
warn_threshold: u32,
max_retries: u32,
description: &str,
cancel: Cancel<E, CF>,
) -> Result<T, E>
where
// Not std::error::Error because anyhow::Error doesnt implement it.
// For context see https://github.com/dtolnay/anyhow/issues/63
E: Display + Debug + 'static,
E: Display + Debug,
O: FnMut() -> F,
F: Future<Output = Result<T, E>>,
CF: Fn() -> E,
{
let mut attempts = 0;
loop {
if cancel.token.is_cancelled() {
return Err((cancel.on_cancel)());
}
let result = op().await;
match result {
Ok(_) => {
@@ -122,7 +80,6 @@ where
attempts,
DEFAULT_BASE_BACKOFF_SECONDS,
DEFAULT_MAX_BACKOFF_SECONDS,
&cancel.token,
)
.await;
attempts += 1;
@@ -175,7 +132,6 @@ mod tests {
1,
1,
"work",
Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
)
.await;
@@ -201,7 +157,6 @@ mod tests {
2,
2,
"work",
Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
)
.await
.unwrap();
@@ -224,7 +179,6 @@ mod tests {
2,
2,
"work",
Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
)
.await
.unwrap_err();

View File

@@ -58,8 +58,6 @@ pub mod serde_regex;
pub mod pageserver_feedback;
pub mod postgres_client;
pub mod tracing_span_assert;
pub mod rate_limit;
@@ -70,6 +68,44 @@ pub mod completion;
/// Reporting utilities
pub mod error;
mod failpoint_macro_helpers {
/// use with fail::cfg("$name", "return(2000)")
///
/// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
/// specified time (in milliseconds). The main difference is that we use async
/// tokio sleep function. Another difference is that we print lines to the log,
/// which can be useful in tests to check that the failpoint was hit.
#[macro_export]
macro_rules! failpoint_sleep_millis_async {
($name:literal) => {{
// If the failpoint is used with a "return" action, set should_sleep to the
// returned value (as string). Otherwise it's set to None.
let should_sleep = (|| {
::fail::fail_point!($name, |x| x);
::std::option::Option::None
})();
// Sleep if the action was a returned value
if let ::std::option::Option::Some(duration_str) = should_sleep {
$crate::failpoint_sleep_helper($name, duration_str).await
}
}};
}
// Helper function used by the macro. (A function has nicer scoping so we
// don't need to decorate everything with "::")
pub async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
let millis = duration_str.parse::<u64>().unwrap();
let d = std::time::Duration::from_millis(millis);
tracing::info!("failpoint {:?}: sleeping for {:?}", name, d);
tokio::time::sleep(d).await;
tracing::info!("failpoint {:?}: sleep done", name);
}
}
pub use failpoint_macro_helpers::failpoint_sleep_helper;
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
///
/// we have several cases:

View File

@@ -1,37 +0,0 @@
//! Postgres client connection code common to other crates (safekeeper and
//! pageserver) which depends on tenant/timeline ids and thus not fitting into
//! postgres_connection crate.
use anyhow::Context;
use postgres_connection::{parse_host_port, PgConnectionConfig};
use crate::id::TenantTimelineId;
/// Create client config for fetching WAL from safekeeper on particular timeline.
/// listen_pg_addr_str is in form host:\[port\].
pub fn wal_stream_connection_config(
TenantTimelineId {
tenant_id,
timeline_id,
}: TenantTimelineId,
listen_pg_addr_str: &str,
auth_token: Option<&str>,
availability_zone: Option<&str>,
) -> anyhow::Result<PgConnectionConfig> {
let (host, port) =
parse_host_port(listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?;
let port = port.unwrap_or(5432);
let mut connstr = PgConnectionConfig::new_host_port(host, port)
.extend_options([
"-c".to_owned(),
format!("timeline_id={}", timeline_id),
format!("tenant_id={}", tenant_id),
])
.set_password(auth_token.map(|s| s.to_owned()));
if let Some(availability_zone) = availability_zone {
connstr = connstr.extend_options([format!("availability_zone={}", availability_zone)]);
}
Ok(connstr)
}

View File

@@ -1,31 +0,0 @@
[package]
name = "vm_monitor"
version = "0.1.0"
edition.workspace = true
license.workspace = true
[[bin]]
name = "vm-monitor"
path = "./src/bin/monitor.rs"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
anyhow.workspace = true
axum.workspace = true
clap.workspace = true
futures.workspace = true
inotify.workspace = true
serde.workspace = true
serde_json.workspace = true
sysinfo.workspace = true
tokio.workspace = true
tokio-postgres.workspace = true
tokio-stream.workspace = true
tokio-util.workspace = true
tracing.workspace = true
tracing-subscriber.workspace = true
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
[target.'cfg(target_os = "linux")'.dependencies]
cgroups-rs = "0.3.3"

View File

@@ -1,34 +0,0 @@
# `vm-monitor`
The `vm-monitor` (or just monitor) is a core component of the autoscaling system,
along with the `autoscale-scheduler` and the `autoscaler-agent`s. The monitor has
two primary roles: 1) notifying agents when immediate upscaling is necessary due
to memory conditions and 2) managing Postgres' file cache and a cgroup to carry
out upscaling and downscaling decisions.
## More on scaling
We scale CPU and memory using NeonVM, our in-house QEMU tool for use with Kubernetes.
To control thresholds for receiving memory usage notifications, we start Postgres
in the `neon-postgres` cgroup and set its `memory.{max,high}`.
* See also: [`neondatabase/autoscaling`](https://github.com/neondatabase/autoscaling/)
* See also: [`neondatabase/vm-monitor`](https://github.com/neondatabase/vm-monitor/),
where initial development of the monitor happened. The repository is no longer
maintained but the commit history may be useful for debugging.
## Structure
The `vm-monitor` is loosely comprised of a few systems. These are:
* the server: this is just a simple `axum` server that accepts requests and
upgrades them to websocket connections. The server only allows one connection at
a time. This means that upon receiving a new connection, the server will terminate
and old one if it exists.
* the filecache: a struct that allows communication with the Postgres file cache.
On startup, we connect to the filecache and hold on to the connection for the
entire monitor lifetime.
* the cgroup watcher: the `CgroupWatcher` manages the `neon-postgres` cgroup by
listening for `memory.high` events and setting its `memory.{high,max}` values.
* the runner: the runner marries the filecache and cgroup watcher together,
communicating with the agent throught the `Dispatcher`, and then calling filecache
and cgroup watcher functions as needed to upscale and downscale

View File

@@ -1,33 +0,0 @@
// We expose a standalone binary _and_ start the monitor in `compute_ctl` so that
// we can test the monitor as part of the entire autoscaling system in
// neondatabase/autoscaling.
//
// The monitor was previously started by vm-builder, and for testing purposes,
// we can mimic that setup with this binary.
#[cfg(target_os = "linux")]
#[tokio::main]
async fn main() -> anyhow::Result<()> {
use clap::Parser;
use tokio_util::sync::CancellationToken;
use tracing_subscriber::EnvFilter;
use vm_monitor::Args;
let subscriber = tracing_subscriber::fmt::Subscriber::builder()
.json()
.with_file(true)
.with_line_number(true)
.with_span_list(true)
.with_env_filter(EnvFilter::from_default_env())
.finish();
tracing::subscriber::set_global_default(subscriber)?;
let args: &'static Args = Box::leak(Box::new(Args::parse()));
let token = CancellationToken::new();
vm_monitor::start(args, token).await
}
#[cfg(not(target_os = "linux"))]
fn main() {
panic!("the monitor requires cgroups, which are only available on linux")
}

View File

@@ -1,693 +0,0 @@
use std::{
fmt::{Debug, Display},
fs,
pin::pin,
sync::atomic::{AtomicU64, Ordering},
};
use anyhow::{anyhow, bail, Context};
use cgroups_rs::{
freezer::FreezerController,
hierarchies::{self, is_cgroup2_unified_mode, UNIFIED_MOUNTPOINT},
memory::MemController,
MaxValue,
Subsystem::{Freezer, Mem},
};
use inotify::{EventStream, Inotify, WatchMask};
use tokio::sync::mpsc::{self, error::TryRecvError};
use tokio::time::{Duration, Instant};
use tokio_stream::{Stream, StreamExt};
use tracing::{info, warn};
use crate::protocol::Resources;
use crate::MiB;
/// Monotonically increasing counter of the number of memory.high events
/// the cgroup has experienced.
///
/// We use this to determine if a modification to the `memory.events` file actually
/// changed the `high` field. If not, we don't care about the change. When we
/// read the file, we check the `high` field in the file against `MEMORY_EVENT_COUNT`
/// to see if it changed since last time.
pub static MEMORY_EVENT_COUNT: AtomicU64 = AtomicU64::new(0);
/// Monotonically increasing counter that gives each cgroup event a unique id.
///
/// This allows us to answer questions like "did this upscale arrive before this
/// memory.high?". This static is also used by the `Sequenced` type to "tag" values
/// with a sequence number. As such, prefer to used the `Sequenced` type rather
/// than this static directly.
static EVENT_SEQUENCE_NUMBER: AtomicU64 = AtomicU64::new(0);
/// A memory event type reported in memory.events.
#[derive(Debug, Eq, PartialEq, Copy, Clone)]
pub enum MemoryEvent {
Low,
High,
Max,
Oom,
OomKill,
OomGroupKill,
}
impl MemoryEvent {
fn as_str(&self) -> &str {
match self {
MemoryEvent::Low => "low",
MemoryEvent::High => "high",
MemoryEvent::Max => "max",
MemoryEvent::Oom => "oom",
MemoryEvent::OomKill => "oom_kill",
MemoryEvent::OomGroupKill => "oom_group_kill",
}
}
}
impl Display for MemoryEvent {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str(self.as_str())
}
}
/// Configuration for a `CgroupWatcher`
#[derive(Debug, Clone)]
pub struct Config {
// The target difference between the total memory reserved for the cgroup
// and the value of the cgroup's memory.high.
//
// In other words, memory.high + oom_buffer_bytes will equal the total memory that the cgroup may
// use (equal to system memory, minus whatever's taken out for the file cache).
oom_buffer_bytes: u64,
// The amount of memory, in bytes, below a proposed new value for
// memory.high that the cgroup's memory usage must be for us to downscale
//
// In other words, we can downscale only when:
//
// memory.current + memory_high_buffer_bytes < (proposed) memory.high
//
// TODO: there's some minor issues with this approach -- in particular, that we might have
// memory in use by the kernel's page cache that we're actually ok with getting rid of.
pub(crate) memory_high_buffer_bytes: u64,
// The maximum duration, in milliseconds, that we're allowed to pause
// the cgroup for while waiting for the autoscaler-agent to upscale us
max_upscale_wait: Duration,
// The required minimum time, in milliseconds, that we must wait before re-freezing
// the cgroup while waiting for the autoscaler-agent to upscale us.
do_not_freeze_more_often_than: Duration,
// The amount of memory, in bytes, that we should periodically increase memory.high
// by while waiting for the autoscaler-agent to upscale us.
//
// This exists to avoid the excessive throttling that happens when a cgroup is above its
// memory.high for too long. See more here:
// https://github.com/neondatabase/autoscaling/issues/44#issuecomment-1522487217
memory_high_increase_by_bytes: u64,
// The period, in milliseconds, at which we should repeatedly increase the value
// of the cgroup's memory.high while we're waiting on upscaling and memory.high
// is still being hit.
//
// Technically speaking, this actually serves as a rate limit to moderate responding to
// memory.high events, but these are roughly equivalent if the process is still allocating
// memory.
memory_high_increase_every: Duration,
}
impl Config {
/// Calculate the new value for the cgroups memory.high based on system memory
pub fn calculate_memory_high_value(&self, total_system_mem: u64) -> u64 {
total_system_mem.saturating_sub(self.oom_buffer_bytes)
}
}
impl Default for Config {
fn default() -> Self {
Self {
oom_buffer_bytes: 100 * MiB,
memory_high_buffer_bytes: 100 * MiB,
// while waiting for upscale, don't freeze for more than 20ms every 1s
max_upscale_wait: Duration::from_millis(20),
do_not_freeze_more_often_than: Duration::from_millis(1000),
// while waiting for upscale, increase memory.high by 10MiB every 25ms
memory_high_increase_by_bytes: 10 * MiB,
memory_high_increase_every: Duration::from_millis(25),
}
}
}
/// Used to represent data that is associated with a certain point in time, such
/// as an upscale request or memory.high event.
///
/// Internally, creating a `Sequenced` uses a static atomic counter to obtain
/// a unique sequence number. Sequence numbers are monotonically increasing,
/// allowing us to answer questions like "did this upscale happen after this
/// memory.high event?" by comparing the sequence numbers of the two events.
#[derive(Debug, Clone)]
pub struct Sequenced<T> {
seqnum: u64,
data: T,
}
impl<T> Sequenced<T> {
pub fn new(data: T) -> Self {
Self {
seqnum: EVENT_SEQUENCE_NUMBER.fetch_add(1, Ordering::AcqRel),
data,
}
}
}
/// Responds to `MonitorEvents` to manage the cgroup: preventing it from being
/// OOM killed or throttling.
///
/// The `CgroupWatcher` primarily achieves this by reading from a stream of
/// `MonitorEvent`s. See `main_signals_loop` for details on how to keep the
/// cgroup happy.
#[derive(Debug)]
pub struct CgroupWatcher {
pub config: Config,
/// The sequence number of the last upscale.
///
/// If we receive a memory.high event that has a _lower_ sequence number than
/// `last_upscale_seqnum`, then we know it occured before the upscale, and we
/// can safely ignore it.
///
/// Note: Like the `events` field, this doesn't _need_ interior mutability but we
/// use it anyways so that methods take `&self`, not `&mut self`.
last_upscale_seqnum: AtomicU64,
/// A channel on which we send messages to request upscale from the dispatcher.
upscale_requester: mpsc::Sender<()>,
/// The actual cgroup we are watching and managing.
cgroup: cgroups_rs::Cgroup,
}
/// Read memory.events for the desired event type.
///
/// `path` specifies the path to the desired `memory.events` file.
/// For more info, see the `memory.events` section of the [kernel docs]
/// <https://docs.kernel.org/admin-guide/cgroup-v2.html#memory-interface-files>
fn get_event_count(path: &str, event: MemoryEvent) -> anyhow::Result<u64> {
let contents = fs::read_to_string(path)
.with_context(|| format!("failed to read memory.events from {path}"))?;
// Then contents of the file look like:
// low 42
// high 101
// ...
contents
.lines()
.filter_map(|s| s.split_once(' '))
.find(|(e, _)| *e == event.as_str())
.ok_or_else(|| anyhow!("failed to find entry for memory.{event} events in {path}"))
.and_then(|(_, count)| {
count
.parse::<u64>()
.with_context(|| format!("failed to parse memory.{event} as u64"))
})
}
/// Create an event stream that produces events whenever the file at the provided
/// path is modified.
fn create_file_watcher(path: &str) -> anyhow::Result<EventStream<[u8; 1024]>> {
info!("creating file watcher for {path}");
let inotify = Inotify::init().context("failed to initialize file watcher")?;
inotify
.watches()
.add(path, WatchMask::MODIFY)
.with_context(|| format!("failed to start watching {path}"))?;
inotify
// The inotify docs use [0u8; 1024] so we'll just copy them. We only need
// to store one event at a time - if the event gets written over, that's
// ok. We still see that there is an event. For more information, see:
// https://man7.org/linux/man-pages/man7/inotify.7.html
.into_event_stream([0u8; 1024])
.context("failed to start inotify event stream")
}
impl CgroupWatcher {
/// Create a new `CgroupWatcher`.
#[tracing::instrument(skip_all, fields(%name))]
pub fn new(
name: String,
// A channel on which to send upscale requests
upscale_requester: mpsc::Sender<()>,
) -> anyhow::Result<(Self, impl Stream<Item = Sequenced<u64>>)> {
// TODO: clarify exactly why we need v2
// Make sure cgroups v2 (aka unified) are supported
if !is_cgroup2_unified_mode() {
anyhow::bail!("cgroups v2 not supported");
}
let cgroup = cgroups_rs::Cgroup::load(hierarchies::auto(), &name);
// Start monitoring the cgroup for memory events. In general, for
// cgroups v2 (aka unified), metrics are reported in files like
// > `/sys/fs/cgroup/{name}/{metric}`
// We are looking for `memory.high` events, which are stored in the
// file `memory.events`. For more info, see the `memory.events` section
// of https://docs.kernel.org/admin-guide/cgroup-v2.html#memory-interface-files
let path = format!("{}/{}/memory.events", UNIFIED_MOUNTPOINT, &name);
let memory_events = create_file_watcher(&path)
.with_context(|| format!("failed to create event watcher for {path}"))?
// This would be nice with with .inspect_err followed by .ok
.filter_map(move |_| match get_event_count(&path, MemoryEvent::High) {
Ok(high) => Some(high),
Err(error) => {
// TODO: Might want to just panic here
warn!(?error, "failed to read high events count from {}", &path);
None
}
})
// Only report the event if the memory.high count increased
.filter_map(|high| {
if MEMORY_EVENT_COUNT.fetch_max(high, Ordering::AcqRel) < high {
Some(high)
} else {
None
}
})
.map(Sequenced::new);
let initial_count = get_event_count(
&format!("{}/{}/memory.events", UNIFIED_MOUNTPOINT, &name),
MemoryEvent::High,
)?;
info!(initial_count, "initial memory.high event count");
// Hard update `MEMORY_EVENT_COUNT` since there could have been processes
// running in the cgroup before that caused it to be non-zero.
MEMORY_EVENT_COUNT.fetch_max(initial_count, Ordering::AcqRel);
Ok((
Self {
cgroup,
upscale_requester,
last_upscale_seqnum: AtomicU64::new(0),
config: Default::default(),
},
memory_events,
))
}
/// The entrypoint for the `CgroupWatcher`.
#[tracing::instrument(skip_all)]
pub async fn watch<E>(
&self,
// These are ~dependency injected~ (fancy, I know) because this function
// should never return.
// -> therefore: when we tokio::spawn it, we don't await the JoinHandle.
// -> therefore: if we want to stick it in an Arc so many threads can access
// it, methods can never take mutable access.
// - note: we use the Arc strategy so that a) we can call this function
// right here and b) the runner can call the set/get_memory methods
// -> since calling recv() on a tokio::sync::mpsc::Receiver takes &mut self,
// we just pass them in here instead of holding them in fields, as that
// would require this method to take &mut self.
mut upscales: mpsc::Receiver<Sequenced<Resources>>,
events: E,
) -> anyhow::Result<()>
where
E: Stream<Item = Sequenced<u64>>,
{
// There are several actions might do when receiving a `memory.high`,
// such as freezing the cgroup, or increasing its `memory.high`. We don't
// want to do these things too often (because postgres needs to run, and
// we only have so much memory). These timers serve as rate limits for this.
let mut wait_to_freeze = pin!(tokio::time::sleep(Duration::ZERO));
let mut wait_to_increase_memory_high = pin!(tokio::time::sleep(Duration::ZERO));
let mut events = pin!(events);
// Are we waiting to be upscaled? Could be true if we request upscale due
// to a memory.high event and it does not arrive in time.
let mut waiting_on_upscale = false;
loop {
tokio::select! {
upscale = upscales.recv() => {
let Sequenced { seqnum, data } = upscale
.context("failed to listen on upscale notification channel")?;
self.last_upscale_seqnum.store(seqnum, Ordering::Release);
info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale");
}
event = events.next() => {
let Some(Sequenced { seqnum, .. }) = event else {
bail!("failed to listen for memory.high events")
};
// The memory.high came before our last upscale, so we consider
// it resolved
if self.last_upscale_seqnum.fetch_max(seqnum, Ordering::AcqRel) > seqnum {
info!(
"received memory.high event, but it came before our last upscale -> ignoring it"
);
continue;
}
// The memory.high came after our latest upscale. We don't
// want to do anything yet, so peek the next event in hopes
// that it's an upscale.
if let Some(upscale_num) = self
.upscaled(&mut upscales)
.context("failed to check if we were upscaled")?
{
if upscale_num > seqnum {
info!(
"received memory.high event, but it came before our last upscale -> ignoring it"
);
continue;
}
}
// If it's been long enough since we last froze, freeze the
// cgroup and request upscale
if wait_to_freeze.is_elapsed() {
info!("received memory.high event -> requesting upscale");
waiting_on_upscale = self
.handle_memory_high_event(&mut upscales)
.await
.context("failed to handle upscale")?;
wait_to_freeze
.as_mut()
.reset(Instant::now() + self.config.do_not_freeze_more_often_than);
continue;
}
// Ok, we can't freeze, just request upscale
if !waiting_on_upscale {
info!("received memory.high event, but too soon to refreeze -> requesting upscale");
// Make check to make sure we haven't been upscaled in the
// meantine (can happen if the agent independently decides
// to upscale us again)
if self
.upscaled(&mut upscales)
.context("failed to check if we were upscaled")?
.is_some()
{
info!("no need to request upscaling because we got upscaled");
continue;
}
self.upscale_requester
.send(())
.await
.context("failed to request upscale")?;
continue;
}
// Shoot, we can't freeze or and we're still waiting on upscale,
// increase memory.high to reduce throttling
if wait_to_increase_memory_high.is_elapsed() {
info!(
"received memory.high event, \
but too soon to refreeze and already requested upscale \
-> increasing memory.high"
);
// Make check to make sure we haven't been upscaled in the
// meantine (can happen if the agent independently decides
// to upscale us again)
if self
.upscaled(&mut upscales)
.context("failed to check if we were upscaled")?
.is_some()
{
info!("no need to increase memory.high because got upscaled");
continue;
}
// Request upscale anyways (the agent will handle deduplicating
// requests)
self.upscale_requester
.send(())
.await
.context("failed to request upscale")?;
let memory_high =
self.get_high_bytes().context("failed to get memory.high")?;
let new_high = memory_high + self.config.memory_high_increase_by_bytes;
info!(
current_high_bytes = memory_high,
new_high_bytes = new_high,
"updating memory.high"
);
self.set_high_bytes(new_high)
.context("failed to set memory.high")?;
wait_to_increase_memory_high
.as_mut()
.reset(Instant::now() + self.config.memory_high_increase_every)
}
// we can't do anything
}
};
}
}
/// Handle a `memory.high`, returning whether we are still waiting on upscale
/// by the time the function returns.
///
/// The general plan for handling a `memory.high` event is as follows:
/// 1. Freeze the cgroup
/// 2. Start a timer for `self.config.max_upscale_wait`
/// 3. Request upscale
/// 4. After the timer elapses or we receive upscale, thaw the cgroup.
/// 5. Return whether or not we are still waiting for upscale. If we are,
/// we'll increase the cgroups memory.high to avoid getting oom killed
#[tracing::instrument(skip_all)]
async fn handle_memory_high_event(
&self,
upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
) -> anyhow::Result<bool> {
// Immediately freeze the cgroup before doing anything else.
info!("received memory.high event -> freezing cgroup");
self.freeze().context("failed to freeze cgroup")?;
// We'll use this for logging durations
let start_time = Instant::now();
// Await the upscale until we have to unfreeze
let timed =
tokio::time::timeout(self.config.max_upscale_wait, self.await_upscale(upscales));
// Request the upscale
info!(
wait = ?self.config.max_upscale_wait,
"sending request for immediate upscaling",
);
self.upscale_requester
.send(())
.await
.context("failed to request upscale")?;
let waiting_on_upscale = match timed.await {
Ok(Ok(())) => {
info!(elapsed = ?start_time.elapsed(), "received upscale in time");
false
}
// **important**: unfreeze the cgroup before ?-reporting the error
Ok(Err(e)) => {
info!("error waiting for upscale -> thawing cgroup");
self.thaw()
.context("failed to thaw cgroup after errored waiting for upscale")?;
Err(e.context("failed to await upscale"))?
}
Err(_) => {
info!(elapsed = ?self.config.max_upscale_wait, "timed out waiting for upscale");
true
}
};
info!("thawing cgroup");
self.thaw().context("failed to thaw cgroup")?;
Ok(waiting_on_upscale)
}
/// Checks whether we were just upscaled, returning the upscale's sequence
/// number if so.
#[tracing::instrument(skip_all)]
fn upscaled(
&self,
upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
) -> anyhow::Result<Option<u64>> {
let Sequenced { seqnum, data } = match upscales.try_recv() {
Ok(upscale) => upscale,
Err(TryRecvError::Empty) => return Ok(None),
Err(TryRecvError::Disconnected) => {
bail!("upscale notification channel was disconnected")
}
};
// Make sure to update the last upscale sequence number
self.last_upscale_seqnum.store(seqnum, Ordering::Release);
info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale");
Ok(Some(seqnum))
}
/// Await an upscale event, discarding any `memory.high` events received in
/// the process.
///
/// This is used in `handle_memory_high_event`, where we need to listen
/// for upscales in particular so we know if we can thaw the cgroup early.
#[tracing::instrument(skip_all)]
async fn await_upscale(
&self,
upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
) -> anyhow::Result<()> {
let Sequenced { seqnum, .. } = upscales
.recv()
.await
.context("error listening for upscales")?;
self.last_upscale_seqnum.store(seqnum, Ordering::Release);
Ok(())
}
/// Get the cgroup's name.
pub fn path(&self) -> &str {
self.cgroup.path()
}
}
/// Represents a set of limits we apply to a cgroup to control memory usage.
///
/// Setting these values also affects the thresholds for receiving usage alerts.
#[derive(Debug)]
pub struct MemoryLimits {
high: u64,
max: u64,
}
impl MemoryLimits {
pub fn new(high: u64, max: u64) -> Self {
Self { max, high }
}
}
// Methods for manipulating the actual cgroup
impl CgroupWatcher {
/// Get a handle on the freezer subsystem.
fn freezer(&self) -> anyhow::Result<&FreezerController> {
if let Some(Freezer(freezer)) = self
.cgroup
.subsystems()
.iter()
.find(|sub| matches!(sub, Freezer(_)))
{
Ok(freezer)
} else {
anyhow::bail!("could not find freezer subsystem")
}
}
/// Attempt to freeze the cgroup.
pub fn freeze(&self) -> anyhow::Result<()> {
self.freezer()
.context("failed to get freezer subsystem")?
.freeze()
.context("failed to freeze")
}
/// Attempt to thaw the cgroup.
pub fn thaw(&self) -> anyhow::Result<()> {
self.freezer()
.context("failed to get freezer subsystem")?
.thaw()
.context("failed to thaw")
}
/// Get a handle on the memory subsystem.
///
/// Note: this method does not require `self.memory_update_lock` because
/// getting a handle to the subsystem does not access any of the files we
/// care about, such as memory.high and memory.events
fn memory(&self) -> anyhow::Result<&MemController> {
if let Some(Mem(memory)) = self
.cgroup
.subsystems()
.iter()
.find(|sub| matches!(sub, Mem(_)))
{
Ok(memory)
} else {
anyhow::bail!("could not find memory subsystem")
}
}
/// Get cgroup current memory usage.
pub fn current_memory_usage(&self) -> anyhow::Result<u64> {
Ok(self
.memory()
.context("failed to get memory subsystem")?
.memory_stat()
.usage_in_bytes)
}
/// Set cgroup memory.high threshold.
pub fn set_high_bytes(&self, bytes: u64) -> anyhow::Result<()> {
self.memory()
.context("failed to get memory subsystem")?
.set_mem(cgroups_rs::memory::SetMemory {
low: None,
high: Some(MaxValue::Value(u64::min(bytes, i64::MAX as u64) as i64)),
min: None,
max: None,
})
.context("failed to set memory.high")
}
/// Set cgroup memory.high and memory.max.
pub fn set_limits(&self, limits: &MemoryLimits) -> anyhow::Result<()> {
info!(
limits.high,
limits.max,
path = self.path(),
"writing new memory limits",
);
self.memory()
.context("failed to get memory subsystem while setting memory limits")?
.set_mem(cgroups_rs::memory::SetMemory {
min: None,
low: None,
high: Some(MaxValue::Value(
u64::min(limits.high, i64::MAX as u64) as i64
)),
max: Some(MaxValue::Value(u64::min(limits.max, i64::MAX as u64) as i64)),
})
.context("failed to set memory limits")
}
/// Given some amount of available memory, set the desired cgroup memory limits
pub fn set_memory_limits(&mut self, available_memory: u64) -> anyhow::Result<()> {
let new_high = self.config.calculate_memory_high_value(available_memory);
let limits = MemoryLimits::new(new_high, available_memory);
info!(
path = self.path(),
memory = ?limits,
"setting cgroup memory",
);
self.set_limits(&limits)
.context("failed to set cgroup memory limits")?;
Ok(())
}
/// Get memory.high threshold.
pub fn get_high_bytes(&self) -> anyhow::Result<u64> {
let high = self
.memory()
.context("failed to get memory subsystem while getting memory statistics")?
.get_mem()
.map(|mem| mem.high)
.context("failed to get memory statistics from subsystem")?;
match high {
Some(MaxValue::Max) => Ok(i64::MAX as u64),
Some(MaxValue::Value(high)) => Ok(high as u64),
None => anyhow::bail!("failed to read memory.high from memory subsystem"),
}
}
}

View File

@@ -1,153 +0,0 @@
//! Managing the websocket connection and other signals in the monitor.
//!
//! Contains types that manage the interaction (not data interchange, see `protocol`)
//! between agent and monitor, allowing us to to process and send messages in a
//! straightforward way. The dispatcher also manages that signals that come from
//! the cgroup (requesting upscale), and the signals that go to the cgroup
//! (notifying it of upscale).
use anyhow::{bail, Context};
use axum::extract::ws::{Message, WebSocket};
use futures::{
stream::{SplitSink, SplitStream},
SinkExt, StreamExt,
};
use tokio::sync::mpsc;
use tracing::info;
use crate::cgroup::Sequenced;
use crate::protocol::{
OutboundMsg, ProtocolRange, ProtocolResponse, ProtocolVersion, Resources, PROTOCOL_MAX_VERSION,
PROTOCOL_MIN_VERSION,
};
/// The central handler for all communications in the monitor.
///
/// The dispatcher has two purposes:
/// 1. Manage the connection to the agent, sending and receiving messages.
/// 2. Communicate with the cgroup manager, notifying it when upscale is received,
/// and sending a message to the agent when the cgroup manager requests
/// upscale.
#[derive(Debug)]
pub struct Dispatcher {
/// We read agent messages of of `source`
pub(crate) source: SplitStream<WebSocket>,
/// We send messages to the agent through `sink`
sink: SplitSink<WebSocket, Message>,
/// Used to notify the cgroup when we are upscaled.
pub(crate) notify_upscale_events: mpsc::Sender<Sequenced<Resources>>,
/// When the cgroup requests upscale it will send on this channel. In response
/// we send an `UpscaleRequst` to the agent.
pub(crate) request_upscale_events: mpsc::Receiver<()>,
/// The protocol version we have agreed to use with the agent. This is negotiated
/// during the creation of the dispatcher, and should be the highest shared protocol
/// version.
///
// NOTE: currently unused, but will almost certainly be used in the futures
// as the protocol changes
#[allow(unused)]
pub(crate) proto_version: ProtocolVersion,
}
impl Dispatcher {
/// Creates a new dispatcher using the passed-in connection.
///
/// Performs a negotiation with the agent to determine the highest protocol
/// version that both support. This consists of two steps:
/// 1. Wait for the agent to sent the range of protocols it supports.
/// 2. Send a protocol version that works for us as well, or an error if there
/// is no compatible version.
pub async fn new(
stream: WebSocket,
notify_upscale_events: mpsc::Sender<Sequenced<Resources>>,
request_upscale_events: mpsc::Receiver<()>,
) -> anyhow::Result<Self> {
let (mut sink, mut source) = stream.split();
// Figure out the highest protocol version we both support
info!("waiting for agent to send protocol version range");
let Some(message) = source.next().await else {
bail!("websocket connection closed while performing protocol handshake")
};
let message = message.context("failed to read protocol version range off connection")?;
let Message::Text(message_text) = message else {
// All messages should be in text form, since we don't do any
// pinging/ponging. See nhooyr/websocket's implementation and the
// agent for more info
bail!("received non-text message during proocol handshake: {message:?}")
};
let monitor_range = ProtocolRange {
min: PROTOCOL_MIN_VERSION,
max: PROTOCOL_MAX_VERSION,
};
let agent_range: ProtocolRange = serde_json::from_str(&message_text)
.context("failed to deserialize protocol version range")?;
info!(range = ?agent_range, "received protocol version range");
let highest_shared_version = match monitor_range.highest_shared_version(&agent_range) {
Ok(version) => {
sink.send(Message::Text(
serde_json::to_string(&ProtocolResponse::Version(version)).unwrap(),
))
.await
.context("failed to notify agent of negotiated protocol version")?;
version
}
Err(e) => {
sink.send(Message::Text(
serde_json::to_string(&ProtocolResponse::Error(format!(
"Received protocol version range {} which does not overlap with {}",
agent_range, monitor_range
)))
.unwrap(),
))
.await
.context("failed to notify agent of no overlap between protocol version ranges")?;
Err(e).context("error determining suitable protocol version range")?
}
};
Ok(Self {
sink,
source,
notify_upscale_events,
request_upscale_events,
proto_version: highest_shared_version,
})
}
/// Notify the cgroup manager that we have received upscale and wait for
/// the acknowledgement.
#[tracing::instrument(skip_all, fields(?resources))]
pub async fn notify_upscale(&self, resources: Sequenced<Resources>) -> anyhow::Result<()> {
self.notify_upscale_events
.send(resources)
.await
.context("failed to send resources and oneshot sender across channel")
}
/// Send a message to the agent.
///
/// Although this function is small, it has one major benefit: it is the only
/// way to send data accross the connection, and you can only pass in a proper
/// `MonitorMessage`. Without safeguards like this, it's easy to accidentally
/// serialize the wrong thing and send it, since `self.sink.send` will take
/// any string.
pub async fn send(&mut self, message: OutboundMsg) -> anyhow::Result<()> {
info!(?message, "sending message");
let json = serde_json::to_string(&message).context("failed to serialize message")?;
self.sink
.send(Message::Text(json))
.await
.context("stream error sending message")
}
}

View File

@@ -1,316 +0,0 @@
//! Logic for configuring and scaling the Postgres file cache.
use std::num::NonZeroU64;
use crate::MiB;
use anyhow::{anyhow, Context};
use tokio_postgres::{types::ToSql, Client, NoTls, Row};
use tokio_util::sync::CancellationToken;
use tracing::{error, info};
/// Manages Postgres' file cache by keeping a connection open.
#[derive(Debug)]
pub struct FileCacheState {
client: Client,
conn_str: String,
pub(crate) config: FileCacheConfig,
/// A token for cancelling spawned threads during shutdown.
token: CancellationToken,
}
#[derive(Debug)]
pub struct FileCacheConfig {
/// Whether the file cache is *actually* stored in memory (e.g. by writing to
/// a tmpfs or shmem file). If true, the size of the file cache will be counted against the
/// memory available for the cgroup.
pub(crate) in_memory: bool,
/// The size of the file cache, in terms of the size of the resource it consumes
/// (currently: only memory)
///
/// For example, setting `resource_multipler = 0.75` gives the cache a target size of 75% of total
/// resources.
///
/// This value must be strictly between 0 and 1.
resource_multiplier: f64,
/// The required minimum amount of memory, in bytes, that must remain available
/// after subtracting the file cache.
///
/// This value must be non-zero.
min_remaining_after_cache: NonZeroU64,
/// Controls the rate of increase in the file cache's size as it grows from zero
/// (when total resources equals min_remaining_after_cache) to the desired size based on
/// `resource_multiplier`.
///
/// A `spread_factor` of zero means that all additional resources will go to the cache until it
/// reaches the desired size. Setting `spread_factor` to N roughly means "for every 1 byte added to
/// the cache's size, N bytes are reserved for the rest of the system, until the cache gets to
/// its desired size".
///
/// This value must be >= 0, and must retain an increase that is more than what would be given by
/// `resource_multiplier`. For example, setting `resource_multiplier` = 0.75 but `spread_factor` = 1
/// would be invalid, because `spread_factor` would induce only 50% usage - never reaching the 75%
/// as desired by `resource_multiplier`.
///
/// `spread_factor` is too large if `(spread_factor + 1) * resource_multiplier >= 1`.
spread_factor: f64,
}
impl FileCacheConfig {
pub fn default_in_memory() -> Self {
Self {
in_memory: true,
// 75 %
resource_multiplier: 0.75,
// 640 MiB; (512 + 128)
min_remaining_after_cache: NonZeroU64::new(640 * MiB).unwrap(),
// ensure any increase in file cache size is split 90-10 with 10% to other memory
spread_factor: 0.1,
}
}
pub fn default_on_disk() -> Self {
Self {
in_memory: false,
resource_multiplier: 0.75,
// 256 MiB - lower than when in memory because overcommitting is safe; if we don't have
// memory, the kernel will just evict from its page cache, rather than e.g. killing
// everything.
min_remaining_after_cache: NonZeroU64::new(256 * MiB).unwrap(),
spread_factor: 0.1,
}
}
/// Make sure fields of the config are consistent.
pub fn validate(&self) -> anyhow::Result<()> {
// Single field validity
anyhow::ensure!(
0.0 < self.resource_multiplier && self.resource_multiplier < 1.0,
"resource_multiplier must be between 0.0 and 1.0 exclusive, got {}",
self.resource_multiplier
);
anyhow::ensure!(
self.spread_factor >= 0.0,
"spread_factor must be >= 0, got {}",
self.spread_factor
);
// Check that `resource_multiplier` and `spread_factor` are valid w.r.t. each other.
//
// As shown in `calculate_cache_size`, we have two lines resulting from `resource_multiplier` and
// `spread_factor`, respectively. They are:
//
// `total` `min_remaining_after_cache`
// size = ————————————————————— - —————————————————————————————
// `spread_factor` + 1 `spread_factor` + 1
//
// and
//
// size = `resource_multiplier` × total
//
// .. where `total` is the total resources. These are isomorphic to the typical 'y = mx + b'
// form, with y = "size" and x = "total".
//
// These lines intersect at:
//
// `min_remaining_after_cache`
// ———————————————————————————————————————————————————
// 1 - `resource_multiplier` × (`spread_factor` + 1)
//
// We want to ensure that this value (a) exists, and (b) is >= `min_remaining_after_cache`. This is
// guaranteed when '`resource_multiplier` × (`spread_factor` + 1)' is less than 1.
// (We also need it to be >= 0, but that's already guaranteed.)
let intersect_factor = self.resource_multiplier * (self.spread_factor + 1.0);
anyhow::ensure!(
intersect_factor < 1.0,
"incompatible resource_multipler and spread_factor"
);
Ok(())
}
/// Calculate the desired size of the cache, given the total memory
pub fn calculate_cache_size(&self, total: u64) -> u64 {
// *Note*: all units are in bytes, until the very last line.
let available = total.saturating_sub(self.min_remaining_after_cache.get());
if available == 0 {
return 0;
}
// Conversions to ensure we don't overflow from floating-point ops
let size_from_spread =
i64::max(0, (available as f64 / (1.0 + self.spread_factor)) as i64) as u64;
let size_from_normal = (total as f64 * self.resource_multiplier) as u64;
let byte_size = u64::min(size_from_spread, size_from_normal);
// The file cache operates in units of mebibytes, so the sizes we produce should
// be rounded to a mebibyte. We round down to be conservative.
byte_size / MiB * MiB
}
}
impl FileCacheState {
/// Connect to the file cache.
#[tracing::instrument(skip_all, fields(%conn_str, ?config))]
pub async fn new(
conn_str: &str,
config: FileCacheConfig,
token: CancellationToken,
) -> anyhow::Result<Self> {
config.validate().context("file cache config is invalid")?;
info!(conn_str, "connecting to Postgres file cache");
let client = FileCacheState::connect(conn_str, token.clone())
.await
.context("failed to connect to postgres file cache")?;
let conn_str = conn_str.to_string();
Ok(Self {
client,
config,
conn_str,
token,
})
}
/// Connect to Postgres.
///
/// Aborts the spawned thread if the kill signal is received. This is not
/// a method as it is called in [`FileCacheState::new`].
#[tracing::instrument(skip_all, fields(%conn_str))]
async fn connect(conn_str: &str, token: CancellationToken) -> anyhow::Result<Client> {
let (client, conn) = tokio_postgres::connect(conn_str, NoTls)
.await
.context("failed to connect to pg client")?;
// The connection object performs the actual communication with the database,
// so spawn it off to run on its own. See tokio-postgres docs.
crate::spawn_with_cancel(
token,
|res| {
if let Err(error) = res {
error!(%error, "postgres error")
}
},
conn,
);
Ok(client)
}
/// Execute a query with a retry if necessary.
///
/// If the initial query fails, we restart the database connection and attempt
/// if again.
#[tracing::instrument(skip_all, fields(%statement))]
pub async fn query_with_retry(
&mut self,
statement: &str,
params: &[&(dyn ToSql + Sync)],
) -> anyhow::Result<Vec<Row>> {
match self
.client
.query(statement, params)
.await
.context("failed to execute query")
{
Ok(rows) => Ok(rows),
Err(e) => {
error!(error = ?e, "postgres error: {e} -> retrying");
let client = FileCacheState::connect(&self.conn_str, self.token.clone())
.await
.context("failed to connect to postgres file cache")?;
info!("successfully reconnected to postgres client");
// Replace the old client and attempt the query with the new one
self.client = client;
self.client
.query(statement, params)
.await
.context("failed to execute query a second time")
}
}
}
/// Get the current size of the file cache.
#[tracing::instrument(skip_all)]
pub async fn get_file_cache_size(&mut self) -> anyhow::Result<u64> {
self.query_with_retry(
// The file cache GUC variable is in MiB, but the conversion with
// pg_size_bytes means that the end result we get is in bytes.
"SELECT pg_size_bytes(current_setting('neon.file_cache_size_limit'));",
&[],
)
.await
.context("failed to query pg for file cache size")?
.first()
.ok_or_else(|| anyhow!("file cache size query returned no rows"))?
// pg_size_bytes returns a bigint which is the same as an i64.
.try_get::<_, i64>(0)
// Since the size of the table is not negative, the cast is sound.
.map(|bytes| bytes as u64)
.context("failed to extract file cache size from query result")
}
/// Attempt to set the file cache size, returning the size it was actually
/// set to.
#[tracing::instrument(skip_all, fields(%num_bytes))]
pub async fn set_file_cache_size(&mut self, num_bytes: u64) -> anyhow::Result<u64> {
let max_bytes = self
// The file cache GUC variable is in MiB, but the conversion with pg_size_bytes
// means that the end result we get is in bytes.
.query_with_retry(
"SELECT pg_size_bytes(current_setting('neon.max_file_cache_size'));",
&[],
)
.await
.context("failed to query pg for max file cache size")?
.first()
.ok_or_else(|| anyhow!("max file cache size query returned no rows"))?
.try_get::<_, i64>(0)
.map(|bytes| bytes as u64)
.context("failed to extract max file cache size from query result")?;
let max_mb = max_bytes / MiB;
let num_mb = u64::min(num_bytes, max_bytes) / MiB;
let capped = if num_bytes > max_bytes {
" (capped by maximum size)"
} else {
""
};
info!(
size = num_mb,
max = max_mb,
"updating file cache size {capped}",
);
// note: even though the normal ways to get the cache size produce values with trailing "MB"
// (hence why we call pg_size_bytes in `get_file_cache_size`'s query), the format
// it expects to set the value is "integer number of MB" without trailing units.
// For some reason, this *really* wasn't working with normal arguments, so that's
// why we're constructing the query here.
self.client
.query(
&format!("ALTER SYSTEM SET neon.file_cache_size_limit = {};", num_mb),
&[],
)
.await
.context("failed to change file cache size limit")?;
// must use pg_reload_conf to have the settings change take effect
self.client
.execute("SELECT pg_reload_conf();", &[])
.await
.context("failed to reload config")?;
Ok(num_mb * MiB)
}
}

View File

@@ -1,215 +0,0 @@
#![cfg(target_os = "linux")]
use anyhow::Context;
use axum::{
extract::{ws::WebSocket, State, WebSocketUpgrade},
response::Response,
};
use axum::{routing::get, Router, Server};
use clap::Parser;
use futures::Future;
use std::{fmt::Debug, time::Duration};
use sysinfo::{RefreshKind, System, SystemExt};
use tokio::{sync::broadcast, task::JoinHandle};
use tokio_util::sync::CancellationToken;
use tracing::{error, info};
use runner::Runner;
// Code that interfaces with agent
pub mod dispatcher;
pub mod protocol;
pub mod cgroup;
pub mod filecache;
pub mod runner;
/// The vm-monitor is an autoscaling component started by compute_ctl.
///
/// It carries out autoscaling decisions (upscaling/downscaling) and responds to
/// memory pressure by making requests to the autoscaler-agent.
#[derive(Debug, Parser)]
pub struct Args {
/// The name of the cgroup we should monitor for memory.high events. This
/// is the cgroup that postgres should be running in.
#[arg(short, long)]
pub cgroup: Option<String>,
/// The connection string for the Postgres file cache we should manage.
#[arg(short, long)]
pub pgconnstr: Option<String>,
/// Flag to signal that the Postgres file cache is on disk (i.e. not in memory aside from the
/// kernel's page cache), and therefore should not count against available memory.
//
// NB: Ideally this flag would directly refer to whether the file cache is in memory (rather
// than a roundabout way, via whether it's on disk), but in order to be backwards compatible
// during the switch away from an in-memory file cache, we had to default to the previous
// behavior.
#[arg(long)]
pub file_cache_on_disk: bool,
/// The address we should listen on for connection requests. For the
/// agent, this is 0.0.0.0:10301. For the informant, this is 127.0.0.1:10369.
#[arg(short, long)]
pub addr: String,
}
impl Args {
pub fn addr(&self) -> &str {
&self.addr
}
}
/// The number of bytes in one mebibyte.
#[allow(non_upper_case_globals)]
const MiB: u64 = 1 << 20;
/// Convert a quantity in bytes to a quantity in mebibytes, generally for display
/// purposes. (Most calculations in this crate use bytes directly)
pub fn bytes_to_mebibytes(bytes: u64) -> f32 {
(bytes as f32) / (MiB as f32)
}
pub fn get_total_system_memory() -> u64 {
System::new_with_specifics(RefreshKind::new().with_memory()).total_memory()
}
/// Global app state for the Axum server
#[derive(Debug, Clone)]
pub struct ServerState {
/// Used to close old connections.
///
/// When a new connection is made, we send a message signalling to the old
/// connection to close.
pub sender: broadcast::Sender<()>,
/// Used to cancel all spawned threads in the monitor.
pub token: CancellationToken,
// The CLI args
pub args: &'static Args,
}
/// Spawn a thread that may get cancelled by the provided [`CancellationToken`].
///
/// This is mainly meant to be called with futures that will be pending for a very
/// long time, or are not mean to return. If it is not desirable for the future to
/// ever resolve, such as in the case of [`cgroup::CgroupWatcher::watch`], the error can
/// be logged with `f`.
pub fn spawn_with_cancel<T, F>(
token: CancellationToken,
f: F,
future: T,
) -> JoinHandle<Option<T::Output>>
where
T: Future + Send + 'static,
T::Output: Send + 'static,
F: FnOnce(&T::Output) + Send + 'static,
{
tokio::spawn(async move {
tokio::select! {
_ = token.cancelled() => {
info!("received global kill signal");
None
}
res = future => {
f(&res);
Some(res)
}
}
})
}
/// The entrypoint to the binary.
///
/// Set up tracing, parse arguments, and start an http server.
pub async fn start(args: &'static Args, token: CancellationToken) -> anyhow::Result<()> {
// This channel is used to close old connections. When a new connection is
// made, we send a message signalling to the old connection to close.
let (sender, _) = tokio::sync::broadcast::channel::<()>(1);
let app = Router::new()
// This route gets upgraded to a websocket connection. We only support
// one connection at a time, which we enforce by killing old connections
// when we receive a new one.
.route("/monitor", get(ws_handler))
.with_state(ServerState {
sender,
token,
args,
});
let addr = args.addr();
let bound = Server::try_bind(&addr.parse().expect("parsing address should not fail"))
.with_context(|| format!("failed to bind to {addr}"))?;
info!(addr, "server bound");
bound
.serve(app.into_make_service())
.await
.context("server exited")?;
Ok(())
}
/// Handles incoming websocket connections.
///
/// If we are already to connected to an agent, we kill that old connection
/// and accept the new one.
#[tracing::instrument(name = "/monitor", skip_all, fields(?args))]
pub async fn ws_handler(
ws: WebSocketUpgrade,
State(ServerState {
sender,
token,
args,
}): State<ServerState>,
) -> Response {
// Kill the old monitor
info!("closing old connection if there is one");
let _ = sender.send(());
// Start the new one. Wow, the cycle of death and rebirth
let closer = sender.subscribe();
ws.on_upgrade(|ws| start_monitor(ws, args, closer, token))
}
/// Starts the monitor. If startup fails or the monitor exits, an error will
/// be logged and our internal state will be reset to allow for new connections.
#[tracing::instrument(skip_all, fields(?args))]
async fn start_monitor(
ws: WebSocket,
args: &Args,
kill: broadcast::Receiver<()>,
token: CancellationToken,
) {
info!("accepted new websocket connection -> starting monitor");
let timeout = Duration::from_secs(4);
let monitor = tokio::time::timeout(
timeout,
Runner::new(Default::default(), args, ws, kill, token),
)
.await;
let mut monitor = match monitor {
Ok(Ok(monitor)) => monitor,
Ok(Err(error)) => {
error!(?error, "failed to create monitor");
return;
}
Err(_) => {
error!(
?timeout,
"creating monitor timed out (probably waiting to receive protocol range)"
);
return;
}
};
info!("connected to agent");
match monitor.run().await {
Ok(()) => info!("monitor was killed due to new connection"),
Err(e) => error!(error = ?e, "monitor terminated unexpectedly"),
}
}

View File

@@ -1,241 +0,0 @@
//! Types representing protocols and actual agent-monitor messages.
//!
//! The pervasive use of serde modifiers throughout this module is to ease
//! serialization on the go side. Because go does not have enums (which model
//! messages well), it is harder to model messages, and we accomodate that with
//! serde.
//!
//! *Note*: the agent sends and receives messages in different ways.
//!
//! The agent serializes messages in the form and then sends them. The use
//! of `#[serde(tag = "type", content = "content")]` allows us to use `Type`
//! to determine how to deserialize `Content`.
//! ```ignore
//! struct {
//! Content any
//! Type string
//! Id uint64
//! }
//! ```
//! and receives messages in the form:
//! ```ignore
//! struct {
//! {fields embedded}
//! Type string
//! Id uint64
//! }
//! ```
//! After reading the type field, the agent will decode the entire message
//! again, this time into the correct type using the embedded fields.
//! Because the agent cannot just extract the json contained in a certain field
//! (it initially deserializes to `map[string]interface{}`), we keep the fields
//! at the top level, so the entire piece of json can be deserialized into a struct,
//! such as a `DownscaleResult`, with the `Type` and `Id` fields ignored.
use core::fmt;
use std::cmp;
use serde::{de::Error, Deserialize, Serialize};
/// A Message we send to the agent.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct OutboundMsg {
#[serde(flatten)]
pub(crate) inner: OutboundMsgKind,
pub(crate) id: usize,
}
impl OutboundMsg {
pub fn new(inner: OutboundMsgKind, id: usize) -> Self {
Self { inner, id }
}
}
/// The different underlying message types we can send to the agent.
#[derive(Serialize, Deserialize, Debug, Clone)]
#[serde(tag = "type")]
pub enum OutboundMsgKind {
/// Indicates that the agent sent an invalid message, i.e, we couldn't
/// properly deserialize it.
InvalidMessage { error: String },
/// Indicates that we experienced an internal error while processing a message.
/// For example, if a cgroup operation fails while trying to handle an upscale,
/// we return `InternalError`.
InternalError { error: String },
/// Returned to the agent once we have finished handling an upscale. If the
/// handling was unsuccessful, an `InternalError` will get returned instead.
/// *Note*: this is a struct variant because of the way go serializes struct{}
UpscaleConfirmation {},
/// Indicates to the monitor that we are urgently requesting resources.
/// *Note*: this is a struct variant because of the way go serializes struct{}
UpscaleRequest {},
/// Returned to the agent once we have finished attempting to downscale. If
/// an error occured trying to do so, an `InternalError` will get returned instead.
/// However, if we are simply unsuccessful (for example, do to needing the resources),
/// that gets included in the `DownscaleResult`.
DownscaleResult {
// FIXME for the future (once the informant is deprecated)
// As of the time of writing, the agent/informant version of this struct is
// called api.DownscaleResult. This struct has uppercase fields which are
// serialized as such. Thus, we serialize using uppercase names so we don't
// have to make a breaking change to the agent<->informant protocol. Once
// the informant has been superseded by the monitor, we can add the correct
// struct tags to api.DownscaleResult without causing a breaking change,
// since we don't need to support the agent<->informant protocol anymore.
#[serde(rename = "Ok")]
ok: bool,
#[serde(rename = "Status")]
status: String,
},
/// Part of the bidirectional heartbeat. The heartbeat is initiated by the
/// agent.
/// *Note*: this is a struct variant because of the way go serializes struct{}
HealthCheck {},
}
/// A message received form the agent.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct InboundMsg {
#[serde(flatten)]
pub(crate) inner: InboundMsgKind,
pub(crate) id: usize,
}
/// The different underlying message types we can receive from the agent.
#[derive(Serialize, Deserialize, Debug, Clone)]
#[serde(tag = "type", content = "content")]
pub enum InboundMsgKind {
/// Indicates that the we sent an invalid message, i.e, we couldn't
/// properly deserialize it.
InvalidMessage { error: String },
/// Indicates that the informan experienced an internal error while processing
/// a message. For example, if it failed to request upsacle from the agent, it
/// would return an `InternalError`.
InternalError { error: String },
/// Indicates to us that we have been granted more resources. We should respond
/// with an `UpscaleConfirmation` when done handling the resources (increasins
/// file cache size, cgorup memory limits).
UpscaleNotification { granted: Resources },
/// A request to reduce resource usage. We should response with a `DownscaleResult`,
/// when done.
DownscaleRequest { target: Resources },
/// Part of the bidirectional heartbeat. The heartbeat is initiated by the
/// agent.
/// *Note*: this is a struct variant because of the way go serializes struct{}
HealthCheck {},
}
/// Represents the resources granted to a VM.
#[derive(Serialize, Deserialize, Debug, Clone, Copy)]
// Renamed because the agent has multiple resources types:
// `Resources` (milliCPU/memory slots)
// `Allocation` (vCPU/bytes) <- what we correspond to
#[serde(rename(serialize = "Allocation", deserialize = "Allocation"))]
pub struct Resources {
/// Number of vCPUs
pub(crate) cpu: f64,
/// Bytes of memory
pub(crate) mem: u64,
}
impl Resources {
pub fn new(cpu: f64, mem: u64) -> Self {
Self { cpu, mem }
}
}
pub const PROTOCOL_MIN_VERSION: ProtocolVersion = ProtocolVersion::V1_0;
pub const PROTOCOL_MAX_VERSION: ProtocolVersion = ProtocolVersion::V1_0;
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Ord, Eq, Serialize, Deserialize)]
pub struct ProtocolVersion(u8);
impl ProtocolVersion {
/// Represents v1.0 of the agent<-> monitor protocol - the initial version
///
/// Currently the latest version.
const V1_0: ProtocolVersion = ProtocolVersion(1);
}
impl fmt::Display for ProtocolVersion {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match *self {
ProtocolVersion(0) => f.write_str("<invalid: zero>"),
ProtocolVersion::V1_0 => f.write_str("v1.0"),
other => write!(f, "<unknown: {other}>"),
}
}
}
/// A set of protocol bounds that determines what we are speaking.
///
/// These bounds are inclusive.
#[derive(Debug)]
pub struct ProtocolRange {
pub min: ProtocolVersion,
pub max: ProtocolVersion,
}
// Use a custom deserialize impl to ensure that `self.min <= self.max`
impl<'de> Deserialize<'de> for ProtocolRange {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
#[derive(Deserialize)]
struct InnerProtocolRange {
min: ProtocolVersion,
max: ProtocolVersion,
}
let InnerProtocolRange { min, max } = InnerProtocolRange::deserialize(deserializer)?;
if min > max {
Err(D::Error::custom(format!(
"min version = {min} is greater than max version = {max}",
)))
} else {
Ok(ProtocolRange { min, max })
}
}
}
impl fmt::Display for ProtocolRange {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
if self.min == self.max {
f.write_fmt(format_args!("{}", self.max))
} else {
f.write_fmt(format_args!("{} to {}", self.min, self.max))
}
}
}
impl ProtocolRange {
/// Find the highest shared version between two `ProtocolRange`'s
pub fn highest_shared_version(&self, other: &Self) -> anyhow::Result<ProtocolVersion> {
// We first have to make sure the ranges are overlapping. Once we know
// this, we can merge the ranges by taking the max of the mins and the
// mins of the maxes.
if self.min > other.max {
anyhow::bail!(
"Non-overlapping bounds: other.max = {} was less than self.min = {}",
other.max,
self.min,
)
} else if self.max < other.min {
anyhow::bail!(
"Non-overlappinng bounds: self.max = {} was less than other.min = {}",
self.max,
other.min
)
} else {
Ok(cmp::min(self.max, other.max))
}
}
}
/// We send this to the monitor after negotiating which protocol to use
#[derive(Serialize, Debug)]
#[serde(rename_all = "camelCase")]
pub enum ProtocolResponse {
Error(String),
Version(ProtocolVersion),
}

View File

@@ -1,460 +0,0 @@
//! Exposes the `Runner`, which handles messages received from agent and
//! sends upscale requests.
//!
//! This is the "Monitor" part of the monitor binary and is the main entrypoint for
//! all functionality.
use std::sync::Arc;
use std::{fmt::Debug, mem};
use anyhow::{bail, Context};
use axum::extract::ws::{Message, WebSocket};
use futures::StreamExt;
use tokio::sync::broadcast;
use tokio::sync::mpsc;
use tokio_util::sync::CancellationToken;
use tracing::{error, info, warn};
use crate::cgroup::{CgroupWatcher, MemoryLimits, Sequenced};
use crate::dispatcher::Dispatcher;
use crate::filecache::{FileCacheConfig, FileCacheState};
use crate::protocol::{InboundMsg, InboundMsgKind, OutboundMsg, OutboundMsgKind, Resources};
use crate::{bytes_to_mebibytes, get_total_system_memory, spawn_with_cancel, Args, MiB};
/// Central struct that interacts with agent, dispatcher, and cgroup to handle
/// signals from the agent.
#[derive(Debug)]
pub struct Runner {
config: Config,
filecache: Option<FileCacheState>,
cgroup: Option<Arc<CgroupWatcher>>,
dispatcher: Dispatcher,
/// We "mint" new message ids by incrementing this counter and taking the value.
///
/// **Note**: This counter is always odd, so that we avoid collisions between the IDs generated
/// by us vs the autoscaler-agent.
counter: usize,
/// A signal to kill the main thread produced by `self.run()`. This is triggered
/// when the server receives a new connection. When the thread receives the
/// signal off this channel, it will gracefully shutdown.
kill: broadcast::Receiver<()>,
}
/// Configuration for a `Runner`
#[derive(Debug)]
pub struct Config {
/// `sys_buffer_bytes` gives the estimated amount of memory, in bytes, that the kernel uses before
/// handing out the rest to userspace. This value is the estimated difference between the
/// *actual* physical memory and the amount reported by `grep MemTotal /proc/meminfo`.
///
/// For more information, refer to `man 5 proc`, which defines MemTotal as "Total usable RAM
/// (i.e., physical RAM minus a few reserved bits and the kernel binary code)".
///
/// We only use `sys_buffer_bytes` when calculating the system memory from the *external* memory
/// size, rather than the self-reported memory size, according to the kernel.
///
/// TODO: this field is only necessary while we still have to trust the autoscaler-agent's
/// upscale resource amounts (because we might not *actually* have been upscaled yet). This field
/// should be removed once we have a better solution there.
sys_buffer_bytes: u64,
}
impl Default for Config {
fn default() -> Self {
Self {
sys_buffer_bytes: 100 * MiB,
}
}
}
impl Runner {
/// Create a new monitor.
#[tracing::instrument(skip_all, fields(?config, ?args))]
pub async fn new(
config: Config,
args: &Args,
ws: WebSocket,
kill: broadcast::Receiver<()>,
token: CancellationToken,
) -> anyhow::Result<Runner> {
anyhow::ensure!(
config.sys_buffer_bytes != 0,
"invalid monitor Config: sys_buffer_bytes cannot be 0"
);
// *NOTE*: the dispatcher and cgroup manager talk through these channels
// so make sure they each get the correct half, nothing is droppped, etc.
let (notified_send, notified_recv) = mpsc::channel(1);
let (requesting_send, requesting_recv) = mpsc::channel(1);
let dispatcher = Dispatcher::new(ws, notified_send, requesting_recv)
.await
.context("error creating new dispatcher")?;
let mut state = Runner {
config,
filecache: None,
cgroup: None,
dispatcher,
counter: 1, // NB: must be odd, see the comment about the field for more.
kill,
};
let mut file_cache_reserved_bytes = 0;
let mem = get_total_system_memory();
// We need to process file cache initialization before cgroup initialization, so that the memory
// allocated to the file cache is appropriately taken into account when we decide the cgroup's
// memory limits.
if let Some(connstr) = &args.pgconnstr {
info!("initializing file cache");
let config = match args.file_cache_on_disk {
true => FileCacheConfig::default_on_disk(),
false => FileCacheConfig::default_in_memory(),
};
let mut file_cache = FileCacheState::new(connstr, config, token.clone())
.await
.context("failed to create file cache")?;
let size = file_cache
.get_file_cache_size()
.await
.context("error getting file cache size")?;
let new_size = file_cache.config.calculate_cache_size(mem);
info!(
initial = bytes_to_mebibytes(size),
new = bytes_to_mebibytes(new_size),
"setting initial file cache size",
);
// note: even if size == new_size, we want to explicitly set it, just
// to make sure that we have the permissions to do so
let actual_size = file_cache
.set_file_cache_size(new_size)
.await
.context("failed to set file cache size, possibly due to inadequate permissions")?;
if actual_size != new_size {
info!("file cache size actually got set to {actual_size}")
}
// Mark the resources given to the file cache as reserved, but only if it's in memory.
if !args.file_cache_on_disk {
file_cache_reserved_bytes = actual_size;
}
state.filecache = Some(file_cache);
}
if let Some(name) = &args.cgroup {
let (mut cgroup, cgroup_event_stream) =
CgroupWatcher::new(name.clone(), requesting_send)
.context("failed to create cgroup manager")?;
let available = mem - file_cache_reserved_bytes;
cgroup
.set_memory_limits(available)
.context("failed to set cgroup memory limits")?;
let cgroup = Arc::new(cgroup);
// Some might call this . . . cgroup v2
let cgroup_clone = Arc::clone(&cgroup);
spawn_with_cancel(token, |_| error!("cgroup watcher terminated"), async move {
cgroup_clone.watch(notified_recv, cgroup_event_stream).await
});
state.cgroup = Some(cgroup);
} else {
// *NOTE*: We need to forget the sender so that its drop impl does not get ran.
// This allows us to poll it in `Monitor::run` regardless of whether we
// are managing a cgroup or not. If we don't forget it, all receives will
// immediately return an error because the sender is droped and it will
// claim all select! statements, effectively turning `Monitor::run` into
// `loop { fail to receive }`.
mem::forget(requesting_send);
}
Ok(state)
}
/// Attempt to downscale filecache + cgroup
#[tracing::instrument(skip_all, fields(?target))]
pub async fn try_downscale(&mut self, target: Resources) -> anyhow::Result<(bool, String)> {
// Nothing to adjust
if self.cgroup.is_none() && self.filecache.is_none() {
info!("no action needed for downscale (no cgroup or file cache enabled)");
return Ok((
true,
"monitor is not managing cgroup or file cache".to_string(),
));
}
let requested_mem = target.mem;
let usable_system_memory = requested_mem.saturating_sub(self.config.sys_buffer_bytes);
let expected_file_cache_mem_usage = self
.filecache
.as_ref()
.map(|file_cache| file_cache.config.calculate_cache_size(usable_system_memory))
.unwrap_or(0);
let mut new_cgroup_mem_high = 0;
if let Some(cgroup) = &self.cgroup {
new_cgroup_mem_high = cgroup
.config
.calculate_memory_high_value(usable_system_memory - expected_file_cache_mem_usage);
let current = cgroup
.current_memory_usage()
.context("failed to fetch cgroup memory")?;
if new_cgroup_mem_high < current + cgroup.config.memory_high_buffer_bytes {
let status = format!(
"{}: {} MiB (new high) < {} (current usage) + {} (buffer)",
"calculated memory.high too low",
bytes_to_mebibytes(new_cgroup_mem_high),
bytes_to_mebibytes(current),
bytes_to_mebibytes(cgroup.config.memory_high_buffer_bytes)
);
info!(status, "discontinuing downscale");
return Ok((false, status));
}
}
// The downscaling has been approved. Downscale the file cache, then the cgroup.
let mut status = vec![];
let mut file_cache_mem_usage = 0;
if let Some(file_cache) = &mut self.filecache {
let actual_usage = file_cache
.set_file_cache_size(expected_file_cache_mem_usage)
.await
.context("failed to set file cache size")?;
if file_cache.config.in_memory {
file_cache_mem_usage = actual_usage;
}
let message = format!(
"set file cache size to {} MiB (in memory = {})",
bytes_to_mebibytes(actual_usage),
file_cache.config.in_memory,
);
info!("downscale: {message}");
status.push(message);
}
if let Some(cgroup) = &self.cgroup {
let available_memory = usable_system_memory - file_cache_mem_usage;
if file_cache_mem_usage != expected_file_cache_mem_usage {
new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
}
let limits = MemoryLimits::new(
// new_cgroup_mem_high is initialized to 0 but it is guarancontextd to not be here
// since it is properly initialized in the previous cgroup if let block
new_cgroup_mem_high,
available_memory,
);
cgroup
.set_limits(&limits)
.context("failed to set cgroup memory limits")?;
let message = format!(
"set cgroup memory.high to {} MiB, of new max {} MiB",
bytes_to_mebibytes(new_cgroup_mem_high),
bytes_to_mebibytes(available_memory)
);
info!("downscale: {message}");
status.push(message);
}
// TODO: make this status thing less jank
let status = status.join("; ");
Ok((true, status))
}
/// Handle new resources
#[tracing::instrument(skip_all, fields(?resources))]
pub async fn handle_upscale(&mut self, resources: Resources) -> anyhow::Result<()> {
if self.filecache.is_none() && self.cgroup.is_none() {
info!("no action needed for upscale (no cgroup or file cache enabled)");
return Ok(());
}
let new_mem = resources.mem;
let usable_system_memory = new_mem.saturating_sub(self.config.sys_buffer_bytes);
// Get the file cache's expected contribution to the memory usage
let mut file_cache_mem_usage = 0;
if let Some(file_cache) = &mut self.filecache {
let expected_usage = file_cache.config.calculate_cache_size(usable_system_memory);
info!(
target = bytes_to_mebibytes(expected_usage),
total = bytes_to_mebibytes(new_mem),
"updating file cache size",
);
let actual_usage = file_cache
.set_file_cache_size(expected_usage)
.await
.context("failed to set file cache size")?;
if file_cache.config.in_memory {
file_cache_mem_usage = actual_usage;
}
if actual_usage != expected_usage {
warn!(
"file cache was set to a different size that we wanted: target = {} Mib, actual= {} Mib",
bytes_to_mebibytes(expected_usage),
bytes_to_mebibytes(actual_usage)
)
}
}
if let Some(cgroup) = &self.cgroup {
let available_memory = usable_system_memory - file_cache_mem_usage;
let new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
info!(
target = bytes_to_mebibytes(new_cgroup_mem_high),
total = bytes_to_mebibytes(new_mem),
name = cgroup.path(),
"updating cgroup memory.high",
);
let limits = MemoryLimits::new(new_cgroup_mem_high, available_memory);
cgroup
.set_limits(&limits)
.context("failed to set file cache size")?;
}
Ok(())
}
/// Take in a message and perform some action, such as downscaling or upscaling,
/// and return a message to be send back.
#[tracing::instrument(skip_all, fields(%id, message = ?inner))]
pub async fn process_message(
&mut self,
InboundMsg { inner, id }: InboundMsg,
) -> anyhow::Result<Option<OutboundMsg>> {
match inner {
InboundMsgKind::UpscaleNotification { granted } => {
self.handle_upscale(granted)
.await
.context("failed to handle upscale")?;
self.dispatcher
.notify_upscale(Sequenced::new(granted))
.await
.context("failed to notify notify cgroup of upscale")?;
Ok(Some(OutboundMsg::new(
OutboundMsgKind::UpscaleConfirmation {},
id,
)))
}
InboundMsgKind::DownscaleRequest { target } => self
.try_downscale(target)
.await
.context("failed to downscale")
.map(|(ok, status)| {
Some(OutboundMsg::new(
OutboundMsgKind::DownscaleResult { ok, status },
id,
))
}),
InboundMsgKind::InvalidMessage { error } => {
warn!(
%error, id, "received notification of an invalid message we sent"
);
Ok(None)
}
InboundMsgKind::InternalError { error } => {
warn!(error, id, "agent experienced an internal error");
Ok(None)
}
InboundMsgKind::HealthCheck {} => {
Ok(Some(OutboundMsg::new(OutboundMsgKind::HealthCheck {}, id)))
}
}
}
// TODO: don't propagate errors, probably just warn!?
#[tracing::instrument(skip_all)]
pub async fn run(&mut self) -> anyhow::Result<()> {
info!("starting dispatcher");
loop {
tokio::select! {
signal = self.kill.recv() => {
match signal {
Ok(()) => return Ok(()),
Err(e) => bail!("failed to receive kill signal: {e}")
}
}
// we need to propagate an upscale request
request = self.dispatcher.request_upscale_events.recv() => {
if request.is_none() {
bail!("failed to listen for upscale event from cgroup")
}
info!("cgroup asking for upscale; forwarding request");
self.counter += 2; // Increment, preserving parity (i.e. keep the
// counter odd). See the field comment for more.
self.dispatcher
.send(OutboundMsg::new(OutboundMsgKind::UpscaleRequest {}, self.counter))
.await
.context("failed to send message")?;
}
// there is a message from the agent
msg = self.dispatcher.source.next() => {
if let Some(msg) = msg {
// Don't use 'message' as a key as the string also uses
// that for its key
info!(?msg, "received message");
match msg {
Ok(msg) => {
let message: InboundMsg = match msg {
Message::Text(text) => {
serde_json::from_str(&text).context("failed to deserialize text message")?
}
other => {
warn!(
// Don't use 'message' as a key as the
// string also uses that for its key
msg = ?other,
"agent should only send text messages but received different type"
);
continue
},
};
let out = match self.process_message(message.clone()).await {
Ok(Some(out)) => out,
Ok(None) => continue,
Err(e) => {
let error = e.to_string();
warn!(?error, "error handling message");
OutboundMsg::new(
OutboundMsgKind::InternalError {
error
},
message.id
)
}
};
self.dispatcher
.send(out)
.await
.context("failed to send message")?;
}
Err(e) => warn!("{e}"),
}
} else {
anyhow::bail!("dispatcher connection closed")
}
}
}
}
}
}

View File

@@ -51,7 +51,6 @@ serde.workspace = true
serde_json = { workspace = true, features = ["raw_value"] }
serde_with.workspace = true
signal-hook.workspace = true
smallvec = { workspace = true, features = ["write"] }
svg_fmt.workspace = true
sync_wrapper.workspace = true
tokio-tar.workspace = true

View File

@@ -215,6 +215,7 @@ fn bench_sequential(c: &mut Criterion) {
TimelineId::generate(),
zero.add(10 * i32)..zero.add(10 * i32 + 1),
Lsn(i),
false,
0,
);
updates.insert_historic(layer);

View File

@@ -10,7 +10,7 @@ use std::{fs, path::Path, str};
use pageserver::page_cache::PAGE_SZ;
use pageserver::repository::{Key, KEY_SIZE};
use pageserver::tenant::block_io::FileBlockReader;
use pageserver::tenant::block_io::{BlockReader, FileBlockReader};
use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection};
use pageserver::tenant::storage_layer::delta_layer::{Summary, DELTA_KEY_SIZE};
use pageserver::tenant::storage_layer::range_overlaps;
@@ -97,7 +97,7 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
// Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH"
async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
let file = FileBlockReader::new(VirtualFile::open(path)?);
let summary_blk = file.read_blk(0).await?;
let summary_blk = file.read_blk(0)?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
actual_summary.index_start_blk,

View File

@@ -44,11 +44,13 @@ pub(crate) enum LayerCmd {
}
async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
use pageserver::tenant::block_io::BlockReader;
let path = path.as_ref();
virtual_file::init(10);
page_cache::init(100);
let file = FileBlockReader::new(VirtualFile::open(path)?);
let summary_blk = file.read_blk(0).await?;
let summary_blk = file.read_blk(0)?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
actual_summary.index_start_blk,
@@ -68,7 +70,7 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
},
)
.await?;
let cursor = BlockCursor::new_fileblockreader_virtual(&file);
let cursor = BlockCursor::new(&file);
for (k, v) in all {
let value = cursor.read_blob(v.pos()).await?;
println!("key:{} value_len:{}", k, value.len());

View File

@@ -6,12 +6,11 @@ use std::{env, ops::ControlFlow, path::Path, str::FromStr};
use anyhow::{anyhow, Context};
use clap::{Arg, ArgAction, Command};
use fail::FailScenario;
use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
use pageserver::tenant::TenantSharedResources;
use remote_storage::GenericRemoteStorage;
use tokio::time::Instant;
use tracing::*;
@@ -122,7 +121,7 @@ fn main() -> anyhow::Result<()> {
}
// Initialize up failpoints support
let scenario = pageserver::failpoint_support::init();
let scenario = FailScenario::setup();
// Basic initialization of things that don't change after startup
virtual_file::init(conf.max_file_descriptors);
@@ -383,10 +382,8 @@ fn start_pageserver(
BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
conf,
TenantSharedResources {
broker_client: broker_client.clone(),
remote_storage: remote_storage.clone(),
},
broker_client.clone(),
remote_storage.clone(),
order,
))?;

View File

@@ -1,86 +0,0 @@
/// use with fail::cfg("$name", "return(2000)")
///
/// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
/// specified time (in milliseconds). The main difference is that we use async
/// tokio sleep function. Another difference is that we print lines to the log,
/// which can be useful in tests to check that the failpoint was hit.
#[macro_export]
macro_rules! __failpoint_sleep_millis_async {
($name:literal) => {{
// If the failpoint is used with a "return" action, set should_sleep to the
// returned value (as string). Otherwise it's set to None.
let should_sleep = (|| {
::fail::fail_point!($name, |x| x);
::std::option::Option::None
})();
// Sleep if the action was a returned value
if let ::std::option::Option::Some(duration_str) = should_sleep {
$crate::failpoint_support::failpoint_sleep_helper($name, duration_str).await
}
}};
}
pub use __failpoint_sleep_millis_async as sleep_millis_async;
// Helper function used by the macro. (A function has nicer scoping so we
// don't need to decorate everything with "::")
#[doc(hidden)]
pub(crate) async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
let millis = duration_str.parse::<u64>().unwrap();
let d = std::time::Duration::from_millis(millis);
tracing::info!("failpoint {:?}: sleeping for {:?}", name, d);
tokio::time::sleep(d).await;
tracing::info!("failpoint {:?}: sleep done", name);
}
pub fn init() -> fail::FailScenario<'static> {
// The failpoints lib provides support for parsing the `FAILPOINTS` env var.
// We want non-default behavior for `exit`, though, so, we handle it separately.
//
// Format for FAILPOINTS is "name=actions" separated by ";".
let actions = std::env::var("FAILPOINTS");
if actions.is_ok() {
std::env::remove_var("FAILPOINTS");
} else {
// let the library handle non-utf8, or nothing for not present
}
let scenario = fail::FailScenario::setup();
if let Ok(val) = actions {
val.split(';')
.enumerate()
.map(|(i, s)| s.split_once('=').ok_or((i, s)))
.for_each(|res| {
let (name, actions) = match res {
Ok(t) => t,
Err((i, s)) => {
panic!(
"startup failpoints: missing action on the {}th failpoint; try `{s}=return`",
i + 1,
);
}
};
if let Err(e) = apply_failpoint(name, actions) {
panic!("startup failpoints: failed to apply failpoint {name}={actions}: {e}");
}
});
}
scenario
}
pub(crate) fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> {
if actions == "exit" {
fail::cfg_callback(name, exit_failpoint)
} else {
fail::cfg(name, actions)
}
}
#[inline(never)]
fn exit_failpoint() {
tracing::info!("Exit requested by failpoint");
std::process::exit(1);
}

View File

@@ -517,6 +517,7 @@ async fn timeline_delete_handler(
.instrument(info_span!("timeline_delete", %tenant_id, %timeline_id))
.await?;
// FIXME: needs to be an error for console to retry it. Ideally Accepted should be used and retried until 404.
json_response(StatusCode::ACCEPTED, ())
}
@@ -979,7 +980,14 @@ async fn failpoints_handler(
// We recognize one extra "action" that's not natively recognized
// by the failpoints crate: exit, to immediately kill the process
let cfg_result = crate::failpoint_support::apply_failpoint(&fp.name, &fp.actions);
let cfg_result = if fp.actions == "exit" {
fail::cfg_callback(fp.name, || {
info!("Exit requested by failpoint");
std::process::exit(1);
})
} else {
fail::cfg(fp.name, &fp.actions)
};
if let Err(err_msg) = cfg_result {
return Err(ApiError::BadRequest(anyhow!(

View File

@@ -21,8 +21,6 @@ pub mod walingest;
pub mod walrecord;
pub mod walredo;
pub mod failpoint_support;
use std::path::Path;
use crate::task_mgr::TaskKind;

View File

@@ -6,7 +6,7 @@ use metrics::{
HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
};
use once_cell::sync::Lazy;
use strum::{EnumCount, IntoEnumIterator, VariantNames};
use strum::VariantNames;
use strum_macros::{EnumVariantNames, IntoStaticStr};
use utils::id::{TenantId, TimelineId};
@@ -570,160 +570,23 @@ pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
#[derive(Debug)]
struct GlobalAndPerTimelineHistogram {
global: Histogram,
per_tenant_timeline: Histogram,
}
const SMGR_QUERY_TIME_OPERATIONS: &[&str] = &[
"get_rel_exists",
"get_rel_size",
"get_page_at_lsn",
"get_db_size",
];
impl GlobalAndPerTimelineHistogram {
fn observe(&self, value: f64) {
self.global.observe(value);
self.per_tenant_timeline.observe(value);
}
}
struct GlobalAndPerTimelineHistogramTimer<'a> {
h: &'a GlobalAndPerTimelineHistogram,
start: std::time::Instant,
}
impl<'a> Drop for GlobalAndPerTimelineHistogramTimer<'a> {
fn drop(&mut self) {
let elapsed = self.start.elapsed();
self.h.observe(elapsed.as_secs_f64());
}
}
#[derive(
Debug,
Clone,
Copy,
IntoStaticStr,
strum_macros::EnumCount,
strum_macros::EnumIter,
strum_macros::FromRepr,
)]
#[strum(serialize_all = "snake_case")]
pub enum SmgrQueryType {
GetRelExists,
GetRelSize,
GetPageAtLsn,
GetDbSize,
}
#[derive(Debug)]
pub struct SmgrQueryTimePerTimeline {
metrics: [GlobalAndPerTimelineHistogram; SmgrQueryType::COUNT],
}
static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
pub static SMGR_QUERY_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_smgr_query_seconds",
"Time spent on smgr query handling, aggegated by query type and tenant/timeline.",
"Time spent on smgr query handling",
&["smgr_query_type", "tenant_id", "timeline_id"],
CRITICAL_OP_BUCKETS.into(),
)
.expect("failed to define a metric")
});
static SMGR_QUERY_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_smgr_query_seconds_global",
"Time spent on smgr query handling, aggregated by query type.",
&["smgr_query_type"],
CRITICAL_OP_BUCKETS.into(),
)
.expect("failed to define a metric")
});
impl SmgrQueryTimePerTimeline {
pub(crate) fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
let tenant_id = tenant_id.to_string();
let timeline_id = timeline_id.to_string();
let metrics = std::array::from_fn(|i| {
let op = SmgrQueryType::from_repr(i).unwrap();
let global = SMGR_QUERY_TIME_GLOBAL
.get_metric_with_label_values(&[op.into()])
.unwrap();
let per_tenant_timeline = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
.get_metric_with_label_values(&[op.into(), &tenant_id, &timeline_id])
.unwrap();
GlobalAndPerTimelineHistogram {
global,
per_tenant_timeline,
}
});
Self { metrics }
}
pub(crate) fn start_timer(&self, op: SmgrQueryType) -> impl Drop + '_ {
let metric = &self.metrics[op as usize];
GlobalAndPerTimelineHistogramTimer {
h: metric,
start: std::time::Instant::now(),
}
}
}
#[cfg(test)]
mod smgr_query_time_tests {
use strum::IntoEnumIterator;
use utils::id::{TenantId, TimelineId};
// Regression test, we used hard-coded string constants before using an enum.
#[test]
fn op_label_name() {
use super::SmgrQueryType::*;
let expect: [(super::SmgrQueryType, &'static str); 4] = [
(GetRelExists, "get_rel_exists"),
(GetRelSize, "get_rel_size"),
(GetPageAtLsn, "get_page_at_lsn"),
(GetDbSize, "get_db_size"),
];
for (op, expect) in expect {
let actual: &'static str = op.into();
assert_eq!(actual, expect);
}
}
#[test]
fn basic() {
let ops: Vec<_> = super::SmgrQueryType::iter().collect();
for op in &ops {
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::generate();
let metrics = super::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);
let get_counts = || {
let global: u64 = ops
.iter()
.map(|op| metrics.metrics[*op as usize].global.get_sample_count())
.sum();
let per_tenant_timeline: u64 = ops
.iter()
.map(|op| {
metrics.metrics[*op as usize]
.per_tenant_timeline
.get_sample_count()
})
.sum();
(global, per_tenant_timeline)
};
let (pre_global, pre_per_tenant_timeline) = get_counts();
assert_eq!(pre_per_tenant_timeline, 0);
let timer = metrics.start_timer(*op);
drop(timer);
let (post_global, post_per_tenant_timeline) = get_counts();
assert_eq!(post_per_tenant_timeline, 1);
assert!(post_global > pre_global);
}
}
}
// keep in sync with control plane Go code so that we can validate
// compute's basebackup_ms metric with our perspective in the context of SLI/SLO.
static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
@@ -1182,12 +1045,6 @@ impl Drop for TimelineMetrics {
.write()
.unwrap()
.remove(tenant_id, timeline_id);
// The following metrics are born outside of the TimelineMetrics lifecycle but still
// removed at the end of it. The idea is to have the metrics outlive the
// entity during which they're observed, e.g., the smgr metrics shall
// outlive an individual smgr connection, but not the timeline.
for op in StorageTimeOperation::VARIANTS {
let _ =
STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
@@ -1199,12 +1056,8 @@ impl Drop for TimelineMetrics {
let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, timeline_id]);
}
for op in SmgrQueryType::iter() {
let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
op.into(),
tenant_id,
timeline_id,
]);
for op in SMGR_QUERY_TIME_OPERATIONS {
let _ = SMGR_QUERY_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
}
}
}

View File

@@ -10,42 +10,6 @@
//! PostgreSQL buffer size, and a Slot struct for each buffer to contain
//! information about what's stored in the buffer.
//!
//! # Types Of Pages
//!
//! [`PageCache`] only supports immutable pages.
//! Hence there is no need to worry about coherency.
//!
//! Two types of pages are supported:
//!
//! * **Materialized pages**, filled & used by page reconstruction
//! * **Immutable File pages**, filled & used by [`crate::tenant::block_io`] and [`crate::tenant::ephemeral_file`].
//!
//! Note that [`crate::tenant::ephemeral_file::EphemeralFile`] is generally mutable, but, it's append-only.
//! It uses the page cache only for the blocks that are already fully written and immutable.
//!
//! # Filling The Page Cache
//!
//! Page cache maps from a cache key to a buffer slot.
//! The cache key uniquely identifies the piece of data that is being cached.
//!
//! The cache key for **materialized pages** is [`TenantId`], [`TimelineId`], [`Key`], and [`Lsn`].
//! Use [`PageCache::memorize_materialized_page`] and [`PageCache::lookup_materialized_page`] for fill & access.
//!
//! The cache key for **immutable file** pages is [`FileId`] and a block number.
//! Users of page cache that wish to page-cache an arbitrary (immutable!) on-disk file do the following:
//! * Have a mechanism to deterministically associate the on-disk file with a [`FileId`].
//! * Get a [`FileId`] using [`next_file_id`].
//! * Use the mechanism to associate the on-disk file with the returned [`FileId`].
//! * Use [`PageCache::read_immutable_buf`] to get a [`ReadBufResult`].
//! * If the page was already cached, it'll be the [`ReadBufResult::Found`] variant that contains
//! a read guard for the page. Just use it.
//! * If the page was not cached, it'll be the [`ReadBufResult::NotFound`] variant that contains
//! a write guard for the page. Fill the page with the contents of the on-disk file.
//! Then call [`PageWriteGuard::mark_valid`] to mark the page as valid.
//! Then try again to [`PageCache::read_immutable_buf`].
//! Unless there's high cache pressure, the page should now be cached.
//! (TODO: allow downgrading the write guard to a read guard to ensure forward progress.)
//!
//! # Locking
//!
//! There are two levels of locking involved: There's one lock for the "mapping"
@@ -75,16 +39,21 @@
use std::{
collections::{hash_map::Entry, HashMap},
convert::TryInto,
sync::atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
sync::{
atomic::{AtomicU8, AtomicUsize, Ordering},
RwLock, RwLockReadGuard, RwLockWriteGuard, TryLockError,
},
};
use anyhow::Context;
use once_cell::sync::OnceCell;
use tracing::error;
use utils::{
id::{TenantId, TimelineId},
lsn::Lsn,
};
use crate::tenant::{block_io, ephemeral_file, writeback_ephemeral_file};
use crate::{metrics::PageCacheSizeMetrics, repository::Key};
static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
@@ -118,17 +87,6 @@ pub fn get() -> &'static PageCache {
pub const PAGE_SZ: usize = postgres_ffi::BLCKSZ as usize;
const MAX_USAGE_COUNT: u8 = 5;
/// See module-level comment.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub struct FileId(u64);
static NEXT_ID: AtomicU64 = AtomicU64::new(1);
/// See module-level comment.
pub fn next_file_id() -> FileId {
FileId(NEXT_ID.fetch_add(1, Ordering::Relaxed))
}
///
/// CacheKey uniquely identifies a "thing" to cache in the page cache.
///
@@ -139,8 +97,12 @@ enum CacheKey {
hash_key: MaterializedPageHashKey,
lsn: Lsn,
},
EphemeralPage {
file_id: ephemeral_file::FileId,
blkno: u32,
},
ImmutableFilePage {
file_id: FileId,
file_id: block_io::FileId,
blkno: u32,
},
}
@@ -159,13 +121,14 @@ struct Version {
}
struct Slot {
inner: tokio::sync::RwLock<SlotInner>,
inner: RwLock<SlotInner>,
usage_count: AtomicU8,
}
struct SlotInner {
key: Option<CacheKey>,
buf: &'static mut [u8; PAGE_SZ],
dirty: bool,
}
impl Slot {
@@ -200,11 +163,6 @@ impl Slot {
Err(usage_count) => usage_count,
}
}
/// Sets the usage count to a specific value.
fn set_usage_count(&self, count: u8) {
self.usage_count.store(count, Ordering::Relaxed);
}
}
pub struct PageCache {
@@ -217,9 +175,11 @@ pub struct PageCache {
///
/// If you add support for caching different kinds of objects, each object kind
/// can have a separate mapping map, next to this field.
materialized_page_map: std::sync::RwLock<HashMap<MaterializedPageHashKey, Vec<Version>>>,
materialized_page_map: RwLock<HashMap<MaterializedPageHashKey, Vec<Version>>>,
immutable_page_map: std::sync::RwLock<HashMap<(FileId, u32), usize>>,
ephemeral_page_map: RwLock<HashMap<(ephemeral_file::FileId, u32), usize>>,
immutable_page_map: RwLock<HashMap<(block_io::FileId, u32), usize>>,
/// The actual buffers with their metadata.
slots: Box<[Slot]>,
@@ -235,7 +195,7 @@ pub struct PageCache {
/// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked
/// until the guard is dropped.
///
pub struct PageReadGuard<'i>(tokio::sync::RwLockReadGuard<'i, SlotInner>);
pub struct PageReadGuard<'i>(RwLockReadGuard<'i, SlotInner>);
impl std::ops::Deref for PageReadGuard<'_> {
type Target = [u8; PAGE_SZ];
@@ -262,10 +222,9 @@ impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
/// to initialize.
///
pub struct PageWriteGuard<'i> {
inner: tokio::sync::RwLockWriteGuard<'i, SlotInner>,
inner: RwLockWriteGuard<'i, SlotInner>,
// Are the page contents currently valid?
// Used to mark pages as invalid that are assigned but not yet filled with data.
valid: bool,
}
@@ -299,6 +258,14 @@ impl PageWriteGuard<'_> {
);
self.valid = true;
}
pub fn mark_dirty(&mut self) {
// only ephemeral pages can be dirty ATM.
assert!(matches!(
self.inner.key,
Some(CacheKey::EphemeralPage { .. })
));
self.inner.dirty = true;
}
}
impl Drop for PageWriteGuard<'_> {
@@ -313,6 +280,7 @@ impl Drop for PageWriteGuard<'_> {
let self_key = self.inner.key.as_ref().unwrap();
PAGE_CACHE.get().unwrap().remove_mapping(self_key);
self.inner.key = None;
self.inner.dirty = false;
}
}
}
@@ -340,7 +308,7 @@ impl PageCache {
/// The 'lsn' is an upper bound, this will return the latest version of
/// the given block, but not newer than 'lsn'. Returns the actual LSN of the
/// returned page.
pub async fn lookup_materialized_page(
pub fn lookup_materialized_page(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
@@ -360,7 +328,7 @@ impl PageCache {
lsn,
};
if let Some(guard) = self.try_lock_for_read(&mut cache_key).await {
if let Some(guard) = self.try_lock_for_read(&mut cache_key) {
if let CacheKey::MaterializedPage {
hash_key: _,
lsn: available_lsn,
@@ -387,7 +355,7 @@ impl PageCache {
///
/// Store an image of the given page in the cache.
///
pub async fn memorize_materialized_page(
pub fn memorize_materialized_page(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
@@ -404,7 +372,7 @@ impl PageCache {
lsn,
};
match self.lock_for_write(&cache_key).await? {
match self.lock_for_write(&cache_key)? {
WriteBufResult::Found(write_guard) => {
// We already had it in cache. Another thread must've put it there
// concurrently. Check that it had the same contents that we
@@ -420,16 +388,80 @@ impl PageCache {
Ok(())
}
// Section 1.2: Public interface functions for working with immutable file pages.
// Section 1.2: Public interface functions for working with Ephemeral pages.
pub async fn read_immutable_buf(
pub fn read_ephemeral_buf(
&self,
file_id: FileId,
file_id: ephemeral_file::FileId,
blkno: u32,
) -> anyhow::Result<ReadBufResult> {
let mut cache_key = CacheKey::EphemeralPage { file_id, blkno };
self.lock_for_read(&mut cache_key)
}
pub fn write_ephemeral_buf(
&self,
file_id: ephemeral_file::FileId,
blkno: u32,
) -> anyhow::Result<WriteBufResult> {
let cache_key = CacheKey::EphemeralPage { file_id, blkno };
self.lock_for_write(&cache_key)
}
/// Immediately drop all buffers belonging to given file, without writeback
pub fn drop_buffers_for_ephemeral(&self, drop_file_id: ephemeral_file::FileId) {
for slot_idx in 0..self.slots.len() {
let slot = &self.slots[slot_idx];
let mut inner = slot.inner.write().unwrap();
if let Some(key) = &inner.key {
match key {
CacheKey::EphemeralPage { file_id, blkno: _ } if *file_id == drop_file_id => {
// remove mapping for old buffer
self.remove_mapping(key);
inner.key = None;
inner.dirty = false;
}
_ => {}
}
}
}
}
// Section 1.3: Public interface functions for working with immutable file pages.
pub fn read_immutable_buf(
&self,
file_id: block_io::FileId,
blkno: u32,
) -> anyhow::Result<ReadBufResult> {
let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno };
self.lock_for_read(&mut cache_key).await
self.lock_for_read(&mut cache_key)
}
/// Immediately drop all buffers belonging to given file, without writeback
pub fn drop_buffers_for_immutable(&self, drop_file_id: block_io::FileId) {
for slot_idx in 0..self.slots.len() {
let slot = &self.slots[slot_idx];
let mut inner = slot.inner.write().unwrap();
if let Some(key) = &inner.key {
match key {
CacheKey::ImmutableFilePage { file_id, blkno: _ }
if *file_id == drop_file_id =>
{
// remove mapping for old buffer
self.remove_mapping(key);
inner.key = None;
inner.dirty = false;
}
_ => {}
}
}
}
}
//
@@ -449,14 +481,14 @@ impl PageCache {
///
/// If no page is found, returns None and *cache_key is left unmodified.
///
async fn try_lock_for_read(&self, cache_key: &mut CacheKey) -> Option<PageReadGuard> {
fn try_lock_for_read(&self, cache_key: &mut CacheKey) -> Option<PageReadGuard> {
let cache_key_orig = cache_key.clone();
if let Some(slot_idx) = self.search_mapping(cache_key) {
// The page was found in the mapping. Lock the slot, and re-check
// that it's still what we expected (because we released the mapping
// lock already, another thread could have evicted the page)
let slot = &self.slots[slot_idx];
let inner = slot.inner.read().await;
let inner = slot.inner.read().unwrap();
if inner.key.as_ref() == Some(cache_key) {
slot.inc_usage_count();
return Some(PageReadGuard(inner));
@@ -497,11 +529,15 @@ impl PageCache {
/// }
/// ```
///
async fn lock_for_read(&self, cache_key: &mut CacheKey) -> anyhow::Result<ReadBufResult> {
fn lock_for_read(&self, cache_key: &mut CacheKey) -> anyhow::Result<ReadBufResult> {
let (read_access, hit) = match cache_key {
CacheKey::MaterializedPage { .. } => {
unreachable!("Materialized pages use lookup_materialized_page")
}
CacheKey::EphemeralPage { .. } => (
&crate::metrics::PAGE_CACHE.read_accesses_ephemeral,
&crate::metrics::PAGE_CACHE.read_hits_ephemeral,
),
CacheKey::ImmutableFilePage { .. } => (
&crate::metrics::PAGE_CACHE.read_accesses_immutable,
&crate::metrics::PAGE_CACHE.read_hits_immutable,
@@ -512,7 +548,7 @@ impl PageCache {
let mut is_first_iteration = true;
loop {
// First check if the key already exists in the cache.
if let Some(read_guard) = self.try_lock_for_read(cache_key).await {
if let Some(read_guard) = self.try_lock_for_read(cache_key) {
if is_first_iteration {
hit.inc();
}
@@ -542,7 +578,8 @@ impl PageCache {
// Make the slot ready
let slot = &self.slots[slot_idx];
inner.key = Some(cache_key.clone());
slot.set_usage_count(1);
inner.dirty = false;
slot.usage_count.store(1, Ordering::Relaxed);
return Ok(ReadBufResult::NotFound(PageWriteGuard {
inner,
@@ -555,13 +592,13 @@ impl PageCache {
/// found, returns None.
///
/// When locking a page for writing, the search criteria is always "exact".
async fn try_lock_for_write(&self, cache_key: &CacheKey) -> Option<PageWriteGuard> {
fn try_lock_for_write(&self, cache_key: &CacheKey) -> Option<PageWriteGuard> {
if let Some(slot_idx) = self.search_mapping_for_write(cache_key) {
// The page was found in the mapping. Lock the slot, and re-check
// that it's still what we expected (because we don't released the mapping
// lock already, another thread could have evicted the page)
let slot = &self.slots[slot_idx];
let inner = slot.inner.write().await;
let inner = slot.inner.write().unwrap();
if inner.key.as_ref() == Some(cache_key) {
slot.inc_usage_count();
return Some(PageWriteGuard { inner, valid: true });
@@ -574,10 +611,10 @@ impl PageCache {
///
/// Similar to lock_for_read(), but the returned buffer is write-locked and
/// may be modified by the caller even if it's already found in the cache.
async fn lock_for_write(&self, cache_key: &CacheKey) -> anyhow::Result<WriteBufResult> {
fn lock_for_write(&self, cache_key: &CacheKey) -> anyhow::Result<WriteBufResult> {
loop {
// First check if the key already exists in the cache.
if let Some(write_guard) = self.try_lock_for_write(cache_key).await {
if let Some(write_guard) = self.try_lock_for_write(cache_key) {
return Ok(WriteBufResult::Found(write_guard));
}
@@ -603,7 +640,8 @@ impl PageCache {
// Make the slot ready
let slot = &self.slots[slot_idx];
inner.key = Some(cache_key.clone());
slot.set_usage_count(1);
inner.dirty = false;
slot.usage_count.store(1, Ordering::Relaxed);
return Ok(WriteBufResult::NotFound(PageWriteGuard {
inner,
@@ -641,6 +679,10 @@ impl PageCache {
*lsn = version.lsn;
Some(version.slot_idx)
}
CacheKey::EphemeralPage { file_id, blkno } => {
let map = self.ephemeral_page_map.read().unwrap();
Some(*map.get(&(*file_id, *blkno))?)
}
CacheKey::ImmutableFilePage { file_id, blkno } => {
let map = self.immutable_page_map.read().unwrap();
Some(*map.get(&(*file_id, *blkno))?)
@@ -664,6 +706,10 @@ impl PageCache {
None
}
}
CacheKey::EphemeralPage { file_id, blkno } => {
let map = self.ephemeral_page_map.read().unwrap();
Some(*map.get(&(*file_id, *blkno))?)
}
CacheKey::ImmutableFilePage { file_id, blkno } => {
let map = self.immutable_page_map.read().unwrap();
Some(*map.get(&(*file_id, *blkno))?)
@@ -697,6 +743,12 @@ impl PageCache {
panic!("could not find old key in mapping")
}
}
CacheKey::EphemeralPage { file_id, blkno } => {
let mut map = self.ephemeral_page_map.write().unwrap();
map.remove(&(*file_id, *blkno))
.expect("could not find old key in mapping");
self.size_metrics.current_bytes_ephemeral.sub_page_sz(1);
}
CacheKey::ImmutableFilePage { file_id, blkno } => {
let mut map = self.immutable_page_map.write().unwrap();
map.remove(&(*file_id, *blkno))
@@ -736,7 +788,17 @@ impl PageCache {
}
}
}
CacheKey::EphemeralPage { file_id, blkno } => {
let mut map = self.ephemeral_page_map.write().unwrap();
match map.entry((*file_id, *blkno)) {
Entry::Occupied(entry) => Some(*entry.get()),
Entry::Vacant(entry) => {
entry.insert(slot_idx);
self.size_metrics.current_bytes_ephemeral.add_page_sz(1);
None
}
}
}
CacheKey::ImmutableFilePage { file_id, blkno } => {
let mut map = self.immutable_page_map.write().unwrap();
match map.entry((*file_id, *blkno)) {
@@ -758,7 +820,7 @@ impl PageCache {
/// Find a slot to evict.
///
/// On return, the slot is empty and write-locked.
fn find_victim(&self) -> anyhow::Result<(usize, tokio::sync::RwLockWriteGuard<SlotInner>)> {
fn find_victim(&self) -> anyhow::Result<(usize, RwLockWriteGuard<SlotInner>)> {
let iter_limit = self.slots.len() * 10;
let mut iters = 0;
loop {
@@ -770,7 +832,10 @@ impl PageCache {
if slot.dec_usage_count() == 0 {
let mut inner = match slot.inner.try_write() {
Ok(inner) => inner,
Err(_err) => {
Err(TryLockError::Poisoned(err)) => {
anyhow::bail!("buffer lock was poisoned: {err:?}")
}
Err(TryLockError::WouldBlock) => {
// If we have looped through the whole buffer pool 10 times
// and still haven't found a victim buffer, something's wrong.
// Maybe all the buffers were in locked. That could happen in
@@ -784,8 +849,25 @@ impl PageCache {
}
};
if let Some(old_key) = &inner.key {
if inner.dirty {
if let Err(err) = Self::writeback(old_key, inner.buf) {
// Writing the page to disk failed.
//
// FIXME: What to do here, when? We could propagate the error to the
// caller, but victim buffer is generally unrelated to the original
// call. It can even belong to a different tenant. Currently, we
// report the error to the log and continue the clock sweep to find
// a different victim. But if the problem persists, the page cache
// could fill up with dirty pages that we cannot evict, and we will
// loop retrying the writebacks indefinitely.
error!("writeback of buffer {:?} failed: {}", old_key, err);
continue;
}
}
// remove mapping for old buffer
self.remove_mapping(old_key);
inner.dirty = false;
inner.key = None;
}
return Ok((slot_idx, inner));
@@ -793,18 +875,39 @@ impl PageCache {
}
}
fn writeback(cache_key: &CacheKey, buf: &[u8]) -> Result<(), std::io::Error> {
match cache_key {
CacheKey::MaterializedPage {
hash_key: _,
lsn: _,
} => Err(std::io::Error::new(
std::io::ErrorKind::Other,
"unexpected dirty materialized page",
)),
CacheKey::EphemeralPage { file_id, blkno } => {
writeback_ephemeral_file(*file_id, *blkno, buf)
}
CacheKey::ImmutableFilePage {
file_id: _,
blkno: _,
} => Err(std::io::Error::new(
std::io::ErrorKind::Other,
"unexpected dirty immutable page",
)),
}
}
/// Initialize a new page cache
///
/// This should be called only once at page server startup.
fn new(num_pages: usize) -> Self {
assert!(num_pages > 0, "page cache size must be > 0");
// We use Box::leak here and into_boxed_slice to avoid leaking uninitialized
// memory that Vec's might contain.
let page_buffer = Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice());
let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
size_metrics.max_bytes.set_page_sz(num_pages);
size_metrics.current_bytes_ephemeral.set_page_sz(0);
size_metrics.current_bytes_immutable.set_page_sz(0);
size_metrics.current_bytes_materialized_page.set_page_sz(0);
@@ -814,7 +917,11 @@ impl PageCache {
let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap();
Slot {
inner: tokio::sync::RwLock::new(SlotInner { key: None, buf }),
inner: RwLock::new(SlotInner {
key: None,
buf,
dirty: false,
}),
usage_count: AtomicU8::new(0),
}
})
@@ -822,6 +929,7 @@ impl PageCache {
Self {
materialized_page_map: Default::default(),
ephemeral_page_map: Default::default(),
immutable_page_map: Default::default(),
slots,
next_evict_slot: AtomicUsize::new(0),

View File

@@ -50,8 +50,7 @@ use crate::basebackup;
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
use crate::import_datadir::import_wal_from_tar;
use crate::metrics;
use crate::metrics::LIVE_CONNECTIONS_COUNT;
use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
use crate::task_mgr;
use crate::task_mgr::TaskKind;
use crate::tenant;
@@ -307,6 +306,39 @@ async fn page_service_conn_main(
}
}
struct PageRequestMetrics {
get_rel_exists: metrics::Histogram,
get_rel_size: metrics::Histogram,
get_page_at_lsn: metrics::Histogram,
get_db_size: metrics::Histogram,
}
impl PageRequestMetrics {
fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
let tenant_id = tenant_id.to_string();
let timeline_id = timeline_id.to_string();
let get_rel_exists =
SMGR_QUERY_TIME.with_label_values(&["get_rel_exists", &tenant_id, &timeline_id]);
let get_rel_size =
SMGR_QUERY_TIME.with_label_values(&["get_rel_size", &tenant_id, &timeline_id]);
let get_page_at_lsn =
SMGR_QUERY_TIME.with_label_values(&["get_page_at_lsn", &tenant_id, &timeline_id]);
let get_db_size =
SMGR_QUERY_TIME.with_label_values(&["get_db_size", &tenant_id, &timeline_id]);
Self {
get_rel_exists,
get_rel_size,
get_page_at_lsn,
get_db_size,
}
}
}
struct PageServerHandler {
_conf: &'static PageServerConf,
broker_client: storage_broker::BrokerClientChannel,
@@ -374,7 +406,7 @@ impl PageServerHandler {
pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
pgb.flush().await?;
let metrics = metrics::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);
let metrics = PageRequestMetrics::new(&tenant_id, &timeline_id);
loop {
let msg = tokio::select! {
@@ -414,21 +446,21 @@ impl PageServerHandler {
let response = match neon_fe_msg {
PagestreamFeMessage::Exists(req) => {
let _timer = metrics.start_timer(metrics::SmgrQueryType::GetRelExists);
let _timer = metrics.get_rel_exists.start_timer();
self.handle_get_rel_exists_request(&timeline, &req, &ctx)
.await
}
PagestreamFeMessage::Nblocks(req) => {
let _timer = metrics.start_timer(metrics::SmgrQueryType::GetRelSize);
let _timer = metrics.get_rel_size.start_timer();
self.handle_get_nblocks_request(&timeline, &req, &ctx).await
}
PagestreamFeMessage::GetPage(req) => {
let _timer = metrics.start_timer(metrics::SmgrQueryType::GetPageAtLsn);
let _timer = metrics.get_page_at_lsn.start_timer();
self.handle_get_page_at_lsn_request(&timeline, &req, &ctx)
.await
}
PagestreamFeMessage::DbSize(req) => {
let _timer = metrics.start_timer(metrics::SmgrQueryType::GetDbSize);
let _timer = metrics.get_db_size.start_timer();
self.handle_db_size_request(&timeline, &req, &ctx).await
}
};
@@ -952,8 +984,8 @@ where
false
};
::metrics::metric_vec_duration::observe_async_block_duration_by_result(
&*metrics::BASEBACKUP_QUERY_TIME,
metrics::metric_vec_duration::observe_async_block_duration_by_result(
&*crate::metrics::BASEBACKUP_QUERY_TIME,
async move {
self.handle_basebackup_request(
pgb,

View File

@@ -29,7 +29,6 @@ use std::collections::hash_map::Entry;
use std::collections::BTreeSet;
use std::collections::HashMap;
use std::fmt::Debug;
use std::fmt::Display;
use std::fs;
use std::fs::File;
use std::fs::OpenOptions;
@@ -56,7 +55,6 @@ use self::remote_timeline_client::RemoteTimelineClient;
use self::timeline::uninit::TimelineUninitMark;
use self::timeline::uninit::UninitializedTimeline;
use self::timeline::EvictionTaskTenantState;
use self::timeline::TimelineResources;
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
use crate::import_datadir;
@@ -72,6 +70,7 @@ use crate::tenant::remote_timeline_client::index::IndexPart;
use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
use crate::tenant::storage_layer::DeltaLayer;
use crate::tenant::storage_layer::ImageLayer;
use crate::tenant::storage_layer::Layer;
use crate::InitializationOrder;
use crate::tenant::timeline::delete::DeleteTimelineFlow;
@@ -137,6 +136,9 @@ pub use timeline::{
LocalLayerInfoForDiskUsageEviction, LogicalSizeCalculationCause, PageReconstructError, Timeline,
};
// re-export this function so that page_cache.rs can use it.
pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file;
// re-export for use in remote_timeline_client.rs
pub use crate::tenant::metadata::save_metadata;
@@ -150,14 +152,6 @@ pub const TENANT_ATTACHING_MARKER_FILENAME: &str = "attaching";
pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted";
/// References to shared objects that are passed into each tenant, such
/// as the shared remote storage client and process initialization state.
#[derive(Clone)]
pub struct TenantSharedResources {
pub broker_client: storage_broker::BrokerClientChannel,
pub remote_storage: Option<GenericRemoteStorage>,
}
///
/// Tenant consists of multiple timelines. Keep them in a hash table.
///
@@ -397,7 +391,7 @@ impl Tenant {
async fn timeline_init_and_sync(
&self,
timeline_id: TimelineId,
resources: TimelineResources,
remote_client: Option<RemoteTimelineClient>,
remote_startup_data: Option<RemoteStartupData>,
local_metadata: Option<TimelineMetadata>,
ancestor: Option<Arc<Timeline>>,
@@ -418,57 +412,17 @@ impl Tenant {
timeline_id,
up_to_date_metadata,
ancestor.clone(),
resources,
remote_client,
init_order,
CreateTimelineCause::Load,
)?;
let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
let new_disk_consistent_lsn = timeline.get_disk_consistent_lsn();
anyhow::ensure!(
disk_consistent_lsn.is_valid(),
new_disk_consistent_lsn.is_valid(),
"Timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn"
);
assert_eq!(
disk_consistent_lsn,
up_to_date_metadata.disk_consistent_lsn(),
"these are used interchangeably"
);
// Save the metadata file to local disk.
if !picked_local {
save_metadata(
self.conf,
&tenant_id,
&timeline_id,
up_to_date_metadata,
first_save,
)
.context("save_metadata")?;
}
let index_part = remote_startup_data.as_ref().map(|x| &x.index_part);
if let Some(index_part) = index_part {
timeline
.remote_client
.as_ref()
.unwrap()
.init_upload_queue(index_part)?;
} else if self.remote_storage.is_some() {
// No data on the remote storage, but we have local metadata file. We can end up
// here with timeline_create being interrupted before finishing index part upload.
// By doing what we do here, the index part upload is retried.
// If control plane retries timeline creation in the meantime, the mgmt API handler
// for timeline creation will coalesce on the upload we queue here.
let rtc = timeline.remote_client.as_ref().unwrap();
rtc.init_upload_queue_for_empty_remote(up_to_date_metadata)?;
rtc.schedule_index_upload_for_metadata_update(up_to_date_metadata)?;
}
timeline
.load_layer_map(
disk_consistent_lsn,
remote_startup_data.map(|x| x.index_part),
)
.load_layer_map(new_disk_consistent_lsn)
.await
.with_context(|| {
format!("Failed to load layermap for timeline {tenant_id}/{timeline_id}")
@@ -492,6 +446,19 @@ impl Tenant {
}
};
if self.remote_storage.is_some() {
// Reconcile local state with remote storage, downloading anything that's
// missing locally, and scheduling uploads for anything that's missing
// in remote storage.
timeline
.reconcile_with_remote(
up_to_date_metadata,
remote_startup_data.as_ref().map(|r| &r.index_part),
)
.await
.context("failed to reconcile with remote")?
}
// Sanity check: a timeline should have some content.
anyhow::ensure!(
ancestor.is_some()
@@ -506,6 +473,18 @@ impl Tenant {
"Timeline has no ancestor and no layer files"
);
// Save the metadata file to local disk.
if !picked_local {
save_metadata(
self.conf,
&tenant_id,
&timeline_id,
up_to_date_metadata,
first_save,
)
.context("save_metadata")?;
}
Ok(())
}
@@ -523,7 +502,6 @@ impl Tenant {
conf: &'static PageServerConf,
tenant_id: TenantId,
broker_client: storage_broker::BrokerClientChannel,
tenants: &'static tokio::sync::RwLock<TenantsMap>,
remote_storage: GenericRemoteStorage,
ctx: &RequestContext,
) -> anyhow::Result<Arc<Tenant>> {
@@ -538,7 +516,7 @@ impl Tenant {
tenant_conf,
wal_redo_manager,
tenant_id,
Some(remote_storage.clone()),
Some(remote_storage),
));
// Do all the hard work in the background
@@ -553,61 +531,17 @@ impl Tenant {
"attach tenant",
false,
async move {
// Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state.
let make_broken = |t: &Tenant, err: anyhow::Error| {
error!("attach failed, setting tenant state to Broken: {err:?}");
t.state.send_modify(|state| {
assert_eq!(
*state,
TenantState::Attaching,
"the attach task owns the tenant state until activation is complete"
);
*state = TenantState::broken_from_reason(err.to_string());
});
};
let pending_deletion = {
match DeleteTenantFlow::should_resume_deletion(
conf,
Some(&remote_storage),
&tenant_clone,
)
.await
{
Ok(should_resume_deletion) => should_resume_deletion,
Err(err) => {
make_broken(&tenant_clone, anyhow::anyhow!(err));
return Ok(());
}
}
};
info!("pending_deletion {}", pending_deletion.is_some());
if let Some(deletion) = pending_deletion {
match DeleteTenantFlow::resume_from_attach(
deletion,
&tenant_clone,
tenants,
&ctx,
)
.await
{
Err(err) => {
make_broken(&tenant_clone, anyhow::anyhow!(err));
return Ok(());
}
Ok(()) => return Ok(()),
}
}
match tenant_clone.attach(&ctx).await {
Ok(()) => {
info!("attach finished, activating");
tenant_clone.activate(broker_client, None, &ctx);
}
Err(e) => {
make_broken(&tenant_clone, anyhow::anyhow!(e));
error!("attach failed, setting tenant state to Broken: {:?}", e);
tenant_clone.state.send_modify(|state| {
assert_eq!(*state, TenantState::Attaching, "the attach task owns the tenant state until activation is complete");
*state = TenantState::broken_from_reason(e.to_string());
});
}
}
Ok(())
@@ -685,9 +619,6 @@ impl Tenant {
.instrument(info_span!("download_index_part", %timeline_id)),
);
}
let mut timelines_to_resume_deletions = vec![];
// Wait for all the download tasks to complete & collect results.
let mut remote_index_and_client = HashMap::new();
let mut timeline_ancestors = HashMap::new();
@@ -698,15 +629,15 @@ impl Tenant {
debug!("successfully downloaded index part for timeline {timeline_id}");
match index_part {
MaybeDeletedIndexPart::IndexPart(index_part) => {
timeline_ancestors.insert(timeline_id, index_part.metadata.clone());
timeline_ancestors.insert(
timeline_id,
index_part.parse_metadata().context("parse_metadata")?,
);
remote_index_and_client.insert(timeline_id, (index_part, client));
}
MaybeDeletedIndexPart::Deleted(index_part) => {
info!(
"timeline {} is deleted, picking to resume deletion",
timeline_id
);
timelines_to_resume_deletions.push((timeline_id, index_part, client));
MaybeDeletedIndexPart::Deleted(_) => {
info!("timeline {} is deleted, skipping", timeline_id);
continue;
}
}
}
@@ -721,41 +652,14 @@ impl Tenant {
.expect("just put it in above");
// TODO again handle early failure
self.load_remote_timeline(
timeline_id,
index_part,
remote_metadata,
TimelineResources {
remote_client: Some(remote_client),
},
ctx,
)
.await
.with_context(|| {
format!(
"failed to load remote timeline {} for tenant {}",
timeline_id, self.tenant_id
)
})?;
}
// Walk through deleted timelines, resume deletion
for (timeline_id, index_part, remote_timeline_client) in timelines_to_resume_deletions {
remote_timeline_client
.init_upload_queue_stopped_to_continue_deletion(&index_part)
.context("init queue stopped")
.map_err(LoadLocalTimelineError::ResumeDeletion)?;
DeleteTimelineFlow::resume_deletion(
Arc::clone(self),
timeline_id,
&index_part.metadata,
Some(remote_timeline_client),
None,
)
.await
.context("resume_deletion")
.map_err(LoadLocalTimelineError::ResumeDeletion)?;
self.load_remote_timeline(timeline_id, index_part, remote_metadata, remote_client, ctx)
.await
.with_context(|| {
format!(
"failed to load remote timeline {} for tenant {}",
timeline_id, self.tenant_id
)
})?;
}
std::fs::remove_file(&marker_file)
@@ -763,7 +667,7 @@ impl Tenant {
crashsafe::fsync(marker_file.parent().expect("marker file has parent dir"))
.context("fsync tenant directory after unlinking attach marker file")?;
crate::failpoint_support::sleep_millis_async!("attach-before-activate");
utils::failpoint_sleep_millis_async!("attach-before-activate");
info!("Done");
@@ -791,7 +695,7 @@ impl Tenant {
timeline_id: TimelineId,
index_part: IndexPart,
remote_metadata: TimelineMetadata,
resources: TimelineResources,
remote_client: RemoteTimelineClient,
ctx: &RequestContext,
) -> anyhow::Result<()> {
span::debug_assert_current_span_has_tenant_id();
@@ -821,7 +725,7 @@ impl Tenant {
self.timeline_init_and_sync(
timeline_id,
resources,
Some(remote_client),
Some(RemoteStartupData {
index_part,
remote_metadata,
@@ -868,7 +772,8 @@ impl Tenant {
pub(crate) fn spawn_load(
conf: &'static PageServerConf,
tenant_id: TenantId,
resources: TenantSharedResources,
broker_client: storage_broker::BrokerClientChannel,
remote_storage: Option<GenericRemoteStorage>,
init_order: Option<InitializationOrder>,
tenants: &'static tokio::sync::RwLock<TenantsMap>,
ctx: &RequestContext,
@@ -883,9 +788,6 @@ impl Tenant {
}
};
let broker_client = resources.broker_client;
let remote_storage = resources.remote_storage;
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
let tenant = Tenant::new(
TenantState::Loading,
@@ -909,7 +811,6 @@ impl Tenant {
"initial tenant load",
false,
async move {
// Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state.
let make_broken = |t: &Tenant, err: anyhow::Error| {
error!("load failed, setting tenant state to Broken: {err:?}");
t.state.send_modify(|state| {
@@ -957,7 +858,7 @@ impl Tenant {
.as_mut()
.and_then(|x| x.initial_logical_size_attempt.take());
match DeleteTenantFlow::resume_from_load(
match DeleteTenantFlow::resume(
deletion,
&tenant_clone,
init_order.as_ref(),
@@ -979,7 +880,7 @@ impl Tenant {
match tenant_clone.load(init_order.as_ref(), &ctx).await {
Ok(()) => {
debug!("load finished");
debug!("load finished",);
tenant_clone.activate(broker_client, background_jobs_can_start, &ctx);
}
@@ -1175,7 +1076,7 @@ impl Tenant {
debug!("loading tenant task");
crate::failpoint_support::sleep_millis_async!("before-loading-tenant");
utils::failpoint_sleep_millis_async!("before-loading-tenant");
// Load in-memory state to reflect the local files on disk
//
@@ -1271,9 +1172,16 @@ impl Tenant {
) -> Result<(), LoadLocalTimelineError> {
span::debug_assert_current_span_has_tenant_id();
let mut resources = self.build_timeline_resources(timeline_id);
let remote_client = self.remote_storage.as_ref().map(|remote_storage| {
RemoteTimelineClient::new(
remote_storage.clone(),
self.conf,
self.tenant_id,
timeline_id,
)
});
let (remote_startup_data, remote_client) = match resources.remote_client {
let (remote_startup_data, remote_client) = match remote_client {
Some(remote_client) => match remote_client.download_index_file().await {
Ok(index_part) => {
let index_part = match index_part {
@@ -1311,7 +1219,10 @@ impl Tenant {
}
};
let remote_metadata = index_part.metadata.clone();
let remote_metadata = index_part
.parse_metadata()
.context("parse_metadata")
.map_err(LoadLocalTimelineError::Load)?;
(
Some(RemoteStartupData {
index_part,
@@ -1358,10 +1269,9 @@ impl Tenant {
return Ok(());
}
(None, resources.remote_client)
(None, remote_client)
}
};
resources.remote_client = remote_client;
let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() {
let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false)
@@ -1374,7 +1284,7 @@ impl Tenant {
self.timeline_init_and_sync(
timeline_id,
resources,
remote_client,
remote_startup_data,
Some(local_metadata),
ancestor,
@@ -1863,7 +1773,7 @@ impl Tenant {
// It's mesed up.
// we just ignore the failure to stop
match self.set_stopping(shutdown_progress, false, false).await {
match self.set_stopping(shutdown_progress, false).await {
Ok(()) => {}
Err(SetStoppingError::Broken) => {
// assume that this is acceptable
@@ -1905,18 +1815,15 @@ impl Tenant {
/// This function is not cancel-safe!
///
/// `allow_transition_from_loading` is needed for the special case of loading task deleting the tenant.
/// `allow_transition_from_attaching` is needed for the special case of attaching deleted tenant.
async fn set_stopping(
&self,
progress: completion::Barrier,
allow_transition_from_loading: bool,
allow_transition_from_attaching: bool,
) -> Result<(), SetStoppingError> {
let mut rx = self.state.subscribe();
// cannot stop before we're done activating, so wait out until we're done activating
rx.wait_for(|state| match state {
TenantState::Attaching if allow_transition_from_attaching => true,
TenantState::Activating(_) | TenantState::Attaching => {
info!(
"waiting for {} to turn Active|Broken|Stopping",
@@ -1933,19 +1840,12 @@ impl Tenant {
// we now know we're done activating, let's see whether this task is the winner to transition into Stopping
let mut err = None;
let stopping = self.state.send_if_modified(|current_state| match current_state {
TenantState::Activating(_) => {
unreachable!("1we ensured above that we're done with activation, and, there is no re-activation")
}
TenantState::Attaching => {
if !allow_transition_from_attaching {
unreachable!("2we ensured above that we're done with activation, and, there is no re-activation")
};
*current_state = TenantState::Stopping { progress };
true
TenantState::Activating(_) | TenantState::Attaching => {
unreachable!("we ensured above that we're done with activation, and, there is no re-activation")
}
TenantState::Loading => {
if !allow_transition_from_loading {
unreachable!("3we ensured above that we're done with activation, and, there is no re-activation")
unreachable!("we ensured above that we're done with activation, and, there is no re-activation")
};
*current_state = TenantState::Stopping { progress };
true
@@ -2021,8 +1921,7 @@ impl Tenant {
self.set_broken_no_wait(reason)
}
pub(crate) fn set_broken_no_wait(&self, reason: impl Display) {
let reason = reason.to_string();
pub(crate) fn set_broken_no_wait(&self, reason: String) {
self.state.send_modify(|current_state| {
match *current_state {
TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
@@ -2246,7 +2145,7 @@ impl Tenant {
new_timeline_id: TimelineId,
new_metadata: &TimelineMetadata,
ancestor: Option<Arc<Timeline>>,
resources: TimelineResources,
remote_client: Option<RemoteTimelineClient>,
init_order: Option<&InitializationOrder>,
cause: CreateTimelineCause,
) -> anyhow::Result<Arc<Timeline>> {
@@ -2275,7 +2174,7 @@ impl Tenant {
new_timeline_id,
self.tenant_id,
Arc::clone(&self.walredo_mgr),
resources,
remote_client,
pg_version,
initial_logical_size_can_start.cloned(),
initial_logical_size_attempt.cloned().flatten(),
@@ -2517,9 +2416,7 @@ impl Tenant {
.refresh_gc_info_internal(target_timeline_id, horizon, pitr, ctx)
.await?;
crate::failpoint_support::sleep_millis_async!(
"gc_iteration_internal_after_getting_gc_timelines"
);
utils::failpoint_sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");
// If there is nothing to GC, we don't want any messages in the INFO log.
if !gc_timelines.is_empty() {
@@ -2923,23 +2820,6 @@ impl Tenant {
Ok(timeline)
}
/// Call this before constructing a timeline, to build its required structures
fn build_timeline_resources(&self, timeline_id: TimelineId) -> TimelineResources {
let remote_client = if let Some(remote_storage) = self.remote_storage.as_ref() {
let remote_client = RemoteTimelineClient::new(
remote_storage.clone(),
self.conf,
self.tenant_id,
timeline_id,
);
Some(remote_client)
} else {
None
};
TimelineResources { remote_client }
}
/// Creates intermediate timeline structure and its files.
///
/// An empty layer map is initialized, and new data and WAL can be imported starting
@@ -2956,17 +2836,25 @@ impl Tenant {
) -> anyhow::Result<UninitializedTimeline> {
let tenant_id = self.tenant_id;
let resources = self.build_timeline_resources(new_timeline_id);
if let Some(remote_client) = &resources.remote_client {
let remote_client = if let Some(remote_storage) = self.remote_storage.as_ref() {
let remote_client = RemoteTimelineClient::new(
remote_storage.clone(),
self.conf,
tenant_id,
new_timeline_id,
);
remote_client.init_upload_queue_for_empty_remote(new_metadata)?;
}
Some(remote_client)
} else {
None
};
let timeline_struct = self
.create_timeline_struct(
new_timeline_id,
new_metadata,
ancestor,
resources,
remote_client,
None,
CreateTimelineCause::Load,
)
@@ -4039,31 +3927,6 @@ mod tests {
Ok(())
}
#[tokio::test]
async fn delta_layer_dumping() -> anyhow::Result<()> {
let (tenant, ctx) = TenantHarness::create("test_layer_dumping")?.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
let layer_map = tline.layers.read().await;
let level0_deltas = layer_map.layer_map().get_level0_deltas()?;
assert!(!level0_deltas.is_empty());
for delta in level0_deltas {
let delta = layer_map.get_from_desc(&delta);
// Ensure we are dumping a delta layer here
let delta = delta.downcast_delta_layer().unwrap();
delta.dump(false, &ctx).await.unwrap();
delta.dump(true, &ctx).await.unwrap();
}
Ok(())
}
#[tokio::test]
async fn corrupt_metadata() -> anyhow::Result<()> {
const TEST_NAME: &str = "corrupt_metadata";
@@ -4101,7 +3964,7 @@ mod tests {
let mut found_error_message = false;
let mut err_source = err.source();
while let Some(source) = err_source {
if source.to_string().contains("metadata checksum mismatch") {
if source.to_string() == "metadata checksum mismatch" {
found_error_message = true;
break;
}

View File

@@ -12,11 +12,14 @@
//! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
//!
use crate::page_cache::PAGE_SZ;
use crate::tenant::block_io::BlockCursor;
use crate::tenant::block_io::{BlockCursor, BlockReader};
use std::cmp::min;
use std::io::{Error, ErrorKind};
impl<'a> BlockCursor<'a> {
impl<R> BlockCursor<R>
where
R: BlockReader,
{
/// Read a blob into a new buffer.
pub async fn read_blob(&self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
let mut buf = Vec::new();
@@ -33,7 +36,7 @@ impl<'a> BlockCursor<'a> {
let mut blknum = (offset / PAGE_SZ as u64) as u32;
let mut off = (offset % PAGE_SZ as u64) as usize;
let mut buf = self.read_blk(blknum).await?;
let mut buf = self.read_blk(blknum)?;
// peek at the first byte, to determine if it's a 1- or 4-byte length
let first_len_byte = buf[off];
@@ -49,7 +52,7 @@ impl<'a> BlockCursor<'a> {
// it is split across two pages
len_buf[..thislen].copy_from_slice(&buf[off..PAGE_SZ]);
blknum += 1;
buf = self.read_blk(blknum).await?;
buf = self.read_blk(blknum)?;
len_buf[thislen..].copy_from_slice(&buf[0..4 - thislen]);
off = 4 - thislen;
} else {
@@ -70,7 +73,7 @@ impl<'a> BlockCursor<'a> {
if page_remain == 0 {
// continue on next page
blknum += 1;
buf = self.read_blk(blknum).await?;
buf = self.read_blk(blknum)?;
off = 0;
page_remain = PAGE_SZ;
}

View File

@@ -2,14 +2,11 @@
//! Low-level Block-oriented I/O functions
//!
use super::ephemeral_file::EphemeralFile;
use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
use crate::page_cache::{self, PageReadGuard, ReadBufResult, PAGE_SZ};
use crate::virtual_file::VirtualFile;
use bytes::Bytes;
use std::fs::File;
use std::ops::{Deref, DerefMut};
use std::os::unix::fs::FileExt;
use std::sync::atomic::AtomicU64;
/// This is implemented by anything that can read 8 kB (PAGE_SZ)
/// blocks, using the page cache
@@ -17,81 +14,68 @@ use std::os::unix::fs::FileExt;
/// There are currently two implementations: EphemeralFile, and FileBlockReader
/// below.
pub trait BlockReader {
///
/// Read a block. Returns a "lease" object that can be used to
/// access to the contents of the page. (For the page cache, the
/// lease object represents a lock on the buffer.)
///
fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error>;
///
/// Create a new "cursor" for reading from this reader.
///
/// A cursor caches the last accessed page, allowing for faster
/// access if the same block is accessed repeatedly.
fn block_cursor(&self) -> BlockCursor<'_>;
fn block_cursor(&self) -> BlockCursor<&Self>
where
Self: Sized,
{
BlockCursor::new(self)
}
}
impl<B> BlockReader for &B
where
B: BlockReader,
{
fn block_cursor(&self) -> BlockCursor<'_> {
(*self).block_cursor()
fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
(*self).read_blk(blknum)
}
}
/// Reference to an in-memory copy of an immutable on-disk block.
pub enum BlockLease<'a> {
/// A block accessible for reading
///
/// During builds with `#[cfg(test)]`, this is a proper enum
/// with two variants to support testing code. During normal
/// builds, it just has one variant and is thus a cheap newtype
/// wrapper of [`PageReadGuard`]
pub enum BlockLease {
PageReadGuard(PageReadGuard<'static>),
EphemeralFileMutableTail(&'a [u8; PAGE_SZ]),
#[cfg(test)]
Arc(std::sync::Arc<[u8; PAGE_SZ]>),
Rc(std::rc::Rc<[u8; PAGE_SZ]>),
}
impl From<PageReadGuard<'static>> for BlockLease<'static> {
fn from(value: PageReadGuard<'static>) -> BlockLease<'static> {
impl From<PageReadGuard<'static>> for BlockLease {
fn from(value: PageReadGuard<'static>) -> Self {
BlockLease::PageReadGuard(value)
}
}
#[cfg(test)]
impl<'a> From<std::sync::Arc<[u8; PAGE_SZ]>> for BlockLease<'a> {
fn from(value: std::sync::Arc<[u8; PAGE_SZ]>) -> Self {
BlockLease::Arc(value)
impl From<std::rc::Rc<[u8; PAGE_SZ]>> for BlockLease {
fn from(value: std::rc::Rc<[u8; PAGE_SZ]>) -> Self {
BlockLease::Rc(value)
}
}
impl<'a> Deref for BlockLease<'a> {
impl Deref for BlockLease {
type Target = [u8; PAGE_SZ];
fn deref(&self) -> &Self::Target {
match self {
BlockLease::PageReadGuard(v) => v.deref(),
BlockLease::EphemeralFileMutableTail(v) => v,
#[cfg(test)]
BlockLease::Arc(v) => v.deref(),
}
}
}
/// Provides the ability to read blocks from different sources,
/// similar to using traits for this purpose.
///
/// Unlike traits, we also support the read function to be async though.
pub(crate) enum BlockReaderRef<'a> {
FileBlockReaderVirtual(&'a FileBlockReader<VirtualFile>),
FileBlockReaderFile(&'a FileBlockReader<std::fs::File>),
EphemeralFile(&'a EphemeralFile),
Adapter(Adapter<&'a DeltaLayerInner>),
#[cfg(test)]
TestDisk(&'a super::disk_btree::tests::TestDisk),
}
impl<'a> BlockReaderRef<'a> {
#[inline(always)]
async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
use BlockReaderRef::*;
match self {
FileBlockReaderVirtual(r) => r.read_blk(blknum).await,
FileBlockReaderFile(r) => r.read_blk(blknum).await,
EphemeralFile(r) => r.read_blk(blknum).await,
Adapter(r) => r.read_blk(blknum).await,
#[cfg(test)]
TestDisk(r) => r.read_blk(blknum),
BlockLease::Rc(v) => v.deref(),
}
}
}
@@ -113,31 +97,32 @@ impl<'a> BlockReaderRef<'a> {
/// // do stuff with 'buf'
/// ```
///
pub struct BlockCursor<'a> {
reader: BlockReaderRef<'a>,
pub struct BlockCursor<R>
where
R: BlockReader,
{
reader: R,
}
impl<'a> BlockCursor<'a> {
pub(crate) fn new(reader: BlockReaderRef<'a>) -> Self {
impl<R> BlockCursor<R>
where
R: BlockReader,
{
pub fn new(reader: R) -> Self {
BlockCursor { reader }
}
// Needed by cli
pub fn new_fileblockreader_virtual(reader: &'a FileBlockReader<VirtualFile>) -> Self {
BlockCursor {
reader: BlockReaderRef::FileBlockReaderVirtual(reader),
}
}
/// Read a block.
///
/// Returns a "lease" object that can be used to
/// access to the contents of the page. (For the page cache, the
/// lease object represents a lock on the buffer.)
#[inline(always)]
pub async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
self.reader.read_blk(blknum).await
pub fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
self.reader.read_blk(blknum)
}
}
static NEXT_ID: AtomicU64 = AtomicU64::new(1);
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub struct FileId(u64);
fn next_file_id() -> FileId {
FileId(NEXT_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed))
}
/// An adapter for reading a (virtual) file using the page cache.
///
@@ -147,7 +132,7 @@ pub struct FileBlockReader<F> {
pub file: F,
/// Unique ID of this file, used as key in the page cache.
file_id: page_cache::FileId,
file_id: FileId,
}
impl<F> FileBlockReader<F>
@@ -155,7 +140,7 @@ where
F: FileExt,
{
pub fn new(file: F) -> Self {
let file_id = page_cache::next_file_id();
let file_id = next_file_id();
FileBlockReader { file_id, file }
}
@@ -165,17 +150,18 @@ where
assert!(buf.len() == PAGE_SZ);
self.file.read_exact_at(buf, blkno as u64 * PAGE_SZ as u64)
}
/// Read a block.
///
/// Returns a "lease" object that can be used to
/// access to the contents of the page. (For the page cache, the
/// lease object represents a lock on the buffer.)
pub async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
}
impl<F> BlockReader for FileBlockReader<F>
where
F: FileExt,
{
fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
// Look up the right page
let cache = page_cache::get();
loop {
match cache
.read_immutable_buf(self.file_id, blknum)
.await
.map_err(|e| {
std::io::Error::new(
std::io::ErrorKind::Other,
@@ -196,18 +182,6 @@ where
}
}
impl BlockReader for FileBlockReader<File> {
fn block_cursor(&self) -> BlockCursor<'_> {
BlockCursor::new(BlockReaderRef::FileBlockReaderFile(self))
}
}
impl BlockReader for FileBlockReader<VirtualFile> {
fn block_cursor(&self) -> BlockCursor<'_> {
BlockCursor::new(BlockReaderRef::FileBlockReaderVirtual(self))
}
}
///
/// Trait for block-oriented output
///

View File

@@ -7,7 +7,6 @@ use anyhow::Context;
use pageserver_api::models::TenantState;
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
use tokio::sync::OwnedMutexGuard;
use tokio_util::sync::CancellationToken;
use tracing::{error, info, instrument, warn, Instrument, Span};
use utils::{
@@ -83,8 +82,6 @@ async fn create_remote_delete_mark(
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"mark_upload",
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
)
.await
.context("mark_upload")?;
@@ -174,8 +171,6 @@ async fn remove_tenant_remote_delete_mark(
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"remove_tenant_remote_delete_mark",
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
)
.await
.context("remove_tenant_remote_delete_mark")?;
@@ -257,8 +252,6 @@ pub(crate) async fn remote_delete_mark_exists(
SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
"fetch_tenant_deletion_mark",
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
)
.await;
@@ -282,9 +275,8 @@ pub(crate) async fn remote_delete_mark_exists(
/// It is resumable from any step in case a crash/restart occurs.
/// There are three entrypoints to the process:
/// 1. [`DeleteTenantFlow::run`] this is the main one called by a management api handler.
/// 2. [`DeleteTenantFlow::resume_from_load`] is called during restarts when local or remote deletion marks are still there.
/// 3. [`DeleteTenantFlow::resume_from_attach`] is called when deletion is resumed tenant is found to be deleted during attach process.
/// Note the only other place that messes around timeline delete mark is the `Tenant::spawn_load` function.
/// 2. [`DeleteTenantFlow::resume`] is called during restarts when local or remote deletion marks are still there.
/// Note the only other place that messes around timeline delete mark is the `Tenant::spawn_load` function.
#[derive(Default)]
pub enum DeleteTenantFlow {
#[default]
@@ -411,7 +403,7 @@ impl DeleteTenantFlow {
}
}
pub(crate) async fn resume_from_load(
pub(crate) async fn resume(
guard: DeletionGuard,
tenant: &Arc<Tenant>,
init_order: Option<&InitializationOrder>,
@@ -421,7 +413,7 @@ impl DeleteTenantFlow {
let (_, progress) = completion::channel();
tenant
.set_stopping(progress, true, false)
.set_stopping(progress, true)
.await
.expect("cant be stopping or broken");
@@ -449,31 +441,6 @@ impl DeleteTenantFlow {
.await
}
pub(crate) async fn resume_from_attach(
guard: DeletionGuard,
tenant: &Arc<Tenant>,
tenants: &'static tokio::sync::RwLock<TenantsMap>,
ctx: &RequestContext,
) -> Result<(), DeleteTenantError> {
let (_, progress) = completion::channel();
tenant
.set_stopping(progress, false, true)
.await
.expect("cant be stopping or broken");
tenant.attach(ctx).await.context("attach")?;
Self::background(
guard,
tenant.conf,
tenant.remote_storage.clone(),
tenants,
tenant,
)
.await
}
async fn prepare(
tenants: &tokio::sync::RwLock<TenantsMap>,
tenant_id: TenantId,

View File

@@ -259,10 +259,9 @@ where
{
let mut stack = Vec::new();
stack.push((self.root_blk, None));
let block_cursor = self.reader.block_cursor();
while let Some((node_blknum, opt_iter)) = stack.pop() {
// Locate the node.
let node_buf = block_cursor.read_blk(self.start_blk + node_blknum).await?;
let node_buf = self.reader.read_blk(self.start_blk + node_blknum)?;
let node = OnDiskNode::deparse(node_buf.as_ref())?;
let prefix_len = node.prefix_len as usize;
@@ -354,10 +353,8 @@ where
stack.push((self.root_blk, String::new(), 0, 0, 0));
let block_cursor = self.reader.block_cursor();
while let Some((blknum, path, depth, child_idx, key_off)) = stack.pop() {
let blk = block_cursor.read_blk(self.start_blk + blknum).await?;
let blk = self.reader.read_blk(self.start_blk + blknum)?;
let buf: &[u8] = blk.as_ref();
let node = OnDiskNode::<L>::deparse(buf)?;
@@ -686,30 +683,27 @@ impl<const L: usize> BuildNode<L> {
}
#[cfg(test)]
pub(crate) mod tests {
mod tests {
use super::*;
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef};
use crate::tenant::block_io::BlockLease;
use rand::Rng;
use std::collections::BTreeMap;
use std::sync::atomic::{AtomicUsize, Ordering};
#[derive(Clone, Default)]
pub(crate) struct TestDisk {
struct TestDisk {
blocks: Vec<Bytes>,
}
impl TestDisk {
fn new() -> Self {
Self::default()
}
pub(crate) fn read_blk(&self, blknum: u32) -> io::Result<BlockLease> {
let mut buf = [0u8; PAGE_SZ];
buf.copy_from_slice(&self.blocks[blknum as usize]);
Ok(std::sync::Arc::new(buf).into())
}
}
impl BlockReader for TestDisk {
fn block_cursor(&self) -> BlockCursor<'_> {
BlockCursor::new(BlockReaderRef::TestDisk(self))
fn read_blk(&self, blknum: u32) -> io::Result<BlockLease> {
let mut buf = [0u8; PAGE_SZ];
buf.copy_from_slice(&self.blocks[blknum as usize]);
Ok(std::rc::Rc::new(buf).into())
}
}
impl BlockWriter for &mut TestDisk {

View File

@@ -2,30 +2,54 @@
//! used to keep in-memory layers spilled on disk.
use crate::config::PageServerConf;
use crate::page_cache::{self, PAGE_SZ};
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
use crate::page_cache::{self, ReadBufResult, WriteBufResult, PAGE_SZ};
use crate::tenant::blob_io::BlobWriter;
use crate::tenant::block_io::{BlockLease, BlockReader};
use crate::virtual_file::VirtualFile;
use once_cell::sync::Lazy;
use std::cmp::min;
use std::collections::HashMap;
use std::fs::OpenOptions;
use std::io::{self, ErrorKind};
use std::ops::DerefMut;
use std::os::unix::prelude::FileExt;
use std::path::PathBuf;
use std::sync::atomic::AtomicU64;
use std::sync::{Arc, RwLock};
use tracing::*;
use utils::id::{TenantId, TimelineId};
pub struct EphemeralFile {
page_cache_file_id: page_cache::FileId,
///
/// This is the global cache of file descriptors (File objects).
///
static EPHEMERAL_FILES: Lazy<RwLock<EphemeralFiles>> = Lazy::new(|| {
RwLock::new(EphemeralFiles {
next_file_id: FileId(1),
files: HashMap::new(),
})
});
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct FileId(u64);
impl std::fmt::Display for FileId {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0)
}
}
pub struct EphemeralFiles {
next_file_id: FileId,
files: HashMap<FileId, Arc<VirtualFile>>,
}
pub struct EphemeralFile {
file_id: FileId,
_tenant_id: TenantId,
_timeline_id: TimelineId,
file: VirtualFile,
len: u64,
/// An ephemeral file is append-only.
/// We keep the last page, which can still be modified, in [`Self::mutable_tail`].
/// The other pages, which can no longer be modified, are accessed through the page cache.
mutable_tail: [u8; PAGE_SZ],
file: Arc<VirtualFile>,
pub size: u64,
}
impl EphemeralFile {
@@ -34,183 +58,74 @@ impl EphemeralFile {
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<EphemeralFile, io::Error> {
static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1);
let filename_disambiguator =
NEXT_FILENAME.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
let mut l = EPHEMERAL_FILES.write().unwrap();
let file_id = l.next_file_id;
l.next_file_id = FileId(l.next_file_id.0 + 1);
let filename = conf
.timeline_path(&tenant_id, &timeline_id)
.join(PathBuf::from(format!("ephemeral-{filename_disambiguator}")));
.join(PathBuf::from(format!("ephemeral-{}", file_id)));
let file = VirtualFile::open_with_options(
&filename,
OpenOptions::new().read(true).write(true).create(true),
)?;
let file_rc = Arc::new(file);
l.files.insert(file_id, file_rc.clone());
Ok(EphemeralFile {
page_cache_file_id: page_cache::next_file_id(),
file_id,
_tenant_id: tenant_id,
_timeline_id: timeline_id,
file,
len: 0,
mutable_tail: [0u8; PAGE_SZ],
file: file_rc,
size: 0,
})
}
pub(crate) fn len(&self) -> u64 {
self.len
fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), io::Error> {
let mut off = 0;
while off < PAGE_SZ {
let n = self
.file
.read_at(&mut buf[off..], blkno as u64 * PAGE_SZ as u64 + off as u64)?;
if n == 0 {
// Reached EOF. Fill the rest of the buffer with zeros.
const ZERO_BUF: [u8; PAGE_SZ] = [0u8; PAGE_SZ];
buf[off..].copy_from_slice(&ZERO_BUF[off..]);
break;
}
off += n;
}
Ok(())
}
pub(crate) async fn read_blk(&self, blknum: u32) -> Result<BlockLease, io::Error> {
let flushed_blknums = 0..self.len / PAGE_SZ as u64;
if flushed_blknums.contains(&(blknum as u64)) {
let cache = page_cache::get();
loop {
match cache
.read_immutable_buf(self.page_cache_file_id, blknum)
.await
.map_err(|e| {
std::io::Error::new(
std::io::ErrorKind::Other,
// order path before error because error is anyhow::Error => might have many contexts
format!(
"ephemeral file: read immutable page #{}: {}: {:#}",
blknum,
self.file.path.display(),
e,
),
)
})? {
page_cache::ReadBufResult::Found(guard) => {
return Ok(BlockLease::PageReadGuard(guard))
}
page_cache::ReadBufResult::NotFound(mut write_guard) => {
let buf: &mut [u8] = write_guard.deref_mut();
debug_assert_eq!(buf.len(), PAGE_SZ);
self.file
.read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)?;
write_guard.mark_valid();
fn get_buf_for_write(
&self,
blkno: u32,
) -> Result<page_cache::PageWriteGuard<'static>, io::Error> {
// Look up the right page
let cache = page_cache::get();
let mut write_guard = match cache
.write_ephemeral_buf(self.file_id, blkno)
.map_err(|e| to_io_error(e, "Failed to write ephemeral buf"))?
{
WriteBufResult::Found(guard) => guard,
WriteBufResult::NotFound(mut guard) => {
// Read the page from disk into the buffer
// TODO: if we're overwriting the whole page, no need to read it in first
self.fill_buffer(guard.deref_mut(), blkno)?;
guard.mark_valid();
// Swap for read lock
continue;
}
};
// And then fall through to modify it.
guard
}
} else {
debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64);
Ok(BlockLease::EphemeralFileMutableTail(&self.mutable_tail))
}
}
};
write_guard.mark_dirty();
pub(crate) async fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, io::Error> {
struct Writer<'a> {
ephemeral_file: &'a mut EphemeralFile,
/// The block to which the next [`push_bytes`] will write.
blknum: u32,
/// The offset inside the block identified by [`blknum`] to which [`push_bytes`] will write.
off: usize,
}
impl<'a> Writer<'a> {
fn new(ephemeral_file: &'a mut EphemeralFile) -> io::Result<Writer<'a>> {
Ok(Writer {
blknum: (ephemeral_file.len / PAGE_SZ as u64) as u32,
off: (ephemeral_file.len % PAGE_SZ as u64) as usize,
ephemeral_file,
})
}
#[inline(always)]
async fn push_bytes(&mut self, src: &[u8]) -> Result<(), io::Error> {
let mut src_remaining = src;
while !src_remaining.is_empty() {
let dst_remaining = &mut self.ephemeral_file.mutable_tail[self.off..];
let n = min(dst_remaining.len(), src_remaining.len());
dst_remaining[..n].copy_from_slice(&src_remaining[..n]);
self.off += n;
src_remaining = &src_remaining[n..];
if self.off == PAGE_SZ {
match self.ephemeral_file.file.write_all_at(
&self.ephemeral_file.mutable_tail,
self.blknum as u64 * PAGE_SZ as u64,
) {
Ok(_) => {
// Pre-warm the page cache with what we just wrote.
// This isn't necessary for coherency/correctness, but it's how we've always done it.
let cache = page_cache::get();
match cache
.read_immutable_buf(
self.ephemeral_file.page_cache_file_id,
self.blknum,
)
.await
{
Ok(page_cache::ReadBufResult::Found(_guard)) => {
// This function takes &mut self, so, it shouldn't be possible to reach this point.
unreachable!("we just wrote blknum {} and this function takes &mut self, so, no concurrent read_blk is possible", self.blknum);
}
Ok(page_cache::ReadBufResult::NotFound(mut write_guard)) => {
let buf: &mut [u8] = write_guard.deref_mut();
debug_assert_eq!(buf.len(), PAGE_SZ);
buf.copy_from_slice(&self.ephemeral_file.mutable_tail);
write_guard.mark_valid();
// pre-warm successful
}
Err(e) => {
error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
// fail gracefully, it's not the end of the world if we can't pre-warm the cache here
}
}
// Zero the buffer for re-use.
// Zeroing is critical for correcntess because the write_blob code below
// and similarly read_blk expect zeroed pages.
self.ephemeral_file.mutable_tail.fill(0);
// This block is done, move to next one.
self.blknum += 1;
self.off = 0;
}
Err(e) => {
return Err(std::io::Error::new(
ErrorKind::Other,
// order error before path because path is long and error is short
format!(
"ephemeral_file: write_blob: write-back full tail blk #{}: {:#}: {}",
self.blknum,
e,
self.ephemeral_file.file.path.display(),
),
));
}
}
}
}
Ok(())
}
}
let pos = self.len;
let mut writer = Writer::new(self)?;
// Write the length field
if srcbuf.len() < 0x80 {
// short one-byte length header
let len_buf = [srcbuf.len() as u8];
writer.push_bytes(&len_buf).await?;
} else {
let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
len_buf[0] |= 0x80;
writer.push_bytes(&len_buf).await?;
}
// Write the payload
writer.push_bytes(srcbuf).await?;
if srcbuf.len() < 0x80 {
self.len += 1;
} else {
self.len += 4;
}
self.len += srcbuf.len() as u64;
Ok(pos)
Ok(write_guard)
}
}
@@ -223,10 +138,99 @@ pub fn is_ephemeral_file(filename: &str) -> bool {
}
}
impl BlobWriter for EphemeralFile {
fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, io::Error> {
struct Writer<'a> {
ephemeral_file: &'a mut EphemeralFile,
/// The block to which the next [`push_bytes`] will write.
blknum: u32,
/// The offset inside the block identified by [`blknum`] to which [`push_bytes`] will write.
off: usize,
/// Used by [`push_bytes`] to memoize the page cache write guard across calls to it.
memo_page_guard: MemoizedPageWriteGuard,
}
struct MemoizedPageWriteGuard {
guard: page_cache::PageWriteGuard<'static>,
/// The block number of the page in `guard`.
blknum: u32,
}
impl<'a> Writer<'a> {
fn new(ephemeral_file: &'a mut EphemeralFile) -> io::Result<Writer<'a>> {
let blknum = (ephemeral_file.size / PAGE_SZ as u64) as u32;
Ok(Writer {
blknum,
off: (ephemeral_file.size % PAGE_SZ as u64) as usize,
memo_page_guard: MemoizedPageWriteGuard {
guard: ephemeral_file.get_buf_for_write(blknum)?,
blknum,
},
ephemeral_file,
})
}
#[inline(always)]
fn push_bytes(&mut self, src: &[u8]) -> Result<(), io::Error> {
// `src_remaining` is the remaining bytes to be written
let mut src_remaining = src;
while !src_remaining.is_empty() {
let page = if self.memo_page_guard.blknum == self.blknum {
&mut self.memo_page_guard.guard
} else {
self.memo_page_guard.guard =
self.ephemeral_file.get_buf_for_write(self.blknum)?;
self.memo_page_guard.blknum = self.blknum;
&mut self.memo_page_guard.guard
};
let dst_remaining = &mut page[self.off..];
let n = min(dst_remaining.len(), src_remaining.len());
dst_remaining[..n].copy_from_slice(&src_remaining[..n]);
self.off += n;
src_remaining = &src_remaining[n..];
if self.off == PAGE_SZ {
// This block is done, move to next one.
self.blknum += 1;
self.off = 0;
}
}
Ok(())
}
}
let pos = self.size;
let mut writer = Writer::new(self)?;
// Write the length field
if srcbuf.len() < 0x80 {
// short one-byte length header
let len_buf = [srcbuf.len() as u8];
writer.push_bytes(&len_buf)?;
} else {
let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
len_buf[0] |= 0x80;
writer.push_bytes(&len_buf)?;
}
// Write the payload
writer.push_bytes(srcbuf)?;
if srcbuf.len() < 0x80 {
self.size += 1;
} else {
self.size += 4;
}
self.size += srcbuf.len() as u64;
Ok(pos)
}
}
impl Drop for EphemeralFile {
fn drop(&mut self) {
// There might still be pages in the [`crate::page_cache`] for this file.
// We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed.
// drop all pages from page cache
let cache = page_cache::get();
cache.drop_buffers_for_ephemeral(self.file_id);
// remove entry from the hash map
EPHEMERAL_FILES.write().unwrap().files.remove(&self.file_id);
// unlink the file
let res = std::fs::remove_file(&self.file.path);
@@ -246,16 +250,59 @@ impl Drop for EphemeralFile {
}
}
impl BlockReader for EphemeralFile {
fn block_cursor(&self) -> super::block_io::BlockCursor<'_> {
BlockCursor::new(super::block_io::BlockReaderRef::EphemeralFile(self))
pub fn writeback(file_id: FileId, blkno: u32, buf: &[u8]) -> Result<(), io::Error> {
if let Some(file) = EPHEMERAL_FILES.read().unwrap().files.get(&file_id) {
match file.write_all_at(buf, blkno as u64 * PAGE_SZ as u64) {
Ok(_) => Ok(()),
Err(e) => Err(io::Error::new(
ErrorKind::Other,
format!(
"failed to write back to ephemeral file at {} error: {}",
file.path.display(),
e
),
)),
}
} else {
Err(io::Error::new(
ErrorKind::Other,
"could not write back page, not found in ephemeral files hash",
))
}
}
impl BlockReader for EphemeralFile {
fn read_blk(&self, blknum: u32) -> Result<BlockLease, io::Error> {
// Look up the right page
let cache = page_cache::get();
loop {
match cache
.read_ephemeral_buf(self.file_id, blknum)
.map_err(|e| to_io_error(e, "Failed to read ephemeral buf"))?
{
ReadBufResult::Found(guard) => return Ok(guard.into()),
ReadBufResult::NotFound(mut write_guard) => {
// Read the page from disk into the buffer
self.fill_buffer(write_guard.deref_mut(), blknum)?;
write_guard.mark_valid();
// Swap for read lock
continue;
}
};
}
}
}
fn to_io_error(e: anyhow::Error, context: &str) -> io::Error {
io::Error::new(ErrorKind::Other, format!("{context}: {e:#}"))
}
#[cfg(test)]
mod tests {
use super::*;
use crate::tenant::block_io::{BlockCursor, BlockReaderRef};
use crate::tenant::blob_io::BlobWriter;
use crate::tenant::block_io::BlockCursor;
use rand::{thread_rng, RngCore};
use std::fs;
use std::str::FromStr;
@@ -283,12 +330,12 @@ mod tests {
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id)?;
let pos_foo = file.write_blob(b"foo").await?;
let pos_foo = file.write_blob(b"foo")?;
assert_eq!(
b"foo",
file.block_cursor().read_blob(pos_foo).await?.as_slice()
);
let pos_bar = file.write_blob(b"bar").await?;
let pos_bar = file.write_blob(b"bar")?;
assert_eq!(
b"foo",
file.block_cursor().read_blob(pos_foo).await?.as_slice()
@@ -301,17 +348,17 @@ mod tests {
let mut blobs = Vec::new();
for i in 0..10000 {
let data = Vec::from(format!("blob{}", i).as_bytes());
let pos = file.write_blob(&data).await?;
let pos = file.write_blob(&data)?;
blobs.push((pos, data));
}
// also test with a large blobs
for i in 0..100 {
let data = format!("blob{}", i).as_bytes().repeat(100);
let pos = file.write_blob(&data).await?;
let pos = file.write_blob(&data)?;
blobs.push((pos, data));
}
let cursor = BlockCursor::new(BlockReaderRef::EphemeralFile(&file));
let cursor = BlockCursor::new(&file);
for (pos, expected) in blobs {
let actual = cursor.read_blob(pos).await?;
assert_eq!(actual, expected);
@@ -321,7 +368,7 @@ mod tests {
let mut large_data = Vec::new();
large_data.resize(20000, 0);
thread_rng().fill_bytes(&mut large_data);
let pos_large = file.write_blob(&large_data).await?;
let pos_large = file.write_blob(&large_data)?;
let result = file.block_cursor().read_blob(pos_large).await?;
assert_eq!(result, large_data);

View File

@@ -50,6 +50,7 @@ use crate::context::RequestContext;
use crate::keyspace::KeyPartitioning;
use crate::repository::Key;
use crate::tenant::storage_layer::InMemoryLayer;
use crate::tenant::storage_layer::Layer;
use anyhow::Result;
use std::collections::VecDeque;
use std::ops::Range;

View File

@@ -12,7 +12,7 @@ use std::fs::{File, OpenOptions};
use std::io::{self, Write};
use anyhow::{bail, ensure, Context};
use serde::{de::Error, Deserialize, Serialize, Serializer};
use serde::{Deserialize, Serialize};
use thiserror::Error;
use tracing::info_span;
use utils::bin_ser::SerializeError;
@@ -232,28 +232,6 @@ impl TimelineMetadata {
}
}
impl<'de> Deserialize<'de> for TimelineMetadata {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
let bytes = Vec::<u8>::deserialize(deserializer)?;
Self::from_bytes(bytes.as_slice()).map_err(|e| D::Error::custom(format!("{e}")))
}
}
impl Serialize for TimelineMetadata {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
let bytes = self
.to_bytes()
.map_err(|e| serde::ser::Error::custom(format!("{e}")))?;
bytes.serialize(serializer)
}
}
/// Save timeline metadata to file
pub fn save_metadata(
conf: &'static PageServerConf,

View File

@@ -27,9 +27,8 @@ use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};
use utils::fs_ext::PathExt;
use utils::id::{TenantId, TimelineId};
use super::delete::DeleteTenantError;
use super::delete::{remote_delete_mark_exists, DeleteTenantError};
use super::timeline::delete::DeleteTimelineFlow;
use super::TenantSharedResources;
/// The tenants known to the pageserver.
/// The enum variants are used to distinguish the different states that the pageserver can be in.
@@ -67,7 +66,8 @@ static TENANTS: Lazy<RwLock<TenantsMap>> = Lazy::new(|| RwLock::new(TenantsMap::
#[instrument(skip_all)]
pub async fn init_tenant_mgr(
conf: &'static PageServerConf,
resources: TenantSharedResources,
broker_client: storage_broker::BrokerClientChannel,
remote_storage: Option<GenericRemoteStorage>,
init_order: InitializationOrder,
) -> anyhow::Result<()> {
// Scan local filesystem for attached tenants
@@ -125,7 +125,8 @@ pub async fn init_tenant_mgr(
match schedule_local_tenant_processing(
conf,
&tenant_dir_path,
resources.clone(),
broker_client.clone(),
remote_storage.clone(),
Some(init_order.clone()),
&TENANTS,
&ctx,
@@ -161,7 +162,8 @@ pub async fn init_tenant_mgr(
pub(crate) fn schedule_local_tenant_processing(
conf: &'static PageServerConf,
tenant_path: &Path,
resources: TenantSharedResources,
broker_client: storage_broker::BrokerClientChannel,
remote_storage: Option<GenericRemoteStorage>,
init_order: Option<InitializationOrder>,
tenants: &'static tokio::sync::RwLock<TenantsMap>,
ctx: &RequestContext,
@@ -198,15 +200,8 @@ pub(crate) fn schedule_local_tenant_processing(
let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
if let Some(remote_storage) = resources.remote_storage {
match Tenant::spawn_attach(
conf,
tenant_id,
resources.broker_client,
tenants,
remote_storage,
ctx,
) {
if let Some(remote_storage) = remote_storage {
match Tenant::spawn_attach(conf, tenant_id, broker_client, remote_storage, ctx) {
Ok(tenant) => tenant,
Err(e) => {
error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
@@ -224,7 +219,15 @@ pub(crate) fn schedule_local_tenant_processing(
} else {
info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
// Start loading the tenant into memory. It will initially be in Loading state.
Tenant::spawn_load(conf, tenant_id, resources, init_order, tenants, ctx)
Tenant::spawn_load(
conf,
tenant_id,
broker_client,
remote_storage,
init_order,
tenants,
ctx,
)
};
Ok(tenant)
}
@@ -359,12 +362,8 @@ pub async fn create_tenant(
// TODO: tenant directory remains on disk if we bail out from here on.
// See https://github.com/neondatabase/neon/issues/4233
let tenant_resources = TenantSharedResources {
broker_client,
remote_storage,
};
let created_tenant =
schedule_local_tenant_processing(conf, &tenant_directory, tenant_resources, None, &TENANTS, ctx)?;
schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, None, &TENANTS, ctx)?;
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
// See https://github.com/neondatabase/neon/issues/4233
@@ -523,11 +522,7 @@ pub async fn load_tenant(
.with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?;
}
let resources = TenantSharedResources {
broker_client,
remote_storage,
};
let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, resources, None, &TENANTS, ctx)
let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, None, &TENANTS, ctx)
.with_context(|| {
format!("Failed to schedule tenant processing in path {tenant_path:?}")
})?;
@@ -596,6 +591,12 @@ pub async fn attach_tenant(
remote_storage: GenericRemoteStorage,
ctx: &RequestContext,
) -> Result<(), TenantMapInsertError> {
// Temporary solution, proper one would be to resume deletion, but that needs more plumbing around Tenant::load/Tenant::attach
// Corresponding issue https://github.com/neondatabase/neon/issues/5006
if remote_delete_mark_exists(conf, &tenant_id, &remote_storage).await? {
return Err(anyhow::anyhow!("Tenant is marked as deleted on remote storage").into());
}
tenant_map_insert(tenant_id, || {
let tenant_dir = create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Attach)?;
// TODO: tenant directory remains on disk if we bail out from here on.
@@ -608,11 +609,7 @@ pub async fn attach_tenant(
.context("check for attach marker file existence")?;
anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");
let resources = TenantSharedResources {
broker_client,
remote_storage: Some(remote_storage),
};
let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, resources, None, &TENANTS, ctx)?;
let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), None, &TENANTS, ctx)?;
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
// See https://github.com/neondatabase/neon/issues/4233

View File

@@ -135,7 +135,7 @@
//! - Initiate upload queue with that [`IndexPart`].
//! - Reschedule all lost operations by comparing the local filesystem state
//! and remote state as per [`IndexPart`]. This is done in
//! [`Tenant::timeline_init_and_sync`].
//! [`Tenant::timeline_init_and_sync`] and [`Timeline::reconcile_with_remote`].
//!
//! Note that if we crash during file deletion between the index update
//! that removes the file from the list of files, and deleting the remote file,
@@ -172,6 +172,7 @@
//! transitioning it from `TenantState::Attaching` to `TenantState::Active` state.
//! This starts the timelines' WAL-receivers and the tenant's GC & Compaction loops.
//!
//! Most of the above steps happen in [`Timeline::reconcile_with_remote`] or its callers.
//! We keep track of the fact that a client is in `Attaching` state in a marker
//! file on the local disk. This is critical because, when we restart the pageserver,
//! we do not want to do the `List timelines` step for each tenant that has already
@@ -191,14 +192,14 @@
//! not created and the uploads are skipped.
//! Theoretically, it should be ok to remove and re-add remote storage configuration to
//! the pageserver config at any time, since it doesn't make a difference to
//! [`Timeline::load_layer_map`].
//! `reconcile_with_remote`.
//! Of course, the remote timeline dir must not change while we have de-configured
//! remote storage, i.e., the pageserver must remain the owner of the given prefix
//! in remote storage.
//! But note that we don't test any of this right now.
//!
//! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
//! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map
//! [`Timeline::reconcile_with_remote`]: super::Timeline::reconcile_with_remote
mod delete;
mod download;
@@ -210,7 +211,6 @@ use chrono::{NaiveDateTime, Utc};
// re-export these
pub use download::{is_temp_download_file, list_remote_timelines};
use scopeguard::ScopeGuard;
use tokio_util::sync::CancellationToken;
use utils::backoff::{
self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
};
@@ -231,7 +231,6 @@ use crate::metrics::{
RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
};
use crate::task_mgr::shutdown_token;
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
use crate::tenant::upload_queue::Delete;
@@ -354,10 +353,6 @@ impl RemoteTimelineClient {
let mut upload_queue = self.upload_queue.lock().unwrap();
upload_queue.initialize_with_current_remote_index_part(index_part)?;
self.update_remote_physical_size_gauge(Some(index_part));
info!(
"initialized upload queue from remote index with {} layer files",
index_part.layer_metadata.len()
);
Ok(())
}
@@ -370,7 +365,6 @@ impl RemoteTimelineClient {
let mut upload_queue = self.upload_queue.lock().unwrap();
upload_queue.initialize_empty_remote(local_metadata)?;
self.update_remote_physical_size_gauge(None);
info!("initialized upload queue as empty");
Ok(())
}
@@ -541,7 +535,8 @@ impl RemoteTimelineClient {
// ahead of what's _actually_ on the remote during index upload.
upload_queue.latest_metadata = metadata.clone();
self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());
let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
self.schedule_index_upload(upload_queue, metadata_bytes);
Ok(())
}
@@ -561,7 +556,8 @@ impl RemoteTimelineClient {
let upload_queue = guard.initialized_mut()?;
if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());
let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
self.schedule_index_upload(upload_queue, metadata_bytes);
}
Ok(())
@@ -571,7 +567,7 @@ impl RemoteTimelineClient {
fn schedule_index_upload(
self: &Arc<Self>,
upload_queue: &mut UploadQueueInitialized,
metadata: TimelineMetadata,
metadata_bytes: Vec<u8>,
) {
info!(
"scheduling metadata upload with {} files ({} changed)",
@@ -584,7 +580,7 @@ impl RemoteTimelineClient {
let index_part = IndexPart::new(
upload_queue.latest_files.clone(),
disk_consistent_lsn,
metadata,
metadata_bytes,
);
let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
self.calls_unfinished_metric_begin(&op);
@@ -640,7 +636,7 @@ impl RemoteTimelineClient {
// Deleting layers doesn't affect the values stored in TimelineMetadata,
// so we don't need update it. Just serialize it.
let metadata = upload_queue.latest_metadata.clone();
let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
// Update the remote index file, removing the to-be-deleted files from the index,
// before deleting the actual files.
@@ -651,13 +647,12 @@ impl RemoteTimelineClient {
// to syntactically forbid ? or bail! calls here.
let no_bail_here = || {
for name in names {
if upload_queue.latest_files.remove(name).is_some() {
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
}
upload_queue.latest_files.remove(name);
upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
}
if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
self.schedule_index_upload(upload_queue, metadata);
self.schedule_index_upload(upload_queue, metadata_bytes);
}
// schedule the actual deletions
@@ -759,7 +754,7 @@ impl RemoteTimelineClient {
pausable_failpoint!("persist_deleted_index_part");
backoff::retry(
|| {
|| async {
upload::upload_index_part(
self.conf,
&self.storage_impl,
@@ -767,6 +762,7 @@ impl RemoteTimelineClient {
&self.timeline_id,
&index_part_with_deleted_at,
)
.await
},
|_e| false,
1,
@@ -775,8 +771,6 @@ impl RemoteTimelineClient {
// when executed as part of tenant deletion this happens in the background
2,
"persist_index_part_with_deleted_flag",
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
)
.await?;
@@ -863,7 +857,6 @@ impl RemoteTimelineClient {
FAILED_DOWNLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"list_prefixes",
backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled!")),
)
.await
.context("list prefixes")?;
@@ -887,7 +880,6 @@ impl RemoteTimelineClient {
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"delete_objects",
backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled!")),
)
.await
.context("delete_objects")?;
@@ -909,7 +901,6 @@ impl RemoteTimelineClient {
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"delete_index",
backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled")),
)
.await
.context("delete_index")?;
@@ -1075,15 +1066,6 @@ impl RemoteTimelineClient {
.await
}
UploadOp::UploadMetadata(ref index_part, _lsn) => {
let mention_having_future_layers = if cfg!(feature = "testing") {
index_part
.layer_metadata
.keys()
.any(|x| x.is_in_future(*_lsn))
} else {
false
};
let res = upload::upload_index_part(
self.conf,
&self.storage_impl,
@@ -1101,10 +1083,6 @@ impl RemoteTimelineClient {
.await;
if res.is_ok() {
self.update_remote_physical_size_gauge(Some(index_part));
if mention_having_future_layers {
// find rationale near crate::tenant::timeline::init::cleanup_future_layer
tracing::info!(disk_consistent_lsn=%_lsn, "uploaded an index_part.json with future layers -- this is ok! if shutdown now, expect future layer cleanup");
}
}
res
}
@@ -1156,13 +1134,14 @@ impl RemoteTimelineClient {
}
// sleep until it's time to retry, or we're cancelled
exponential_backoff(
retries,
DEFAULT_BASE_BACKOFF_SECONDS,
DEFAULT_MAX_BACKOFF_SECONDS,
&shutdown_token(),
)
.await;
tokio::select! {
_ = task_mgr::shutdown_watcher() => { },
_ = exponential_backoff(
retries,
DEFAULT_BASE_BACKOFF_SECONDS,
DEFAULT_MAX_BACKOFF_SECONDS,
) => { },
};
}
}
}
@@ -1599,17 +1578,14 @@ mod tests {
};
assert_file_list(
&index_part
.layer_metadata
.keys()
.map(|f| f.to_owned())
.collect(),
&index_part.timeline_layers,
&[
&layer_file_name_1.file_name(),
&layer_file_name_2.file_name(),
],
);
assert_eq!(index_part.metadata, metadata);
let downloaded_metadata = index_part.parse_metadata().unwrap();
assert_eq!(downloaded_metadata, metadata);
// Schedule upload and then a deletion. Check that the deletion is queued
client

View File

@@ -11,7 +11,6 @@ use std::time::Duration;
use anyhow::{anyhow, Context};
use tokio::fs;
use tokio::io::AsyncWriteExt;
use tokio_util::sync::CancellationToken;
use utils::{backoff, crashsafe};
use crate::config::PageServerConf;
@@ -281,10 +280,6 @@ where
FAILED_DOWNLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
description,
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
backoff::Cancel::new(CancellationToken::new(), || -> DownloadError {
unreachable!()
}),
)
.await
}

View File

@@ -62,9 +62,10 @@ pub struct IndexPart {
#[serde(skip_serializing_if = "Option::is_none")]
pub deleted_at: Option<NaiveDateTime>,
/// Legacy field: equal to the keys of `layer_metadata`, only written out for forward compat
#[serde(default, skip_deserializing)]
timeline_layers: HashSet<LayerFileName>,
/// Layer names, which are stored on the remote storage.
///
/// Additional metadata can might exist in `layer_metadata`.
pub timeline_layers: HashSet<LayerFileName>,
/// Per layer file name metadata, which can be present for a present or missing layer file.
///
@@ -73,13 +74,10 @@ pub struct IndexPart {
pub layer_metadata: HashMap<LayerFileName, IndexLayerMetadata>,
// 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
// It's duplicated for convenience when reading the serialized structure, but is
// private because internally we would read from metadata instead.
// It's duplicated here for convenience.
#[serde_as(as = "DisplayFromStr")]
disk_consistent_lsn: Lsn,
#[serde(rename = "metadata_bytes")]
pub metadata: TimelineMetadata,
pub disk_consistent_lsn: Lsn,
metadata_bytes: Vec<u8>,
}
impl IndexPart {
@@ -87,17 +85,13 @@ impl IndexPart {
/// used to understand later versions.
///
/// Version is currently informative only.
/// Version history
/// - 2: added `deleted_at`
/// - 3: no longer deserialize `timeline_layers` (serialized format is the same, but timeline_layers
/// is always generated from the keys of `layer_metadata`)
const LATEST_VERSION: usize = 3;
const LATEST_VERSION: usize = 2;
pub const FILE_NAME: &'static str = "index_part.json";
pub fn new(
layers_and_metadata: HashMap<LayerFileName, LayerFileMetadata>,
disk_consistent_lsn: Lsn,
metadata: TimelineMetadata,
metadata_bytes: Vec<u8>,
) -> Self {
let mut timeline_layers = HashSet::with_capacity(layers_and_metadata.len());
let mut layer_metadata = HashMap::with_capacity(layers_and_metadata.len());
@@ -113,10 +107,14 @@ impl IndexPart {
timeline_layers,
layer_metadata,
disk_consistent_lsn,
metadata,
metadata_bytes,
deleted_at: None,
}
}
pub fn parse_metadata(&self) -> anyhow::Result<TimelineMetadata> {
TimelineMetadata::from_bytes(&self.metadata_bytes)
}
}
impl TryFrom<&UploadQueueInitialized> for IndexPart {
@@ -124,12 +122,12 @@ impl TryFrom<&UploadQueueInitialized> for IndexPart {
fn try_from(upload_queue: &UploadQueueInitialized) -> Result<Self, Self::Error> {
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
let metadata = upload_queue.latest_metadata.clone();
let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
Ok(Self::new(
upload_queue.latest_files.clone(),
disk_consistent_lsn,
metadata,
metadata_bytes,
))
}
}
@@ -168,7 +166,7 @@ mod tests {
let expected = IndexPart {
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
version: 1,
timeline_layers: HashSet::new(),
timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
file_size: 25600000,
@@ -180,7 +178,7 @@ mod tests {
})
]),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
deleted_at: None,
};
@@ -199,13 +197,13 @@ mod tests {
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
},
"disk_consistent_lsn":"0/16960E8",
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
"metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
}"#;
let expected = IndexPart {
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
version: 1,
timeline_layers: HashSet::new(),
timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
file_size: 25600000,
@@ -217,7 +215,7 @@ mod tests {
})
]),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
deleted_at: None,
};
@@ -236,14 +234,14 @@ mod tests {
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
},
"disk_consistent_lsn":"0/16960E8",
"metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
"metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
"deleted_at": "2023-07-31T09:00:00.123"
}"#;
let expected = IndexPart {
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
version: 2,
timeline_layers: HashSet::new(),
timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
file_size: 25600000,
@@ -255,7 +253,7 @@ mod tests {
})
]),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
};
@@ -279,7 +277,7 @@ mod tests {
timeline_layers: HashSet::new(),
layer_metadata: HashMap::new(),
disk_consistent_lsn: "0/2532648".parse::<Lsn>().unwrap(),
metadata: TimelineMetadata::from_bytes(&[
metadata_bytes: [
136, 151, 49, 208, 0, 70, 0, 4, 0, 0, 0, 0, 2, 83, 38, 72, 1, 0, 0, 0, 0, 2, 83,
38, 32, 1, 87, 198, 240, 135, 97, 119, 45, 125, 38, 29, 155, 161, 140, 141, 255,
210, 0, 0, 0, 0, 2, 83, 38, 72, 0, 0, 0, 0, 1, 73, 240, 192, 0, 0, 0, 0, 1, 73,
@@ -300,8 +298,8 @@ mod tests {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0,
])
.unwrap(),
]
.to_vec(),
deleted_at: None,
};

View File

@@ -41,6 +41,8 @@ pub use inmemory_layer::InMemoryLayer;
pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
pub use remote_layer::RemoteLayer;
use super::timeline::layer_manager::LayerManager;
pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
where
T: PartialOrd<T>,
@@ -173,9 +175,16 @@ impl LayerAccessStats {
///
/// [`LayerLoad`]: LayerResidenceEventReason::LayerLoad
/// [`record_residence_event`]: Self::record_residence_event
pub(crate) fn for_loading_layer(status: LayerResidenceStatus) -> Self {
pub(crate) fn for_loading_layer(
layer_map_lock_held_witness: &LayerManager,
status: LayerResidenceStatus,
) -> Self {
let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default()));
new.record_residence_event(status, LayerResidenceEventReason::LayerLoad);
new.record_residence_event(
layer_map_lock_held_witness,
status,
LayerResidenceEventReason::LayerLoad,
);
new
}
@@ -188,6 +197,7 @@ impl LayerAccessStats {
/// [`record_residence_event`]: Self::record_residence_event
pub(crate) fn clone_for_residence_change(
&self,
layer_map_lock_held_witness: &LayerManager,
new_status: LayerResidenceStatus,
) -> LayerAccessStats {
let clone = {
@@ -195,7 +205,11 @@ impl LayerAccessStats {
inner.clone()
};
let new = LayerAccessStats(Mutex::new(clone));
new.record_residence_event(new_status, LayerResidenceEventReason::ResidenceChange);
new.record_residence_event(
layer_map_lock_held_witness,
new_status,
LayerResidenceEventReason::ResidenceChange,
);
new
}
@@ -215,6 +229,7 @@ impl LayerAccessStats {
///
pub(crate) fn record_residence_event(
&self,
_layer_map_lock_held_witness: &LayerManager,
status: LayerResidenceStatus,
reason: LayerResidenceEventReason,
) {
@@ -329,6 +344,23 @@ impl LayerAccessStats {
/// are used in (timeline).
#[async_trait::async_trait]
pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
/// Range of keys that this layer covers
fn get_key_range(&self) -> Range<Key>;
/// Inclusive start bound of the LSN range that this layer holds
/// Exclusive end bound of the LSN range that this layer holds.
///
/// - For an open in-memory layer, this is MAX_LSN.
/// - For a frozen in-memory layer or a delta layer, this is a valid end bound.
/// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1
fn get_lsn_range(&self) -> Range<Lsn>;
/// Does this layer only contain some data for the key-range (incremental),
/// or does it contain a version of every page? This is important to know
/// for garbage collecting old layers: an incremental layer depends on
/// the previous non-incremental layer.
fn is_incremental(&self) -> bool;
///
/// Return data needed to reconstruct given page at LSN.
///
@@ -348,6 +380,9 @@ pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
reconstruct_data: &mut ValueReconstructState,
ctx: &RequestContext,
) -> Result<ValueReconstructResult>;
/// Dump summary of the contents of the layer to stdout
async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
}
/// Get a layer descriptor from a layer.
@@ -432,6 +467,7 @@ pub mod tests {
TimelineId::from_array([0; 16]),
value.key_range,
value.lsn,
false,
233,
)
}

View File

@@ -51,6 +51,7 @@ use std::ops::Range;
use std::os::unix::fs::FileExt;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use tokio::runtime::Handle;
use tokio::sync::OnceCell;
use tracing::*;
@@ -176,6 +177,10 @@ impl DeltaKey {
Lsn(u64::from_be_bytes(self.0[KEY_SIZE..].try_into().unwrap()))
}
fn extract_key_from_buf(buf: &[u8]) -> Key {
Key::from_slice(&buf[..KEY_SIZE])
}
fn extract_lsn_from_buf(buf: &[u8]) -> Lsn {
let mut lsn_buf = [0u8; 8];
lsn_buf.copy_from_slice(&buf[KEY_SIZE..]);
@@ -239,54 +244,8 @@ impl std::fmt::Debug for DeltaLayerInner {
#[async_trait::async_trait]
impl Layer for DeltaLayer {
async fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
reconstruct_state: &mut ValueReconstructState,
ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
self.get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx)
.await
}
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
impl std::fmt::Display for DeltaLayer {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.layer_desc().short_id())
}
}
impl AsLayerDesc for DeltaLayer {
fn layer_desc(&self) -> &PersistentLayerDesc {
&self.desc
}
}
impl PersistentLayer for DeltaLayer {
fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
Some(self)
}
fn local_path(&self) -> Option<PathBuf> {
self.local_path()
}
fn delete_resident_layer_file(&self) -> Result<()> {
self.delete_resident_layer_file()
}
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
self.info(reset)
}
fn access_stats(&self) -> &LayerAccessStats {
self.access_stats()
}
}
impl DeltaLayer {
pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
/// debugging function to print out the contents of the layer
async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
println!(
"----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} size {} ----",
self.desc.tenant_id,
@@ -318,11 +277,12 @@ impl DeltaLayer {
tree_reader.dump().await?;
let keys = DeltaLayerInner::load_keys(&inner).await?;
let cursor = file.block_cursor();
// A subroutine to dump a single blob
async fn dump_blob(val: ValueRef<'_>) -> Result<String> {
let buf = val.reader.read_blob(val.blob_ref.pos()).await?;
let dump_blob = |blob_ref: BlobRef| -> anyhow::Result<String> {
// TODO this is not ideal, but on the other hand we are in dumping code...
let buf = Handle::current().block_on(cursor.read_blob(blob_ref.pos()))?;
let val = Value::des(&buf)?;
let desc = match val {
Value::Image(img) => {
@@ -339,24 +299,31 @@ impl DeltaLayer {
}
};
Ok(desc)
}
};
for entry in keys {
let DeltaEntry { key, lsn, val, .. } = entry;
let desc = match dump_blob(val).await {
Ok(desc) => desc,
Err(err) => {
let err: anyhow::Error = err;
format!("ERROR: {err}")
}
};
println!(" key {key} at {lsn}: {desc}");
}
tree_reader
.visit(
&[0u8; DELTA_KEY_SIZE],
VisitDirection::Forwards,
|delta_key, val| {
let blob_ref = BlobRef(val);
let key = DeltaKey::extract_key_from_buf(delta_key);
let lsn = DeltaKey::extract_lsn_from_buf(delta_key);
let desc = match dump_blob(blob_ref) {
Ok(desc) => desc,
Err(err) => format!("ERROR: {}", err),
};
println!(" key {} at {}: {}", key, lsn, desc);
true
},
)
.await?;
Ok(())
}
pub(crate) async fn get_value_reconstruct_data(
async fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
@@ -375,19 +342,52 @@ impl DeltaLayer {
.await
}
pub(crate) fn local_path(&self) -> Option<PathBuf> {
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn get_key_range(&self) -> Range<Key> {
self.layer_desc().key_range.clone()
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn get_lsn_range(&self) -> Range<Lsn> {
self.layer_desc().lsn_range.clone()
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn is_incremental(&self) -> bool {
self.layer_desc().is_incremental
}
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
impl std::fmt::Display for DeltaLayer {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.layer_desc().short_id())
}
}
impl AsLayerDesc for DeltaLayer {
fn layer_desc(&self) -> &PersistentLayerDesc {
&self.desc
}
}
impl PersistentLayer for DeltaLayer {
fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
Some(self)
}
fn local_path(&self) -> Option<PathBuf> {
Some(self.path())
}
pub(crate) fn delete_resident_layer_file(&self) -> Result<()> {
fn delete_resident_layer_file(&self) -> Result<()> {
// delete underlying file
fs::remove_file(self.path())?;
Ok(())
}
pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
let layer_file_name = self.layer_desc().filename().file_name();
let lsn_range = self.layer_desc().lsn_range.clone();
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
let layer_file_name = self.filename().file_name();
let lsn_range = self.get_lsn_range();
let access_stats = self.access_stats.as_api_model(reset);
@@ -401,10 +401,12 @@ impl DeltaLayer {
}
}
pub(crate) fn access_stats(&self) -> &LayerAccessStats {
fn access_stats(&self) -> &LayerAccessStats {
&self.access_stats
}
}
impl DeltaLayer {
fn path_for(
path_or_conf: &PathOrConf,
tenant_id: &TenantId,
@@ -467,7 +469,7 @@ impl DeltaLayer {
PathOrConf::Path(_) => None,
};
let loaded = DeltaLayerInner::load(&path, summary).await?;
let loaded = DeltaLayerInner::load(&path, summary)?;
if let PathOrConf::Path(ref path) = self.path_or_conf {
// not production code
@@ -550,12 +552,17 @@ impl DeltaLayer {
/// Loads all keys stored in the layer. Returns key, lsn, value size and value reference.
///
/// The value can be obtained via the [`ValueRef::load`] function.
pub(crate) async fn load_keys(&self, ctx: &RequestContext) -> Result<Vec<DeltaEntry<'_>>> {
pub(crate) async fn load_keys(
&self,
ctx: &RequestContext,
) -> Result<Vec<DeltaEntry<Ref<&'_ DeltaLayerInner>>>> {
let inner = self
.load(LayerAccessKind::KeyIter, ctx)
.await
.context("load delta layer keys")?;
DeltaLayerInner::load_keys(inner)
let inner = Ref(&**inner);
DeltaLayerInner::load_keys(&inner)
.await
.context("Layer index is corrupted")
}
@@ -841,15 +848,12 @@ impl Drop for DeltaLayerWriter {
}
impl DeltaLayerInner {
pub(super) async fn load(
path: &std::path::Path,
summary: Option<Summary>,
) -> anyhow::Result<Self> {
pub(super) fn load(path: &std::path::Path, summary: Option<Summary>) -> anyhow::Result<Self> {
let file = VirtualFile::open(path)
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
let file = FileBlockReader::new(file);
let summary_blk = file.read_blk(0).await?;
let summary_blk = file.read_blk(0)?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
if let Some(mut expected_summary) = summary {
@@ -954,14 +958,14 @@ impl DeltaLayerInner {
pub(super) async fn load_keys<T: AsRef<DeltaLayerInner> + Clone>(
this: &T,
) -> Result<Vec<DeltaEntry<'_>>> {
) -> Result<Vec<DeltaEntry<T>>> {
let dl = this.as_ref();
let file = &dl.file;
let tree_reader =
DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(dl.index_start_blk, dl.index_root_blk, file);
let mut all_keys: Vec<DeltaEntry<'_>> = Vec::new();
let mut all_keys: Vec<DeltaEntry<T>> = Vec::new();
tree_reader
.visit(
@@ -971,9 +975,7 @@ impl DeltaLayerInner {
let delta_key = DeltaKey::from_slice(key);
let val_ref = ValueRef {
blob_ref: BlobRef(value),
reader: BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter(
Adapter(dl),
)),
reader: BlockCursor::new(Adapter(this.clone())),
};
let pos = BlobRef(value).pos();
if let Some(last) = all_keys.last_mut() {
@@ -1002,23 +1004,43 @@ impl DeltaLayerInner {
}
}
/// Cloneable borrow wrapper to make borrows behave like smart pointers.
///
/// Shared references are trivially copyable. This wrapper avoids (confusion) to otherwise attempt
/// cloning DeltaLayerInner.
pub(crate) struct Ref<T>(T);
impl<'a, T> AsRef<T> for Ref<&'a T> {
fn as_ref(&self) -> &T {
self.0
}
}
impl<'a, T> Clone for Ref<&'a T> {
fn clone(&self) -> Self {
*self
}
}
impl<'a, T> Copy for Ref<&'a T> {}
/// A set of data associated with a delta layer key and its value
pub struct DeltaEntry<'a> {
pub struct DeltaEntry<T: AsRef<DeltaLayerInner>> {
pub key: Key,
pub lsn: Lsn,
/// Size of the stored value
pub size: u64,
/// Reference to the on-disk value
pub val: ValueRef<'a>,
pub val: ValueRef<T>,
}
/// Reference to an on-disk value
pub struct ValueRef<'a> {
pub struct ValueRef<T: AsRef<DeltaLayerInner>> {
blob_ref: BlobRef,
reader: BlockCursor<'a>,
reader: BlockCursor<Adapter<T>>,
}
impl<'a> ValueRef<'a> {
impl<T: AsRef<DeltaLayerInner>> ValueRef<T> {
/// Loads the value from disk
pub async fn load(&self) -> Result<Value> {
// theoretically we *could* record an access time for each, but it does not really matter
@@ -1028,10 +1050,10 @@ impl<'a> ValueRef<'a> {
}
}
pub(crate) struct Adapter<T>(T);
struct Adapter<T: AsRef<DeltaLayerInner>>(T);
impl<T: AsRef<DeltaLayerInner>> Adapter<T> {
pub(crate) async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
self.0.as_ref().file.read_blk(blknum).await
impl<T: AsRef<DeltaLayerInner>> BlockReader for Adapter<T> {
fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
self.0.as_ref().file.read_blk(blknum)
}
}

View File

@@ -212,20 +212,9 @@ pub enum LayerFileName {
}
impl LayerFileName {
pub(crate) fn file_name(&self) -> String {
pub fn file_name(&self) -> String {
self.to_string()
}
/// Determines if this layer file is considered to be in future meaning we will discard these
/// layers during timeline initialization from the given disk_consistent_lsn.
pub(crate) fn is_in_future(&self, disk_consistent_lsn: Lsn) -> bool {
use LayerFileName::*;
match self {
Image(file_name) if file_name.lsn > disk_consistent_lsn => true,
Delta(file_name) if file_name.lsn_range.end > disk_consistent_lsn + 1 => true,
_ => false,
}
}
}
impl fmt::Display for LayerFileName {
@@ -274,8 +263,8 @@ impl serde::Serialize for LayerFileName {
S: serde::Serializer,
{
match self {
Self::Image(fname) => serializer.collect_str(fname),
Self::Delta(fname) => serializer.collect_str(fname),
Self::Image(fname) => serializer.serialize_str(&fname.to_string()),
Self::Delta(fname) => serializer.serialize_str(&fname.to_string()),
}
}
}

View File

@@ -169,52 +169,8 @@ impl std::fmt::Debug for ImageLayerInner {
#[async_trait::async_trait]
impl Layer for ImageLayer {
/// Look up given page in the file
async fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
reconstruct_state: &mut ValueReconstructState,
ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
self.get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx)
.await
}
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
impl std::fmt::Display for ImageLayer {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.layer_desc().short_id())
}
}
impl AsLayerDesc for ImageLayer {
fn layer_desc(&self) -> &PersistentLayerDesc {
&self.desc
}
}
impl PersistentLayer for ImageLayer {
fn local_path(&self) -> Option<PathBuf> {
self.local_path()
}
fn delete_resident_layer_file(&self) -> Result<()> {
self.delete_resident_layer_file()
}
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
self.info(reset)
}
fn access_stats(&self) -> &LayerAccessStats {
self.access_stats()
}
}
impl ImageLayer {
pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
/// debugging function to print out the contents of the layer
async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
println!(
"----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
self.desc.tenant_id,
@@ -222,7 +178,7 @@ impl ImageLayer {
self.desc.key_range.start,
self.desc.key_range.end,
self.lsn,
self.desc.is_incremental(),
self.desc.is_incremental,
self.desc.file_size
);
@@ -247,7 +203,8 @@ impl ImageLayer {
Ok(())
}
pub(crate) async fn get_value_reconstruct_data(
/// Look up given page in the file
async fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
@@ -268,33 +225,65 @@ impl ImageLayer {
.with_context(|| format!("read {}", self.path().display()))
}
pub(crate) fn local_path(&self) -> Option<PathBuf> {
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn get_key_range(&self) -> Range<Key> {
self.layer_desc().key_range.clone()
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn get_lsn_range(&self) -> Range<Lsn> {
self.layer_desc().lsn_range.clone()
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn is_incremental(&self) -> bool {
self.layer_desc().is_incremental
}
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
impl std::fmt::Display for ImageLayer {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.layer_desc().short_id())
}
}
impl AsLayerDesc for ImageLayer {
fn layer_desc(&self) -> &PersistentLayerDesc {
&self.desc
}
}
impl PersistentLayer for ImageLayer {
fn local_path(&self) -> Option<PathBuf> {
Some(self.path())
}
pub(crate) fn delete_resident_layer_file(&self) -> Result<()> {
fn delete_resident_layer_file(&self) -> Result<()> {
// delete underlying file
fs::remove_file(self.path())?;
Ok(())
}
pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
let layer_file_name = self.layer_desc().filename().file_name();
let lsn_start = self.layer_desc().image_layer_lsn();
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
let layer_file_name = self.filename().file_name();
let lsn_range = self.get_lsn_range();
HistoricLayerInfo::Image {
layer_file_name,
layer_file_size: self.desc.file_size,
lsn_start,
lsn_start: lsn_range.start,
remote: false,
access_stats: self.access_stats.as_api_model(reset),
}
}
pub(crate) fn access_stats(&self) -> &LayerAccessStats {
fn access_stats(&self) -> &LayerAccessStats {
&self.access_stats
}
}
impl ImageLayer {
fn path_for(
path_or_conf: &PathOrConf,
timeline_id: TimelineId,
@@ -349,8 +338,7 @@ impl ImageLayer {
PathOrConf::Path(_) => None,
};
let loaded =
ImageLayerInner::load(&path, self.desc.image_layer_lsn(), expected_summary).await?;
let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), expected_summary)?;
if let PathOrConf::Path(ref path) = self.path_or_conf {
// not production code
@@ -383,6 +371,7 @@ impl ImageLayer {
timeline_id,
filename.key_range.clone(),
filename.lsn,
false,
file_size,
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
lsn: filename.lsn,
@@ -409,6 +398,7 @@ impl ImageLayer {
summary.timeline_id,
summary.key_range,
summary.lsn,
false,
metadata.len(),
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
lsn: summary.lsn,
@@ -433,7 +423,7 @@ impl ImageLayer {
}
impl ImageLayerInner {
pub(super) async fn load(
pub(super) fn load(
path: &std::path::Path,
lsn: Lsn,
summary: Option<Summary>,
@@ -441,7 +431,7 @@ impl ImageLayerInner {
let file = VirtualFile::open(path)
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
let file = FileBlockReader::new(file);
let summary_blk = file.read_blk(0).await?;
let summary_blk = file.read_blk(0)?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
if let Some(mut expected_summary) = summary {
@@ -510,6 +500,7 @@ struct ImageLayerWriterInner {
tenant_id: TenantId,
key_range: Range<Key>,
lsn: Lsn,
is_incremental: bool,
blob_writer: WriteBlobWriter<VirtualFile>,
tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
@@ -525,6 +516,7 @@ impl ImageLayerWriterInner {
tenant_id: TenantId,
key_range: &Range<Key>,
lsn: Lsn,
is_incremental: bool,
) -> anyhow::Result<Self> {
// Create the file initially with a temporary filename.
// We'll atomically rename it to the final name when we're done.
@@ -559,6 +551,7 @@ impl ImageLayerWriterInner {
lsn,
tree: tree_builder,
blob_writer,
is_incremental,
};
Ok(writer)
@@ -619,6 +612,7 @@ impl ImageLayerWriterInner {
self.timeline_id,
self.key_range.clone(),
self.lsn,
self.is_incremental, // for now, image layer ALWAYS covers the full range
metadata.len(),
);
@@ -693,6 +687,7 @@ impl ImageLayerWriter {
tenant_id: TenantId,
key_range: &Range<Key>,
lsn: Lsn,
is_incremental: bool,
) -> anyhow::Result<ImageLayerWriter> {
Ok(Self {
inner: Some(ImageLayerWriterInner::new(
@@ -701,6 +696,7 @@ impl ImageLayerWriter {
tenant_id,
key_range,
lsn,
is_incremental,
)?),
})
}

View File

@@ -7,12 +7,14 @@
use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::repository::{Key, Value};
use crate::tenant::blob_io::BlobWriter;
use crate::tenant::block_io::BlockReader;
use crate::tenant::ephemeral_file::EphemeralFile;
use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
use crate::walrecord;
use anyhow::{ensure, Result};
use pageserver_api::models::InMemoryLayerInfo;
use std::cell::RefCell;
use std::collections::HashMap;
use std::sync::OnceLock;
use tracing::*;
@@ -30,6 +32,12 @@ use tokio::sync::RwLock;
use super::{DeltaLayer, DeltaLayerWriter, Layer};
thread_local! {
/// A buffer for serializing object during [`InMemoryLayer::put_value`].
/// This buffer is reused for each serialization to avoid additional malloc calls.
static SER_BUFFER: RefCell<Vec<u8>> = RefCell::new(Vec::new());
}
pub struct InMemoryLayer {
conf: &'static PageServerConf,
tenant_id: TenantId,
@@ -77,11 +85,11 @@ impl std::fmt::Debug for InMemoryLayerInner {
}
impl InMemoryLayer {
pub(crate) fn get_timeline_id(&self) -> TimelineId {
pub fn get_timeline_id(&self) -> TimelineId {
self.timeline_id
}
pub(crate) fn info(&self) -> InMemoryLayerInfo {
pub fn info(&self) -> InMemoryLayerInfo {
let lsn_start = self.start_lsn;
if let Some(&lsn_end) = self.end_lsn.get() {
@@ -91,22 +99,32 @@ impl InMemoryLayer {
}
}
pub(crate) fn assert_writable(&self) {
fn assert_writable(&self) {
assert!(self.end_lsn.get().is_none());
}
pub(crate) fn end_lsn_or_max(&self) -> Lsn {
fn end_lsn_or_max(&self) -> Lsn {
self.end_lsn.get().copied().unwrap_or(Lsn::MAX)
}
}
pub(crate) fn get_lsn_range(&self) -> Range<Lsn> {
#[async_trait::async_trait]
impl Layer for InMemoryLayer {
fn get_key_range(&self) -> Range<Key> {
Key::MIN..Key::MAX
}
fn get_lsn_range(&self) -> Range<Lsn> {
self.start_lsn..self.end_lsn_or_max()
}
fn is_incremental(&self) -> bool {
// in-memory layer is always considered incremental.
true
}
/// debugging function to print out the contents of the layer
///
/// this is likely completly unused
pub async fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
async fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
let inner = self.inner.read().await;
let end_str = self.end_lsn_or_max();
@@ -153,7 +171,7 @@ impl InMemoryLayer {
}
/// Look up given value in the layer.
pub(crate) async fn get_value_reconstruct_data(
async fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
@@ -203,20 +221,6 @@ impl InMemoryLayer {
}
}
#[async_trait::async_trait]
impl Layer for InMemoryLayer {
async fn get_value_reconstruct_data(
&self,
key: Key,
lsn_range: Range<Lsn>,
reconstruct_data: &mut ValueReconstructState,
ctx: &RequestContext,
) -> Result<ValueReconstructResult> {
self.get_value_reconstruct_data(key, lsn_range, reconstruct_data, ctx)
.await
}
}
impl std::fmt::Display for InMemoryLayer {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let end_lsn = self.end_lsn_or_max();
@@ -226,11 +230,11 @@ impl std::fmt::Display for InMemoryLayer {
impl InMemoryLayer {
///
/// Get layer size.
/// Get layer size on the disk
///
pub async fn size(&self) -> Result<u64> {
let inner = self.inner.read().await;
Ok(inner.file.len())
Ok(inner.file.size)
}
///
@@ -265,17 +269,17 @@ impl InMemoryLayer {
/// Adds the page version to the in-memory tree
pub async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
let inner: &mut _ = &mut *self.inner.write().await;
let mut inner = self.inner.write().await;
self.assert_writable();
let off = {
// Avoid doing allocations for "small" values.
// In the regression test suite, the limit of 256 avoided allocations in 95% of cases:
// https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061
let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
buf.clear();
val.ser_into(&mut buf)?;
inner.file.write_blob(&buf).await?
SER_BUFFER.with(|x| -> Result<_> {
let mut buf = x.borrow_mut();
buf.clear();
val.ser_into(&mut (*buf))?;
let off = inner.file.write_blob(&buf)?;
Ok(off)
})?
};
let vec_map = inner.index.entry(key).or_default();
@@ -313,7 +317,7 @@ impl InMemoryLayer {
/// Write this frozen in-memory layer to disk.
///
/// Returns a new delta layer with all the same data as this in-memory layer
pub(crate) async fn write_to_disk(&self) -> Result<DeltaLayer> {
pub async fn write_to_disk(&self) -> Result<DeltaLayer> {
// Grab the lock in read-mode. We hold it over the I/O, but because this
// layer is not writeable anymore, no one should be trying to acquire the
// write lock on it, so we shouldn't block anyone. There's one exception

View File

@@ -19,17 +19,16 @@ use serde::{Deserialize, Serialize};
pub struct PersistentLayerDesc {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
/// Range of keys that this layer covers
pub key_range: Range<Key>,
/// Inclusive start, exclusive end of the LSN range that this layer holds.
///
/// - For an open in-memory layer, the end bound is MAX_LSN
/// - For a frozen in-memory layer or a delta layer, the end bound is a valid lsn after the
/// range start
/// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1
/// For image layer, this is `[lsn, lsn+1)`.
pub lsn_range: Range<Lsn>,
/// Whether this is a delta layer, and also, is this incremental.
/// Whether this is a delta layer.
pub is_delta: bool,
/// Whether this layer only contains page images for part of the keys in the range. In the current implementation, this should
/// always be equal to `is_delta`. If we land the partial image layer PR someday, image layer could also be
/// incremental.
pub is_incremental: bool,
/// File size
pub file_size: u64,
}
@@ -62,6 +61,7 @@ impl PersistentLayerDesc {
key_range,
lsn_range: Lsn(0)..Lsn(1),
is_delta: false,
is_incremental: false,
file_size: 0,
}
}
@@ -71,6 +71,7 @@ impl PersistentLayerDesc {
timeline_id: TimelineId,
key_range: Range<Key>,
lsn: Lsn,
is_incremental: bool,
file_size: u64,
) -> Self {
Self {
@@ -79,6 +80,7 @@ impl PersistentLayerDesc {
key_range,
lsn_range: Self::image_layer_lsn_range(lsn),
is_delta: false,
is_incremental,
file_size,
}
}
@@ -96,6 +98,7 @@ impl PersistentLayerDesc {
key_range,
lsn_range,
is_delta: true,
is_incremental: true,
file_size,
}
}
@@ -161,12 +164,8 @@ impl PersistentLayerDesc {
self.tenant_id
}
/// Does this layer only contain some data for the key-range (incremental),
/// or does it contain a version of every page? This is important to know
/// for garbage collecting old layers: an incremental layer depends on
/// the previous non-incremental layer.
pub fn is_incremental(&self) -> bool {
self.is_delta
self.is_incremental
}
pub fn is_delta(&self) -> bool {
@@ -183,7 +182,7 @@ impl PersistentLayerDesc {
self.lsn_range.start,
self.lsn_range.end,
self.is_delta,
self.is_incremental(),
self.is_incremental,
self.file_size,
);

View File

@@ -60,7 +60,7 @@ impl std::fmt::Debug for RemoteLayer {
f.debug_struct("RemoteLayer")
.field("file_name", &self.desc.filename())
.field("layer_metadata", &self.layer_metadata)
.field("is_incremental", &self.desc.is_incremental())
.field("is_incremental", &self.desc.is_incremental)
.finish()
}
}
@@ -76,6 +76,39 @@ impl Layer for RemoteLayer {
) -> Result<ValueReconstructResult> {
bail!("layer {self} needs to be downloaded");
}
/// debugging function to print out the contents of the layer
async fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
println!(
"----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} is_delta {} is_incremental {} size {} ----",
self.desc.tenant_id,
self.desc.timeline_id,
self.desc.key_range.start,
self.desc.key_range.end,
self.desc.lsn_range.start,
self.desc.lsn_range.end,
self.desc.is_delta,
self.desc.is_incremental,
self.desc.file_size,
);
Ok(())
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn get_key_range(&self) -> Range<Key> {
self.layer_desc().key_range.clone()
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn get_lsn_range(&self) -> Range<Lsn> {
self.layer_desc().lsn_range.clone()
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn is_incremental(&self) -> bool {
self.layer_desc().is_incremental
}
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
@@ -109,8 +142,8 @@ impl PersistentLayer for RemoteLayer {
}
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
let layer_file_name = self.layer_desc().filename().file_name();
let lsn_range = self.layer_desc().lsn_range.clone();
let layer_file_name = self.filename().file_name();
let lsn_range = self.get_lsn_range();
if self.desc.is_delta {
HistoricLayerInfo::Delta {
@@ -151,6 +184,7 @@ impl RemoteLayer {
timelineid,
fname.key_range.clone(),
fname.lsn,
false,
layer_metadata.file_size(),
),
layer_metadata: layer_metadata.clone(),
@@ -183,9 +217,9 @@ impl RemoteLayer {
}
/// Create a Layer struct representing this layer, after it has been downloaded.
pub(crate) fn create_downloaded_layer(
pub fn create_downloaded_layer(
&self,
_layer_map_lock_held_witness: &LayerManager,
layer_map_lock_held_witness: &LayerManager,
conf: &'static PageServerConf,
file_size: u64,
) -> Arc<dyn PersistentLayer> {
@@ -197,8 +231,10 @@ impl RemoteLayer {
self.desc.tenant_id,
&fname,
file_size,
self.access_stats
.clone_for_residence_change(LayerResidenceStatus::Resident),
self.access_stats.clone_for_residence_change(
layer_map_lock_held_witness,
LayerResidenceStatus::Resident,
),
))
} else {
let fname = self.desc.image_file_name();
@@ -208,8 +244,10 @@ impl RemoteLayer {
self.desc.tenant_id,
&fname,
file_size,
self.access_stats
.clone_for_residence_change(LayerResidenceStatus::Resident),
self.access_stats.clone_for_residence_change(
layer_map_lock_held_witness,
LayerResidenceStatus::Resident,
),
))
}
}

View File

@@ -1,6 +1,5 @@
pub mod delete;
mod eviction_task;
mod init;
pub mod layer_manager;
mod logical_size;
pub mod span;
@@ -28,6 +27,7 @@ use utils::id::TenantTimelineId;
use std::cmp::{max, min, Ordering};
use std::collections::{BinaryHeap, HashMap, HashSet};
use std::fs;
use std::ops::{Deref, Range};
use std::path::{Path, PathBuf};
use std::pin::pin;
@@ -38,13 +38,15 @@ use std::time::{Duration, Instant, SystemTime};
use crate::context::{
AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
};
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
use crate::tenant::remote_timeline_client::{self, index::LayerFileMetadata};
use crate::tenant::storage_layer::delta_layer::DeltaEntry;
use crate::tenant::storage_layer::{
DeltaLayerWriter, ImageLayerWriter, InMemoryLayer, LayerAccessStats, LayerFileName, RemoteLayer,
DeltaFileName, DeltaLayerWriter, ImageFileName, ImageLayerWriter, InMemoryLayer,
LayerAccessStats, LayerFileName, RemoteLayer,
};
use crate::tenant::timeline::logical_size::CurrentLogicalSize;
use crate::tenant::{
ephemeral_file::is_ephemeral_file,
layer_map::{LayerMap, SearchResult},
metadata::{save_metadata, TimelineMetadata},
par_fsync,
@@ -76,10 +78,11 @@ use utils::{
use crate::page_cache;
use crate::repository::GcResult;
use crate::repository::{Key, Value};
use crate::task_mgr;
use crate::task_mgr::TaskKind;
use crate::walredo::WalRedoManager;
use crate::METADATA_FILE_NAME;
use crate::ZERO_PAGE;
use crate::{is_temporary, task_mgr};
use self::delete::DeleteTimelineFlow;
pub(super) use self::eviction_task::EvictionTaskTenantState;
@@ -92,7 +95,7 @@ use super::config::TenantConf;
use super::remote_timeline_client::index::IndexPart;
use super::remote_timeline_client::RemoteTimelineClient;
use super::storage_layer::{
AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStatsReset, PersistentLayerDesc,
AsLayerDesc, DeltaLayer, ImageLayer, Layer, LayerAccessStatsReset, PersistentLayerDesc,
};
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
@@ -137,12 +140,6 @@ fn drop_rlock<T>(rlock: tokio::sync::OwnedRwLockReadGuard<T>) {
fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
drop(rlock)
}
/// The outward-facing resources required to build a Timeline
pub struct TimelineResources {
pub remote_client: Option<RemoteTimelineClient>,
}
pub struct Timeline {
conf: &'static PageServerConf,
tenant_conf: Arc<RwLock<TenantConfOpt>>,
@@ -465,7 +462,7 @@ impl Timeline {
// The cached image can be returned directly if there is no WAL between the cached image
// and requested LSN. The cached image can also be used to reduce the amount of WAL needed
// for redo.
let cached_page_img = match self.lookup_cached_page(&key, lsn).await {
let cached_page_img = match self.lookup_cached_page(&key, lsn) {
Some((cached_lsn, cached_img)) => {
match cached_lsn.cmp(&lsn) {
Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check
@@ -494,7 +491,6 @@ impl Timeline {
RECONSTRUCT_TIME
.observe_closure_duration(|| self.reconstruct_value(key, lsn, reconstruct_state))
.await
}
/// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
@@ -1209,7 +1205,7 @@ impl Timeline {
&layer_metadata,
local_layer
.access_stats()
.clone_for_residence_change(LayerResidenceStatus::Evicted),
.clone_for_residence_change(layer_mgr, LayerResidenceStatus::Evicted),
),
LayerFileName::Delta(delta_name) => RemoteLayer::new_delta(
self.tenant_id,
@@ -1218,7 +1214,7 @@ impl Timeline {
&layer_metadata,
local_layer
.access_stats()
.clone_for_residence_change(LayerResidenceStatus::Evicted),
.clone_for_residence_change(layer_mgr, LayerResidenceStatus::Evicted),
),
});
@@ -1378,7 +1374,7 @@ impl Timeline {
timeline_id: TimelineId,
tenant_id: TenantId,
walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
resources: TimelineResources,
remote_client: Option<RemoteTimelineClient>,
pg_version: u32,
initial_logical_size_can_start: Option<completion::Barrier>,
initial_logical_size_attempt: Option<completion::Completion>,
@@ -1413,7 +1409,7 @@ impl Timeline {
walredo_mgr,
walreceiver: Mutex::new(None),
remote_client: resources.remote_client.map(Arc::new),
remote_client: remote_client.map(Arc::new),
// initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'.
last_record_lsn: SeqWait::new(RecordLsn {
@@ -1516,7 +1512,7 @@ impl Timeline {
let layer_flush_start_rx = self.layer_flush_start_tx.subscribe();
let self_clone = Arc::clone(self);
debug!("spawning flush loop");
info!("spawning flush loop");
*flush_loop_state = FlushLoopState::Running {
#[cfg(test)]
expect_initdb_optimization: false,
@@ -1587,7 +1583,9 @@ impl Timeline {
));
}
///
/// Initialize with an empty layer map. Used when creating a new timeline.
///
pub(super) fn init_empty_layer_map(&self, start_lsn: Lsn) {
let mut layers = self.layers.try_write().expect(
"in the context where we call this function, no other task has access to the object",
@@ -1595,16 +1593,10 @@ impl Timeline {
layers.initialize_empty(Lsn(start_lsn.0));
}
/// Scan the timeline directory, cleanup, populate the layer map, and schedule uploads for local-only
/// files.
pub(super) async fn load_layer_map(
&self,
disk_consistent_lsn: Lsn,
index_part: Option<IndexPart>,
) -> anyhow::Result<()> {
use init::{Decision::*, Discovered, FutureLayer};
use LayerFileName::*;
///
/// Scan the timeline directory to populate the layer map.
///
pub(super) async fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> {
let mut guard = self.layers.write().await;
let timer = self.metrics.load_layer_map_histo.start_timer();
@@ -1612,153 +1604,102 @@ impl Timeline {
// Scan timeline directory and create ImageFileName and DeltaFilename
// structs representing all files on disk
let timeline_path = self.conf.timeline_path(&self.tenant_id, &self.timeline_id);
let (conf, tenant_id, timeline_id) = (self.conf, self.tenant_id, self.timeline_id);
let span = tracing::Span::current();
// total size of layer files in the current timeline directory
let mut total_physical_size = 0;
let (loaded_layers, to_sync, total_physical_size) = tokio::task::spawn_blocking({
move || {
let _g = span.entered();
let discovered = init::scan_timeline_dir(&timeline_path)?;
let mut discovered_layers = Vec::with_capacity(discovered.len());
let mut unrecognized_files = Vec::new();
let mut loaded_layers = Vec::<Arc<dyn PersistentLayer>>::new();
let mut path = timeline_path;
for direntry in fs::read_dir(timeline_path)? {
let direntry = direntry?;
let direntry_path = direntry.path();
let fname = direntry.file_name();
let fname = fname.to_string_lossy();
for discovered in discovered {
let (name, kind) = match discovered {
Discovered::Layer(file_name, file_size) => {
discovered_layers.push((file_name, file_size));
continue;
}
Discovered::Metadata | Discovered::IgnoredBackup => {
continue;
}
Discovered::Unknown(file_name) => {
// we will later error if there are any
unrecognized_files.push(file_name);
continue;
}
Discovered::Ephemeral(name) => (name, "old ephemeral file"),
Discovered::Temporary(name) => (name, "temporary timeline file"),
Discovered::TemporaryDownload(name) => (name, "temporary download"),
};
path.push(name);
init::cleanup(&path, kind)?;
path.pop();
}
if !unrecognized_files.is_empty() {
// assume that if there are any there are many many.
let n = unrecognized_files.len();
let first = &unrecognized_files[..n.min(10)];
anyhow::bail!(
"unrecognized files in timeline dir (total {n}), first 10: {first:?}"
if let Some(filename) = ImageFileName::parse_str(&fname) {
// create an ImageLayer struct for each image file.
if filename.lsn > disk_consistent_lsn {
info!(
"found future image layer {} on timeline {} disk_consistent_lsn is {}",
filename, self.timeline_id, disk_consistent_lsn
);
rename_to_backup(&direntry_path)?;
continue;
}
let decided =
init::reconcile(discovered_layers, index_part.as_ref(), disk_consistent_lsn);
let file_size = direntry_path.metadata()?.len();
let stats =
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Resident);
let mut loaded_layers = Vec::new();
let mut needs_upload = Vec::new();
let mut needs_cleanup = Vec::new();
let mut total_physical_size = 0;
let layer = ImageLayer::new(
self.conf,
self.timeline_id,
self.tenant_id,
&filename,
file_size,
stats,
);
for (name, decision) in decided {
let decision = match decision {
Ok(UseRemote { local, remote }) => {
path.push(name.file_name());
init::cleanup_local_file_for_remote(&path, &local, &remote)?;
path.pop();
total_physical_size += file_size;
loaded_layers.push(Arc::new(layer));
} else if let Some(filename) = DeltaFileName::parse_str(&fname) {
// Create a DeltaLayer struct for each delta file.
// The end-LSN is exclusive, while disk_consistent_lsn is
// inclusive. For example, if disk_consistent_lsn is 100, it is
// OK for a delta layer to have end LSN 101, but if the end LSN
// is 102, then it might not have been fully flushed to disk
// before crash.
if filename.lsn_range.end > disk_consistent_lsn + 1 {
info!(
"found future delta layer {} on timeline {} disk_consistent_lsn is {}",
filename, self.timeline_id, disk_consistent_lsn
);
UseRemote { local, remote }
}
Ok(decision) => decision,
Err(FutureLayer { local }) => {
if local.is_some() {
path.push(name.file_name());
init::cleanup_future_layer(&path, &name, disk_consistent_lsn)?;
path.pop();
}
needs_cleanup.push(name);
continue;
}
};
match &name {
Delta(d) => assert!(d.lsn_range.end <= disk_consistent_lsn + 1),
Image(i) => assert!(i.lsn <= disk_consistent_lsn),
}
let status = match &decision {
UseLocal(_) | NeedsUpload(_) => LayerResidenceStatus::Resident,
Evicted(_) | UseRemote { .. } => LayerResidenceStatus::Evicted,
};
let stats = LayerAccessStats::for_loading_layer(status);
let layer: Arc<dyn PersistentLayer> = match (name, &decision) {
(Delta(d), UseLocal(m) | NeedsUpload(m)) => {
total_physical_size += m.file_size();
Arc::new(DeltaLayer::new(
conf,
timeline_id,
tenant_id,
&d,
m.file_size(),
stats,
))
}
(Image(i), UseLocal(m) | NeedsUpload(m)) => {
total_physical_size += m.file_size();
Arc::new(ImageLayer::new(
conf,
timeline_id,
tenant_id,
&i,
m.file_size(),
stats,
))
}
(Delta(d), Evicted(remote) | UseRemote { remote, .. }) => Arc::new(
RemoteLayer::new_delta(tenant_id, timeline_id, &d, remote, stats),
),
(Image(i), Evicted(remote) | UseRemote { remote, .. }) => Arc::new(
RemoteLayer::new_img(tenant_id, timeline_id, &i, remote, stats),
),
};
if let NeedsUpload(m) = decision {
needs_upload.push((layer.clone(), m));
}
loaded_layers.push(layer);
rename_to_backup(&direntry_path)?;
continue;
}
Ok((
loaded_layers,
(needs_upload, needs_cleanup),
total_physical_size,
))
let file_size = direntry_path.metadata()?.len();
let stats =
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Resident);
let layer = DeltaLayer::new(
self.conf,
self.timeline_id,
self.tenant_id,
&filename,
file_size,
stats,
);
total_physical_size += file_size;
loaded_layers.push(Arc::new(layer));
} else if fname == METADATA_FILE_NAME || fname.ends_with(".old") {
// ignore these
} else if remote_timeline_client::is_temp_download_file(&direntry_path) {
info!(
"skipping temp download file, reconcile_with_remote will resume / clean up: {}",
fname
);
} else if is_ephemeral_file(&fname) {
// Delete any old ephemeral files
trace!("deleting old ephemeral file in timeline dir: {}", fname);
fs::remove_file(&direntry_path)?;
} else if is_temporary(&direntry_path) {
info!("removing temp timeline file at {}", direntry_path.display());
fs::remove_file(&direntry_path).with_context(|| {
format!(
"failed to remove temp download file at {}",
direntry_path.display()
)
})?;
} else {
warn!("unrecognized filename in timeline dir: {}", fname);
}
})
.await
.map_err(anyhow::Error::new)
.and_then(|x| x)?;
}
let num_layers = loaded_layers.len();
guard.initialize_local_layers(loaded_layers, disk_consistent_lsn + 1);
if let Some(rtc) = self.remote_client.as_ref() {
let (needs_upload, needs_cleanup) = to_sync;
for (layer, m) in needs_upload {
rtc.schedule_layer_file_upload(&layer.layer_desc().filename(), &m)?;
}
rtc.schedule_layer_file_deletion(&needs_cleanup)?;
rtc.schedule_index_upload_for_file_changes()?;
// Tenant::create_timeline will wait for these uploads to happen before returning, or
// on retry.
}
guard.initialize_local_layers(loaded_layers, Lsn(disk_consistent_lsn.0) + 1);
info!(
"loaded layer map with {} layers at {}, total physical size: {}",
@@ -1769,6 +1710,236 @@ impl Timeline {
.set(total_physical_size);
timer.stop_and_record();
Ok(())
}
async fn create_remote_layers(
&self,
index_part: &IndexPart,
local_layers: HashMap<LayerFileName, Arc<dyn PersistentLayer>>,
up_to_date_disk_consistent_lsn: Lsn,
) -> anyhow::Result<HashMap<LayerFileName, Arc<dyn PersistentLayer>>> {
// Are we missing some files that are present in remote storage?
// Create RemoteLayer instances for them.
let mut local_only_layers = local_layers;
// We're holding a layer map lock for a while but this
// method is only called during init so it's fine.
let mut guard = self.layers.write().await;
let mut corrupted_local_layers = Vec::new();
let mut added_remote_layers = Vec::new();
for remote_layer_name in &index_part.timeline_layers {
let local_layer = local_only_layers.remove(remote_layer_name);
let remote_layer_metadata = index_part
.layer_metadata
.get(remote_layer_name)
.map(LayerFileMetadata::from)
.with_context(|| {
format!(
"No remote layer metadata found for layer {}",
remote_layer_name.file_name()
)
})?;
// Is the local layer's size different from the size stored in the
// remote index file?
// If so, rename_to_backup those files & replace their local layer with
// a RemoteLayer in the layer map so that we re-download them on-demand.
if let Some(local_layer) = local_layer {
let local_layer_path = local_layer
.local_path()
.expect("caller must ensure that local_layers only contains local layers");
ensure!(
local_layer_path.exists(),
"every layer from local_layers must exist on disk: {}",
local_layer_path.display()
);
let remote_size = remote_layer_metadata.file_size();
let metadata = local_layer_path.metadata().with_context(|| {
format!(
"get file size of local layer {}",
local_layer_path.display()
)
})?;
let local_size = metadata.len();
if local_size != remote_size {
warn!("removing local file {local_layer_path:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
if let Err(err) = rename_to_backup(&local_layer_path) {
assert!(local_layer_path.exists(), "we would leave the local_layer without a file if this does not hold: {}", local_layer_path.display());
anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}");
} else {
self.metrics.resident_physical_size_gauge.sub(local_size);
corrupted_local_layers.push(local_layer);
// fall-through to adding the remote layer
}
} else {
debug!(
"layer is present locally and file size matches remote, using it: {}",
local_layer_path.display()
);
continue;
}
}
info!(
"remote layer does not exist locally, creating remote layer: {}",
remote_layer_name.file_name()
);
match remote_layer_name {
LayerFileName::Image(imgfilename) => {
if imgfilename.lsn > up_to_date_disk_consistent_lsn {
info!(
"found future image layer {} on timeline {} remote_consistent_lsn is {}",
imgfilename, self.timeline_id, up_to_date_disk_consistent_lsn
);
continue;
}
let stats =
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Evicted);
let remote_layer = RemoteLayer::new_img(
self.tenant_id,
self.timeline_id,
imgfilename,
&remote_layer_metadata,
stats,
);
let remote_layer = Arc::new(remote_layer);
added_remote_layers.push(remote_layer);
}
LayerFileName::Delta(deltafilename) => {
// Create a RemoteLayer for the delta file.
// The end-LSN is exclusive, while disk_consistent_lsn is
// inclusive. For example, if disk_consistent_lsn is 100, it is
// OK for a delta layer to have end LSN 101, but if the end LSN
// is 102, then it might not have been fully flushed to disk
// before crash.
if deltafilename.lsn_range.end > up_to_date_disk_consistent_lsn + 1 {
info!(
"found future delta layer {} on timeline {} remote_consistent_lsn is {}",
deltafilename, self.timeline_id, up_to_date_disk_consistent_lsn
);
continue;
}
let stats =
LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Evicted);
let remote_layer = RemoteLayer::new_delta(
self.tenant_id,
self.timeline_id,
deltafilename,
&remote_layer_metadata,
stats,
);
let remote_layer = Arc::new(remote_layer);
added_remote_layers.push(remote_layer);
}
}
}
guard.initialize_remote_layers(corrupted_local_layers, added_remote_layers);
Ok(local_only_layers)
}
/// This function will synchronize local state with what we have in remote storage.
///
/// Steps taken:
/// 1. Initialize upload queue based on `index_part`.
/// 2. Create `RemoteLayer` instances for layers that exist only on the remote.
/// The list of layers on the remote comes from `index_part`.
/// The list of local layers is given by the layer map's `iter_historic_layers()`.
/// So, the layer map must have been loaded already.
/// 3. Schedule upload of local-only layer files (which will then also update the remote
/// IndexPart to include the new layer files).
///
/// Refer to the [`remote_timeline_client`] module comment for more context.
///
/// # TODO
/// May be a bit cleaner to do things based on populated remote client,
/// and then do things based on its upload_queue.latest_files.
#[instrument(skip(self, index_part, up_to_date_metadata))]
pub async fn reconcile_with_remote(
&self,
up_to_date_metadata: &TimelineMetadata,
index_part: Option<&IndexPart>,
) -> anyhow::Result<()> {
info!("starting");
let remote_client = self
.remote_client
.as_ref()
.ok_or_else(|| anyhow!("cannot download without remote storage"))?;
let disk_consistent_lsn = up_to_date_metadata.disk_consistent_lsn();
let local_layers = {
let guard = self.layers.read().await;
let layers = guard.layer_map();
layers
.iter_historic_layers()
.map(|l| (l.filename(), guard.get_from_desc(&l)))
.collect::<HashMap<_, _>>()
};
// If no writes happen, new branches do not have any layers, only the metadata file.
let has_local_layers = !local_layers.is_empty();
let local_only_layers = match index_part {
Some(index_part) => {
info!(
"initializing upload queue from remote index with {} layer files",
index_part.timeline_layers.len()
);
remote_client.init_upload_queue(index_part)?;
self.create_remote_layers(index_part, local_layers, disk_consistent_lsn)
.await?
}
None => {
info!("initializing upload queue as empty");
remote_client.init_upload_queue_for_empty_remote(up_to_date_metadata)?;
local_layers
}
};
if has_local_layers {
// Are there local files that don't exist remotely? Schedule uploads for them.
// Local timeline metadata will get uploaded to remove along witht he layers.
for (layer_name, layer) in &local_only_layers {
// XXX solve this in the type system
let layer_path = layer
.local_path()
.expect("local_only_layers only contains local layers");
let layer_size = layer_path
.metadata()
.with_context(|| format!("failed to get file {layer_path:?} metadata"))?
.len();
info!("scheduling {layer_path:?} for upload");
remote_client
.schedule_layer_file_upload(layer_name, &LayerFileMetadata::new(layer_size))?;
}
remote_client.schedule_index_upload_for_file_changes()?;
} else if index_part.is_none() {
// No data on the remote storage, no local layers, local metadata file.
//
// TODO https://github.com/neondatabase/neon/issues/3865
// Currently, console does not wait for the timeline data upload to the remote storage
// and considers the timeline created, expecting other pageserver nodes to work with it.
// Branch metadata upload could get interrupted (e.g pageserver got killed),
// hence any locally existing branch metadata with no remote counterpart should be uploaded,
// otherwise any other pageserver won't see the branch on `attach`.
//
// After the issue gets implemented, pageserver should rather remove the branch,
// since absence on S3 means we did not acknowledge the branch creation and console will have to retry,
// no need to keep the old files.
remote_client.schedule_index_upload_for_metadata_update(up_to_date_metadata)?;
} else {
// Local timeline has a metadata file, remote one too, both have no layers to sync.
}
info!("Done");
Ok(())
}
@@ -2265,15 +2436,7 @@ impl Timeline {
)));
}
}
ancestor
.wait_lsn(timeline.ancestor_lsn, ctx)
.await
.with_context(|| {
format!(
"wait for lsn {} on ancestor timeline_id={}",
timeline.ancestor_lsn, ancestor.timeline_id
)
})?;
ancestor.wait_lsn(timeline.ancestor_lsn, ctx).await?;
timeline_owned = ancestor;
timeline = &*timeline_owned;
@@ -2452,14 +2615,13 @@ impl Timeline {
}
}
async fn lookup_cached_page(&self, key: &Key, lsn: Lsn) -> Option<(Lsn, Bytes)> {
fn lookup_cached_page(&self, key: &Key, lsn: Lsn) -> Option<(Lsn, Bytes)> {
let cache = page_cache::get();
// FIXME: It's pointless to check the cache for things that are not 8kB pages.
// We should look at the key to determine if it's a cacheable object
let (lsn, read_guard) = cache
.lookup_materialized_page(self.tenant_id, self.timeline_id, key, lsn)
.await?;
let (lsn, read_guard) =
cache.lookup_materialized_page(self.tenant_id, self.timeline_id, key, lsn)?;
let img = Bytes::from(read_guard.to_vec());
Some((lsn, img))
}
@@ -2684,6 +2846,7 @@ impl Timeline {
if let Some(ref l) = delta_layer_to_add {
// TODO: move access stats, metrics update, etc. into layer manager.
l.access_stats().record_residence_event(
&guard,
LayerResidenceStatus::Resident,
LayerResidenceEventReason::LayerCreate,
);
@@ -2980,6 +3143,7 @@ impl Timeline {
self.tenant_id,
&img_range,
lsn,
false, // image layer always covers the full range
)?;
fail_point!("image-layer-writer-fail-before-finish", |_| {
@@ -3072,6 +3236,7 @@ impl Timeline {
.add(metadata.len());
let l = Arc::new(l);
l.access_stats().record_residence_event(
&guard,
LayerResidenceStatus::Resident,
LayerResidenceEventReason::LayerCreate,
);
@@ -3239,8 +3404,8 @@ impl Timeline {
/// start of level0 files compaction, the on-demand download should be revisited as well.
///
/// [`compact_inner`]: Self::compact_inner
async fn compact_level0_phase1(
self: &Arc<Self>,
fn compact_level0_phase1(
self: Arc<Self>,
_layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
guard: tokio::sync::OwnedRwLockReadGuard<LayerManager>,
mut stats: CompactLevel0Phase1StatsBuilder,
@@ -3391,7 +3556,7 @@ impl Timeline {
.collect();
for dl in downcast_deltas.iter() {
// TODO: replace this with an await once we fully go async
all_keys.extend(DeltaLayer::load_keys(dl, ctx).await?);
all_keys.extend(Handle::current().block_on(DeltaLayer::load_keys(dl, ctx))?);
}
// The current stdlib sorting implementation is designed in a way where it is
@@ -3505,103 +3670,107 @@ impl Timeline {
let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key
let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key
for &DeltaEntry {
key, lsn, ref val, ..
} in all_values_iter
{
let value = val.load().await?;
let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
// We need to check key boundaries once we reach next key or end of layer with the same key
if !same_key || lsn == dup_end_lsn {
let mut next_key_size = 0u64;
let is_dup_layer = dup_end_lsn.is_valid();
dup_start_lsn = Lsn::INVALID;
if !same_key {
dup_end_lsn = Lsn::INVALID;
// TODO remove this block_on wrapper once we fully go async
Handle::current().block_on(async {
for &DeltaEntry {
key, lsn, ref val, ..
} in all_values_iter
{
let value = val.load().await?;
let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
// We need to check key boundaries once we reach next key or end of layer with the same key
if !same_key || lsn == dup_end_lsn {
let mut next_key_size = 0u64;
let is_dup_layer = dup_end_lsn.is_valid();
dup_start_lsn = Lsn::INVALID;
if !same_key {
dup_end_lsn = Lsn::INVALID;
}
// Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size
for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() {
next_key_size = next_size;
if key != next_key {
if dup_end_lsn.is_valid() {
// We are writting segment with duplicates:
// place all remaining values of this key in separate segment
dup_start_lsn = dup_end_lsn; // new segments starts where old stops
dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range
}
break;
}
key_values_total_size += next_size;
// Check if it is time to split segment: if total keys size is larger than target file size.
// We need to avoid generation of empty segments if next_size > target_file_size.
if key_values_total_size > target_file_size && lsn != next_lsn {
// Split key between multiple layers: such layer can contain only single key
dup_start_lsn = if dup_end_lsn.is_valid() {
dup_end_lsn // new segment with duplicates starts where old one stops
} else {
lsn // start with the first LSN for this key
};
dup_end_lsn = next_lsn; // upper LSN boundary is exclusive
break;
}
}
// handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set.
if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() {
dup_start_lsn = dup_end_lsn;
dup_end_lsn = lsn_range.end;
}
if writer.is_some() {
let written_size = writer.as_mut().unwrap().size();
let contains_hole =
next_hole < holes.len() && key >= holes[next_hole].key_range.end;
// check if key cause layer overflow or contains hole...
if is_dup_layer
|| dup_end_lsn.is_valid()
|| written_size + key_values_total_size > target_file_size
|| contains_hole
{
// ... if so, flush previous layer and prepare to write new one
new_layers.push(Arc::new(
writer.take().unwrap().finish(prev_key.unwrap().next())?,
));
writer = None;
if contains_hole {
// skip hole
next_hole += 1;
}
}
}
// Remember size of key value because at next iteration we will access next item
key_values_total_size = next_key_size;
}
// Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size
for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() {
next_key_size = next_size;
if key != next_key {
if writer.is_none() {
// Create writer if not initiaized yet
writer = Some(DeltaLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_id,
key,
if dup_end_lsn.is_valid() {
// We are writting segment with duplicates:
// place all remaining values of this key in separate segment
dup_start_lsn = dup_end_lsn; // new segments starts where old stops
dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range
}
break;
}
key_values_total_size += next_size;
// Check if it is time to split segment: if total keys size is larger than target file size.
// We need to avoid generation of empty segments if next_size > target_file_size.
if key_values_total_size > target_file_size && lsn != next_lsn {
// Split key between multiple layers: such layer can contain only single key
dup_start_lsn = if dup_end_lsn.is_valid() {
dup_end_lsn // new segment with duplicates starts where old one stops
// this is a layer containing slice of values of the same key
debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
dup_start_lsn..dup_end_lsn
} else {
lsn // start with the first LSN for this key
};
dup_end_lsn = next_lsn; // upper LSN boundary is exclusive
break;
}
debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
lsn_range.clone()
},
)?);
}
// handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set.
if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() {
dup_start_lsn = dup_end_lsn;
dup_end_lsn = lsn_range.end;
}
if writer.is_some() {
let written_size = writer.as_mut().unwrap().size();
let contains_hole =
next_hole < holes.len() && key >= holes[next_hole].key_range.end;
// check if key cause layer overflow or contains hole...
if is_dup_layer
|| dup_end_lsn.is_valid()
|| written_size + key_values_total_size > target_file_size
|| contains_hole
{
// ... if so, flush previous layer and prepare to write new one
new_layers.push(Arc::new(
writer.take().unwrap().finish(prev_key.unwrap().next())?,
));
writer = None;
if contains_hole {
// skip hole
next_hole += 1;
}
}
}
// Remember size of key value because at next iteration we will access next item
key_values_total_size = next_key_size;
fail_point!("delta-layer-writer-fail-before-finish", |_| {
Result::<_>::Err(anyhow::anyhow!(
"failpoint delta-layer-writer-fail-before-finish"
))
});
writer.as_mut().unwrap().put_value(key, lsn, value)?;
prev_key = Some(key);
}
if writer.is_none() {
// Create writer if not initiaized yet
writer = Some(DeltaLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_id,
key,
if dup_end_lsn.is_valid() {
// this is a layer containing slice of values of the same key
debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
dup_start_lsn..dup_end_lsn
} else {
debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
lsn_range.clone()
},
)?);
}
fail_point!("delta-layer-writer-fail-before-finish", |_| {
Err(CompactionError::Other(anyhow::anyhow!(
"failpoint delta-layer-writer-fail-before-finish"
)))
});
writer.as_mut().unwrap().put_value(key, lsn, value)?;
prev_key = Some(key);
}
Ok(())
})?;
if let Some(writer) = writer {
new_layers.push(Arc::new(writer.finish(prev_key.unwrap().next())?));
}
@@ -3614,10 +3783,10 @@ impl Timeline {
// we still might easily hit the limit otherwise.
let warn_limit = target_file_size * 2 + page_cache::PAGE_SZ as u64 * 2;
for layer in new_layers.iter() {
if layer.layer_desc().file_size > warn_limit {
if layer.desc.file_size > warn_limit {
warn!(
%layer,
"created delta file of size {} larger than double of target of {target_file_size}", layer.layer_desc().file_size
"created delta file of size {} larger than double of target of {target_file_size}", layer.desc.file_size
);
}
}
@@ -3635,7 +3804,7 @@ impl Timeline {
stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now();
stats.new_deltas_count = Some(new_layers.len());
stats.new_deltas_size = Some(new_layers.iter().map(|l| l.layer_desc().file_size).sum());
stats.new_deltas_size = Some(new_layers.iter().map(|l| l.desc.file_size).sum());
match TryInto::<CompactLevel0Phase1Stats>::try_into(stats)
.and_then(|stats| serde_json::to_string(&stats).context("serde_json::to_string"))
@@ -3675,7 +3844,8 @@ impl Timeline {
deltas_to_compact,
} = {
let phase1_span = info_span!("compact_level0_phase1");
let ctx = ctx.attached_child();
let myself = Arc::clone(self);
let ctx = ctx.attached_child(); // technically, the spawn_blocking can outlive this future
let mut stats = CompactLevel0Phase1StatsBuilder {
version: Some(2),
tenant_id: Some(self.tenant_id),
@@ -3689,15 +3859,18 @@ impl Timeline {
stats.read_lock_acquisition_micros =
DurationRecorder::Recorded(RecordedDuration(now - begin), now);
let layer_removal_cs = layer_removal_cs.clone();
self.compact_level0_phase1(
layer_removal_cs,
phase1_layers_locked,
stats,
target_file_size,
&ctx,
)
.instrument(phase1_span)
.await?
tokio::task::spawn_blocking(move || {
let _entered = phase1_span.enter();
myself.compact_level0_phase1(
layer_removal_cs,
phase1_layers_locked,
stats,
target_file_size,
&ctx,
)
})
.await
.context("spawn_blocking")??
};
if new_layers.is_empty() && deltas_to_compact.is_empty() {
@@ -3751,6 +3924,7 @@ impl Timeline {
new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len()));
l.access_stats().record_residence_event(
&guard,
LayerResidenceStatus::Resident,
LayerResidenceEventReason::LayerCreate,
);
@@ -4141,7 +4315,7 @@ impl Timeline {
///
/// Reconstruct a value, using the given base image and WAL records in 'data'.
///
async fn reconstruct_value(
fn reconstruct_value(
&self,
key: Key,
request_lsn: Lsn,
@@ -4210,7 +4384,6 @@ impl Timeline {
last_rec_lsn,
&img,
)
.await
.context("Materialized page memoization failed")
{
return Err(PageReconstructError::from(e));
@@ -4670,8 +4843,7 @@ fn rename_to_backup(path: &Path) -> anyhow::Result<()> {
for i in 0u32.. {
new_path.set_file_name(format!("{filename}.{i}.old"));
if !new_path.exists() {
std::fs::rename(path, &new_path)
.with_context(|| format!("rename {path:?} to {new_path:?}"))?;
std::fs::rename(path, &new_path)?;
return Ok(());
}
}

View File

@@ -25,7 +25,7 @@ use crate::{
InitializationOrder,
};
use super::{Timeline, TimelineResources};
use super::Timeline;
/// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
@@ -416,7 +416,7 @@ impl DeleteTimelineFlow {
timeline_id,
local_metadata,
None, // Ancestor is not needed for deletion.
TimelineResources { remote_client },
remote_client,
init_order,
// Important. We dont pass ancestor above because it can be missing.
// Thus we need to skip the validation here.

View File

@@ -1,199 +0,0 @@
use crate::{
is_temporary,
tenant::{
ephemeral_file::is_ephemeral_file,
remote_timeline_client::{
self,
index::{IndexPart, LayerFileMetadata},
},
storage_layer::LayerFileName,
},
METADATA_FILE_NAME,
};
use anyhow::Context;
use std::{collections::HashMap, ffi::OsString, path::Path, str::FromStr};
use utils::lsn::Lsn;
/// Identified files in the timeline directory.
pub(super) enum Discovered {
/// The only one we care about
Layer(LayerFileName, u64),
/// Old ephmeral files from previous launches, should be removed
Ephemeral(OsString),
/// Old temporary timeline files, unsure what these really are, should be removed
Temporary(OsString),
/// Temporary on-demand download files, should be removed
TemporaryDownload(OsString),
/// "metadata" file we persist locally and include in `index_part.json`
Metadata,
/// Backup file from previously future layers
IgnoredBackup,
/// Unrecognized, warn about these
Unknown(OsString),
}
/// Scans the timeline directory for interesting files.
pub(super) fn scan_timeline_dir(path: &Path) -> anyhow::Result<Vec<Discovered>> {
let mut ret = Vec::new();
for direntry in std::fs::read_dir(path)? {
let direntry = direntry?;
let direntry_path = direntry.path();
let file_name = direntry.file_name();
let fname = file_name.to_string_lossy();
let discovered = match LayerFileName::from_str(&fname) {
Ok(file_name) => {
let file_size = direntry.metadata()?.len();
Discovered::Layer(file_name, file_size)
}
Err(_) => {
if fname == METADATA_FILE_NAME {
Discovered::Metadata
} else if fname.ends_with(".old") {
// ignore these
Discovered::IgnoredBackup
} else if remote_timeline_client::is_temp_download_file(&direntry_path) {
Discovered::TemporaryDownload(file_name)
} else if is_ephemeral_file(&fname) {
Discovered::Ephemeral(file_name)
} else if is_temporary(&direntry_path) {
Discovered::Temporary(file_name)
} else {
Discovered::Unknown(file_name)
}
}
};
ret.push(discovered);
}
Ok(ret)
}
/// Decision on what to do with a layer file after considering its local and remote metadata.
#[derive(Clone)]
pub(super) enum Decision {
/// The layer is not present locally.
Evicted(LayerFileMetadata),
/// The layer is present locally, but local metadata does not match remote; we must
/// delete it and treat it as evicted.
UseRemote {
local: LayerFileMetadata,
remote: LayerFileMetadata,
},
/// The layer is present locally, and metadata matches.
UseLocal(LayerFileMetadata),
/// The layer is only known locally, it needs to be uploaded.
NeedsUpload(LayerFileMetadata),
}
/// The related layer is is in future compared to disk_consistent_lsn, it must not be loaded.
#[derive(Debug)]
pub(super) struct FutureLayer {
/// The local metadata. `None` if the layer is only known through [`IndexPart`].
pub(super) local: Option<LayerFileMetadata>,
}
/// Merges local discoveries and remote [`IndexPart`] to a collection of decisions.
///
/// This function should not gain additional reasons to fail than [`FutureLayer`], consider adding
/// the checks earlier to [`scan_timeline_dir`].
pub(super) fn reconcile(
discovered: Vec<(LayerFileName, u64)>,
index_part: Option<&IndexPart>,
disk_consistent_lsn: Lsn,
) -> Vec<(LayerFileName, Result<Decision, FutureLayer>)> {
use Decision::*;
// name => (local, remote)
type Collected = HashMap<LayerFileName, (Option<LayerFileMetadata>, Option<LayerFileMetadata>)>;
let mut discovered = discovered
.into_iter()
.map(|(name, file_size)| (name, (Some(LayerFileMetadata::new(file_size)), None)))
.collect::<Collected>();
// merge any index_part information, when available
index_part
.as_ref()
.map(|ip| ip.layer_metadata.iter())
.into_iter()
.flatten()
.map(|(name, metadata)| (name, LayerFileMetadata::from(metadata)))
.for_each(|(name, metadata)| {
if let Some(existing) = discovered.get_mut(name) {
existing.1 = Some(metadata);
} else {
discovered.insert(name.to_owned(), (None, Some(metadata)));
}
});
discovered
.into_iter()
.map(|(name, (local, remote))| {
let decision = if name.is_in_future(disk_consistent_lsn) {
Err(FutureLayer { local })
} else {
Ok(match (local, remote) {
(Some(local), Some(remote)) if local != remote => UseRemote { local, remote },
(Some(x), Some(_)) => UseLocal(x),
(None, Some(x)) => Evicted(x),
(Some(x), None) => NeedsUpload(x),
(None, None) => {
unreachable!("there must not be any non-local non-remote files")
}
})
};
(name, decision)
})
.collect::<Vec<_>>()
}
pub(super) fn cleanup(path: &Path, kind: &str) -> anyhow::Result<()> {
let file_name = path.file_name().expect("must be file path");
tracing::debug!(kind, ?file_name, "cleaning up");
std::fs::remove_file(path)
.with_context(|| format!("failed to remove {kind} at {}", path.display()))
}
pub(super) fn cleanup_local_file_for_remote(
path: &Path,
local: &LayerFileMetadata,
remote: &LayerFileMetadata,
) -> anyhow::Result<()> {
let local_size = local.file_size();
let remote_size = remote.file_size();
let file_name = path.file_name().expect("must be file path");
tracing::warn!("removing local file {file_name:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
if let Err(err) = crate::tenant::timeline::rename_to_backup(path) {
assert!(
path.exists(),
"we would leave the local_layer without a file if this does not hold: {}",
path.display()
);
Err(err)
} else {
Ok(())
}
}
pub(super) fn cleanup_future_layer(
path: &Path,
name: &LayerFileName,
disk_consistent_lsn: Lsn,
) -> anyhow::Result<()> {
use LayerFileName::*;
let kind = match name {
Delta(_) => "delta",
Image(_) => "image",
};
// future image layers are allowed to be produced always for not yet flushed to disk
// lsns stored in InMemoryLayer.
tracing::info!("found future {kind} layer {name} disk_consistent_lsn is {disk_consistent_lsn}");
crate::tenant::timeline::rename_to_backup(path)?;
Ok(())
}

View File

@@ -12,38 +12,38 @@ use crate::{
tenant::{
layer_map::{BatchedUpdates, LayerMap},
storage_layer::{
AsLayerDesc, DeltaLayer, ImageLayer, InMemoryLayer, PersistentLayer,
PersistentLayerDesc, PersistentLayerKey,
AsLayerDesc, DeltaLayer, ImageLayer, InMemoryLayer, Layer, PersistentLayer,
PersistentLayerDesc, PersistentLayerKey, RemoteLayer,
},
timeline::compare_arced_layers,
},
};
/// Provides semantic APIs to manipulate the layer map.
pub(crate) struct LayerManager {
pub struct LayerManager {
layer_map: LayerMap,
layer_fmgr: LayerFileManager,
}
/// After GC, the layer map changes will not be applied immediately. Users should manually apply the changes after
/// scheduling deletes in remote client.
pub(crate) struct ApplyGcResultGuard<'a>(BatchedUpdates<'a>);
pub struct ApplyGcResultGuard<'a>(BatchedUpdates<'a>);
impl ApplyGcResultGuard<'_> {
pub(crate) fn flush(self) {
pub fn flush(self) {
self.0.flush();
}
}
impl LayerManager {
pub(crate) fn create() -> Self {
pub fn create() -> Self {
Self {
layer_map: LayerMap::default(),
layer_fmgr: LayerFileManager::new(),
}
}
pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc<dyn PersistentLayer> {
pub fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc<dyn PersistentLayer> {
self.layer_fmgr.get_from_desc(desc)
}
@@ -51,12 +51,18 @@ impl LayerManager {
///
/// We expect users only to be able to get an immutable layer map. If users want to make modifications,
/// they should use the below semantic APIs. This design makes us step closer to immutable storage state.
pub(crate) fn layer_map(&self) -> &LayerMap {
pub fn layer_map(&self) -> &LayerMap {
&self.layer_map
}
/// Get a mutable reference to the layer map. This function will be removed once `flush_frozen_layer`
/// gets a refactor.
pub fn layer_map_mut(&mut self) -> &mut LayerMap {
&mut self.layer_map
}
/// Replace layers in the layer file manager, used in evictions and layer downloads.
pub(crate) fn replace_and_verify(
pub fn replace_and_verify(
&mut self,
expected: Arc<dyn PersistentLayer>,
new: Arc<dyn PersistentLayer>,
@@ -67,7 +73,7 @@ impl LayerManager {
/// Called from `load_layer_map`. Initialize the layer manager with:
/// 1. all on-disk layers
/// 2. next open layer (with disk disk_consistent_lsn LSN)
pub(crate) fn initialize_local_layers(
pub fn initialize_local_layers(
&mut self,
on_disk_layers: Vec<Arc<dyn PersistentLayer>>,
next_open_layer_at: Lsn,
@@ -81,13 +87,28 @@ impl LayerManager {
}
/// Initialize when creating a new timeline, called in `init_empty_layer_map`.
pub(crate) fn initialize_empty(&mut self, next_open_layer_at: Lsn) {
pub fn initialize_empty(&mut self, next_open_layer_at: Lsn) {
self.layer_map.next_open_layer_at = Some(next_open_layer_at);
}
pub fn initialize_remote_layers(
&mut self,
corrupted_local_layers: Vec<Arc<dyn PersistentLayer>>,
remote_layers: Vec<Arc<RemoteLayer>>,
) {
let mut updates = self.layer_map.batch_update();
for layer in corrupted_local_layers {
Self::remove_historic_layer(layer, &mut updates, &mut self.layer_fmgr);
}
for layer in remote_layers {
Self::insert_historic_layer(layer, &mut updates, &mut self.layer_fmgr);
}
updates.flush();
}
/// Open a new writable layer to append data if there is no open layer, otherwise return the current open layer,
/// called within `get_layer_for_write`.
pub(crate) fn get_layer_for_write(
pub fn get_layer_for_write(
&mut self,
lsn: Lsn,
last_record_lsn: Lsn,
@@ -142,7 +163,7 @@ impl LayerManager {
}
/// Called from `freeze_inmem_layer`, returns true if successfully frozen.
pub(crate) async fn try_freeze_in_memory_layer(
pub async fn try_freeze_in_memory_layer(
&mut self,
Lsn(last_record_lsn): Lsn,
last_freeze_at: &AtomicLsn,
@@ -164,7 +185,7 @@ impl LayerManager {
}
/// Add image layers to the layer map, called from `create_image_layers`.
pub(crate) fn track_new_image_layers(&mut self, image_layers: Vec<ImageLayer>) {
pub fn track_new_image_layers(&mut self, image_layers: Vec<ImageLayer>) {
let mut updates = self.layer_map.batch_update();
for layer in image_layers {
Self::insert_historic_layer(Arc::new(layer), &mut updates, &mut self.layer_fmgr);
@@ -173,7 +194,7 @@ impl LayerManager {
}
/// Flush a frozen layer and add the written delta layer to the layer map.
pub(crate) fn finish_flush_l0_layer(
pub fn finish_flush_l0_layer(
&mut self,
delta_layer: Option<DeltaLayer>,
frozen_layer_for_check: &Arc<InMemoryLayer>,
@@ -193,7 +214,7 @@ impl LayerManager {
}
/// Called when compaction is completed.
pub(crate) fn finish_compact_l0(
pub fn finish_compact_l0(
&mut self,
layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
compact_from: Vec<Arc<dyn PersistentLayer>>,
@@ -221,7 +242,7 @@ impl LayerManager {
}
/// Called when garbage collect the timeline. Returns a guard that will apply the updates to the layer map.
pub(crate) fn finish_gc_timeline(
pub fn finish_gc_timeline(
&mut self,
layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
gc_layers: Vec<Arc<dyn PersistentLayer>>,
@@ -250,6 +271,16 @@ impl LayerManager {
mapping.insert(layer);
}
/// Helper function to remove a layer into the layer map and file manager
fn remove_historic_layer(
layer: Arc<dyn PersistentLayer>,
updates: &mut BatchedUpdates<'_>,
mapping: &mut LayerFileManager,
) {
updates.remove_historic(layer.layer_desc());
mapping.remove(layer);
}
/// Removes the layer from local FS (if present) and from memory.
/// Remote storage is not affected by this operation.
fn delete_historic_layer(
@@ -282,7 +313,7 @@ impl LayerManager {
}
}
pub(crate) struct LayerFileManager<T: AsLayerDesc + ?Sized = dyn PersistentLayer>(
pub struct LayerFileManager<T: AsLayerDesc + ?Sized = dyn PersistentLayer>(
HashMap<PersistentLayerKey, Arc<T>>,
);

View File

@@ -17,7 +17,7 @@ use crate::metrics::{
WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED,
WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES,
};
use crate::task_mgr::{shutdown_token, TaskKind};
use crate::task_mgr::TaskKind;
use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline};
use anyhow::Context;
use chrono::{NaiveDateTime, Utc};
@@ -31,11 +31,10 @@ use storage_broker::Streaming;
use tokio::select;
use tracing::*;
use postgres_connection::PgConnectionConfig;
use postgres_connection::{parse_host_port, PgConnectionConfig};
use utils::backoff::{
exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
};
use utils::postgres_client::wal_stream_connection_config;
use utils::{
id::{NodeId, TenantTimelineId},
lsn::Lsn,
@@ -212,14 +211,11 @@ async fn subscribe_for_timeline_updates(
id: TenantTimelineId,
) -> Streaming<SafekeeperTimelineInfo> {
let mut attempt = 0;
let cancel = shutdown_token();
loop {
exponential_backoff(
attempt,
DEFAULT_BASE_BACKOFF_SECONDS,
DEFAULT_MAX_BACKOFF_SECONDS,
&cancel,
)
.await;
attempt += 1;
@@ -880,6 +876,33 @@ impl ReconnectReason {
}
}
fn wal_stream_connection_config(
TenantTimelineId {
tenant_id,
timeline_id,
}: TenantTimelineId,
listen_pg_addr_str: &str,
auth_token: Option<&str>,
availability_zone: Option<&str>,
) -> anyhow::Result<PgConnectionConfig> {
let (host, port) =
parse_host_port(listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?;
let port = port.unwrap_or(5432);
let mut connstr = PgConnectionConfig::new_host_port(host, port)
.extend_options([
"-c".to_owned(),
format!("timeline_id={}", timeline_id),
format!("tenant_id={}", tenant_id),
])
.set_password(auth_token.map(|s| s.to_owned()));
if let Some(availability_zone) = availability_zone {
connstr = connstr.extend_options([format!("availability_zone={}", availability_zone)]);
}
Ok(connstr)
}
#[cfg(test)]
mod tests {
use super::*;
@@ -895,7 +918,6 @@ mod tests {
timeline: SafekeeperTimelineInfo {
safekeeper_id: 0,
tenant_timeline_id: None,
term: 0,
last_log_term: 0,
flush_lsn: 0,
commit_lsn,
@@ -904,7 +926,6 @@ mod tests {
peer_horizon_lsn: 0,
local_start_lsn: 0,
safekeeper_connstr: safekeeper_connstr.to_owned(),
http_connstr: safekeeper_connstr.to_owned(),
availability_zone: None,
},
latest_update,

View File

@@ -140,24 +140,36 @@ impl UploadQueue {
}
}
let mut files = HashMap::with_capacity(index_part.layer_metadata.len());
for (layer_name, layer_metadata) in &index_part.layer_metadata {
files.insert(
layer_name.to_owned(),
LayerFileMetadata::from(layer_metadata),
);
let mut files = HashMap::with_capacity(index_part.timeline_layers.len());
for layer_name in &index_part.timeline_layers {
match index_part
.layer_metadata
.get(layer_name)
.map(LayerFileMetadata::from)
{
Some(layer_metadata) => {
files.insert(layer_name.to_owned(), layer_metadata);
}
None => {
anyhow::bail!(
"No remote layer metadata found for layer {}",
layer_name.file_name()
);
}
}
}
let index_part_metadata = index_part.parse_metadata()?;
info!(
"initializing upload queue with remote index_part.disk_consistent_lsn: {}",
index_part.metadata.disk_consistent_lsn()
index_part_metadata.disk_consistent_lsn()
);
let state = UploadQueueInitialized {
latest_files: files,
latest_files_changes_since_metadata_upload_scheduled: 0,
latest_metadata: index_part.metadata.clone(),
last_uploaded_consistent_lsn: index_part.metadata.disk_consistent_lsn(),
latest_metadata: index_part_metadata.clone(),
last_uploaded_consistent_lsn: index_part_metadata.disk_consistent_lsn(),
// what follows are boring default initializations
task_counter: 0,
num_inprogress_layer_uploads: 0,

View File

@@ -312,7 +312,7 @@ impl<'a> WalIngest<'a> {
// particular point in the WAL. For more fine-grained control,
// we could peek into the message and only pause if it contains
// a particular string, for example, but this is enough for now.
crate::failpoint_support::sleep_millis_async!("wal-ingest-logical-message-sleep");
utils::failpoint_sleep_millis_async!("wal-ingest-logical-message-sleep");
}
}

View File

@@ -64,13 +64,13 @@ pub struct EndpointConnPool {
total_conns: usize,
}
/// 4096 is the number of rounds that SCRAM-SHA-256 recommends.
/// It's not the 600,000 that OWASP recommends... but our passwords are high entropy anyway.
/// This is cheap and not hugely secure.
/// But probably good enough for in memory only hashes.
///
/// Still takes 1.4ms to hash on my hardware.
/// Still takes 3.5ms to hash on my hardware.
/// We don't want to ruin the latency improvements of using the pool by making password verification take too long
const PARAMS: Params = Params {
rounds: 4096,
rounds: 10_000,
output_length: 32,
};
@@ -99,10 +99,6 @@ pub struct GlobalConnPool {
max_conns_per_endpoint: usize,
proxy_config: &'static crate::config::ProxyConfig,
// Using a lock to remove any race conditions.
// Eg cleaning up connections while a new connection is returned
closed: RwLock<bool>,
}
impl GlobalConnPool {
@@ -112,24 +108,9 @@ impl GlobalConnPool {
global_pool_size: AtomicUsize::new(0),
max_conns_per_endpoint: MAX_CONNS_PER_ENDPOINT,
proxy_config: config,
closed: RwLock::new(false),
})
}
pub fn shutdown(&self) {
*self.closed.write() = true;
self.global_pool.retain(|_, endpoint_pool| {
let mut pool = endpoint_pool.write();
// by clearing this hashmap, we remove the slots that a connection can be returned to.
// when returning, it drops the connection if the slot doesn't exist
pool.pools.clear();
pool.total_conns = 0;
false
});
}
pub async fn get(
&self,
conn_info: &ConnInfo,
@@ -227,20 +208,7 @@ impl GlobalConnPool {
new_client
}
pub fn put(&self, conn_info: &ConnInfo, client: Client) -> anyhow::Result<()> {
// We want to hold this open while we return. This ensures that the pool can't close
// while we are in the middle of returning the connection.
let closed = self.closed.read();
if *closed {
info!("pool: throwing away connection '{conn_info}' because pool is closed");
return Ok(());
}
if client.inner.is_closed() {
info!("pool: throwing away connection '{conn_info}' because connection is closed");
return Ok(());
}
pub async fn put(&self, conn_info: &ConnInfo, client: Client) -> anyhow::Result<()> {
let pool = self.get_or_create_endpoint_pool(&conn_info.hostname);
// return connection to the pool
@@ -408,9 +376,9 @@ async fn connect_to_compute_once(
let (tx, mut rx) = tokio::sync::watch::channel(session);
let conn_id = uuid::Uuid::new_v4();
let span = info_span!(parent: None, "connection", %conn_id);
let span = info_span!(parent: None, "connection", %conn_info, %conn_id);
span.in_scope(|| {
info!(%conn_info, %session, "new connection");
info!(%session, "new connection");
});
tokio::spawn(
@@ -420,28 +388,26 @@ async fn connect_to_compute_once(
info!(%session, "changed session");
}
loop {
let message = ready!(connection.poll_message(cx));
let message = ready!(connection.poll_message(cx));
match message {
Some(Ok(AsyncMessage::Notice(notice))) => {
info!(%session, "notice: {}", notice);
}
Some(Ok(AsyncMessage::Notification(notif))) => {
warn!(%session, pid = notif.process_id(), channel = notif.channel(), "notification received");
}
Some(Ok(_)) => {
warn!(%session, "unknown message");
}
Some(Err(e)) => {
error!(%session, "connection error: {}", e);
return Poll::Ready(())
}
None => {
info!("connection closed");
return Poll::Ready(())
}
match message {
Some(Ok(AsyncMessage::Notice(notice))) => {
info!(%session, "notice: {}", notice);
Poll::Pending
}
Some(Ok(AsyncMessage::Notification(notif))) => {
warn!(%session, pid = notif.process_id(), channel = notif.channel(), "notification received");
Poll::Pending
}
Some(Ok(_)) => {
warn!(%session, "unknown message");
Poll::Pending
}
Some(Err(e)) => {
error!(%session, "connection error: {}", e);
Poll::Ready(())
}
None => Poll::Ready(()),
}
})
.instrument(span)

View File

@@ -16,6 +16,7 @@ use tokio_postgres::types::Type;
use tokio_postgres::GenericClient;
use tokio_postgres::IsolationLevel;
use tokio_postgres::Row;
use tracing::Instrument;
use url::Url;
use super::conn_pool::ConnInfo;
@@ -285,12 +286,13 @@ pub async fn handle(
};
if allow_pool {
let current_span = tracing::Span::current();
// return connection to the pool
tokio::task::spawn_blocking(move || {
let _span = current_span.enter();
let _ = conn_pool.put(&conn_info, client);
});
tokio::task::spawn(
async move {
let _ = conn_pool.put(&conn_info, client).await;
}
.in_current_span(),
);
}
result

View File

@@ -269,18 +269,6 @@ pub async fn task_main(
let conn_pool: Arc<GlobalConnPool> = GlobalConnPool::new(config);
// shutdown the connection pool
tokio::spawn({
let cancellation_token = cancellation_token.clone();
let conn_pool = conn_pool.clone();
async move {
cancellation_token.cancelled().await;
tokio::task::spawn_blocking(move || conn_pool.shutdown())
.await
.unwrap();
}
});
let tls_config = config.tls_config.as_ref().map(|cfg| cfg.to_server_config());
let tls_acceptor: tokio_rustls::TlsAcceptor = match tls_config {
Some(config) => config.into(),

View File

@@ -341,35 +341,21 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100);
// Load all timelines from disk to memory.
GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx)?;
// Keep handles to main tasks to die if any of them disappears.
let mut tasks_handles: FuturesUnordered<BoxFuture<(String, JoinTaskRes)>> =
FuturesUnordered::new();
// Start wal backup launcher before loading timelines as we'll notify it
// through the channel about timelines which need offloading, not draining
// the channel would cause deadlock.
let current_thread_rt = conf
.current_thread_runtime
.then(|| Handle::try_current().expect("no runtime in main"));
let conf_ = conf.clone();
let wal_backup_handle = current_thread_rt
.as_ref()
.unwrap_or_else(|| WAL_BACKUP_RUNTIME.handle())
.spawn(wal_backup::wal_backup_launcher_task_main(
conf_,
wal_backup_launcher_rx,
))
.map(|res| ("WAL backup launcher".to_owned(), res));
tasks_handles.push(Box::pin(wal_backup_handle));
// Load all timelines from disk to memory.
GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx).await?;
let conf_ = conf.clone();
// Run everything in current thread rt, if asked.
if conf.current_thread_runtime {
info!("running in current thread runtime");
}
let current_thread_rt = conf
.current_thread_runtime
.then(|| Handle::try_current().expect("no runtime in main"));
let wal_service_handle = current_thread_rt
.as_ref()
@@ -422,6 +408,17 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
.map(|res| ("WAL remover".to_owned(), res));
tasks_handles.push(Box::pin(wal_remover_handle));
let conf_ = conf.clone();
let wal_backup_handle = current_thread_rt
.as_ref()
.unwrap_or_else(|| WAL_BACKUP_RUNTIME.handle())
.spawn(wal_backup::wal_backup_launcher_task_main(
conf_,
wal_backup_launcher_rx,
))
.map(|res| ("WAL backup launcher".to_owned(), res));
tasks_handles.push(Box::pin(wal_backup_handle));
set_build_info_metric(GIT_VERSION);
// TODO: update tokio-stream, convert to real async Stream with

View File

@@ -1,6 +1,7 @@
//! Code to deal with safekeeper control file upgrades
use crate::safekeeper::{
AcceptorState, PersistedPeers, PgUuid, SafeKeeperState, ServerInfo, Term, TermHistory, TermLsn,
AcceptorState, PersistedPeers, PgUuid, SafeKeeperState, ServerInfo, Term, TermHistory,
TermSwitchEntry,
};
use anyhow::{bail, Result};
use pq_proto::SystemId;
@@ -144,7 +145,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<SafeKeeperState>
let oldstate = SafeKeeperStateV1::des(&buf[..buf.len()])?;
let ac = AcceptorState {
term: oldstate.acceptor_state.term,
term_history: TermHistory(vec![TermLsn {
term_history: TermHistory(vec![TermSwitchEntry {
term: oldstate.acceptor_state.epoch,
lsn: Lsn(0),
}]),

View File

@@ -15,11 +15,8 @@ use tokio::fs::File;
use tokio::io::AsyncReadExt;
use utils::http::endpoint::request_span;
use crate::receive_wal::WalReceiverState;
use crate::safekeeper::ServerInfo;
use crate::safekeeper::Term;
use crate::send_wal::WalSenderState;
use crate::timeline::PeerInfo;
use crate::{debug_dump, pull_timeline};
use crate::timelines_global_map::TimelineDeleteForceResult;
@@ -102,9 +99,6 @@ pub struct TimelineStatus {
pub peer_horizon_lsn: Lsn,
#[serde_as(as = "DisplayFromStr")]
pub remote_consistent_lsn: Lsn,
pub peers: Vec<PeerInfo>,
pub walsenders: Vec<WalSenderState>,
pub walreceivers: Vec<WalReceiverState>,
}
fn check_permission(request: &Request<Body>, tenant_id: Option<TenantId>) -> Result<(), ApiError> {
@@ -142,7 +136,6 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
term_history,
};
let conf = get_conf(&request);
// Note: we report in memory values which can be lost.
let status = TimelineStatus {
tenant_id: ttid.tenant_id,
@@ -156,9 +149,6 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
backup_lsn: inmem.backup_lsn,
peer_horizon_lsn: inmem.peer_horizon_lsn,
remote_consistent_lsn: tli.get_walsenders().get_remote_consistent_lsn(),
peers: tli.get_peers(conf).await,
walsenders: tli.get_walsenders().get_all(),
walreceivers: tli.get_walreceivers().get_all(),
};
json_response(StatusCode::OK, status)
}
@@ -286,14 +276,12 @@ async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<B
tenant_id: ttid.tenant_id.as_ref().to_owned(),
timeline_id: ttid.timeline_id.as_ref().to_owned(),
}),
term: sk_info.term.unwrap_or(0),
last_log_term: sk_info.last_log_term.unwrap_or(0),
flush_lsn: sk_info.flush_lsn.0,
commit_lsn: sk_info.commit_lsn.0,
remote_consistent_lsn: sk_info.remote_consistent_lsn.0,
peer_horizon_lsn: sk_info.peer_horizon_lsn.0,
safekeeper_connstr: sk_info.safekeeper_connstr.unwrap_or_else(|| "".to_owned()),
http_connstr: sk_info.http_connstr.unwrap_or_else(|| "".to_owned()),
backup_lsn: sk_info.backup_lsn.0,
local_start_lsn: sk_info.local_start_lsn.0,
availability_zone: None,

View File

@@ -21,7 +21,7 @@ use crate::safekeeper::{AcceptorProposerMessage, AppendResponse, ServerInfo};
use crate::safekeeper::{
AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, ProposerElected,
};
use crate::safekeeper::{SafeKeeperState, Term, TermHistory, TermLsn};
use crate::safekeeper::{SafeKeeperState, Term, TermHistory, TermSwitchEntry};
use crate::timeline::Timeline;
use crate::GlobalTimelines;
use postgres_backend::PostgresBackend;
@@ -119,7 +119,7 @@ async fn send_proposer_elected(tli: &Arc<Timeline>, term: Term, lsn: Lsn) -> any
let history = tli.get_state().await.1.acceptor_state.term_history;
let history = history.up_to(lsn.checked_sub(1u64).unwrap());
let mut history_entries = history.0;
history_entries.push(TermLsn { term, lsn });
history_entries.push(TermSwitchEntry { term, lsn });
let history = TermHistory(history_entries);
let proposer_elected_request = ProposerAcceptorMessage::Elected(ProposerElected {

View File

@@ -19,7 +19,6 @@ pub mod json_ctrl;
pub mod metrics;
pub mod pull_timeline;
pub mod receive_wal;
pub mod recovery;
pub mod remove_wal;
pub mod safekeeper;
pub mod send_wal;

View File

@@ -227,9 +227,7 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result<Response>
tokio::fs::create_dir_all(conf.tenant_dir(&ttid.tenant_id)).await?;
tokio::fs::rename(tli_dir_path, &timeline_path).await?;
let tli = GlobalTimelines::load_timeline(ttid)
.await
.context("Failed to load timeline after copy")?;
let tli = GlobalTimelines::load_timeline(ttid).context("Failed to load timeline after copy")?;
info!(
"Loaded timeline {}, flush_lsn={}",

View File

@@ -11,16 +11,11 @@ use crate::wal_service::ConnectionId;
use crate::GlobalTimelines;
use anyhow::{anyhow, Context};
use bytes::BytesMut;
use parking_lot::MappedMutexGuard;
use parking_lot::Mutex;
use parking_lot::MutexGuard;
use postgres_backend::CopyStreamHandlerEnd;
use postgres_backend::PostgresBackend;
use postgres_backend::PostgresBackendReader;
use postgres_backend::QueryError;
use pq_proto::BeMessage;
use serde::Deserialize;
use serde::Serialize;
use std::net::SocketAddr;
use std::sync::Arc;
use tokio::io::AsyncRead;
@@ -37,105 +32,6 @@ use tracing::*;
use utils::id::TenantTimelineId;
use utils::lsn::Lsn;
/// Registry of WalReceivers (compute connections). Timeline holds it (wrapped
/// in Arc).
pub struct WalReceivers {
mutex: Mutex<WalReceiversShared>,
}
/// Id under which walreceiver is registered in shmem.
type WalReceiverId = usize;
impl WalReceivers {
pub fn new() -> Arc<WalReceivers> {
Arc::new(WalReceivers {
mutex: Mutex::new(WalReceiversShared { slots: Vec::new() }),
})
}
/// Register new walreceiver. Returned guard provides access to the slot and
/// automatically deregisters in Drop.
pub fn register(self: &Arc<WalReceivers>) -> WalReceiverGuard {
let slots = &mut self.mutex.lock().slots;
let walreceiver = WalReceiverState::Voting;
// find empty slot or create new one
let pos = if let Some(pos) = slots.iter().position(|s| s.is_none()) {
slots[pos] = Some(walreceiver);
pos
} else {
let pos = slots.len();
slots.push(Some(walreceiver));
pos
};
WalReceiverGuard {
id: pos,
walreceivers: self.clone(),
}
}
/// Get reference to locked slot contents. Slot must exist (registered
/// earlier).
fn get_slot<'a>(
self: &'a Arc<WalReceivers>,
id: WalReceiverId,
) -> MappedMutexGuard<'a, WalReceiverState> {
MutexGuard::map(self.mutex.lock(), |locked| {
locked.slots[id]
.as_mut()
.expect("walreceiver doesn't exist")
})
}
/// Get number of walreceivers (compute connections).
pub fn get_num(self: &Arc<WalReceivers>) -> usize {
self.mutex.lock().slots.iter().flatten().count()
}
/// Get state of all walreceivers.
pub fn get_all(self: &Arc<WalReceivers>) -> Vec<WalReceiverState> {
self.mutex.lock().slots.iter().flatten().cloned().collect()
}
/// Unregister walsender.
fn unregister(self: &Arc<WalReceivers>, id: WalReceiverId) {
let mut shared = self.mutex.lock();
shared.slots[id] = None;
}
}
/// Only a few connections are expected (normally one), so store in Vec.
struct WalReceiversShared {
slots: Vec<Option<WalReceiverState>>,
}
/// Walreceiver status. Currently only whether it passed voting stage and
/// started receiving the stream, but it is easy to add more if needed.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum WalReceiverState {
Voting,
Streaming,
}
/// Scope guard to access slot in WalSenders registry and unregister from it in
/// Drop.
pub struct WalReceiverGuard {
id: WalReceiverId,
walreceivers: Arc<WalReceivers>,
}
impl WalReceiverGuard {
/// Get reference to locked shared state contents.
fn get(&self) -> MappedMutexGuard<WalReceiverState> {
self.walreceivers.get_slot(self.id)
}
}
impl Drop for WalReceiverGuard {
fn drop(&mut self) {
self.walreceivers.unregister(self.id);
}
}
const MSG_QUEUE_SIZE: usize = 256;
const REPLY_QUEUE_SIZE: usize = 16;
@@ -350,13 +246,10 @@ impl WalAcceptor {
/// it must mean that network thread terminated.
async fn run(&mut self) -> anyhow::Result<()> {
// Register the connection and defer unregister.
// Order of the next two lines is important: we want first to remove our entry and then
// update status which depends on registered connections.
let _compute_conn_guard = ComputeConnectionGuard {
self.tli.on_compute_connect().await?;
let _guard = ComputeConnectionGuard {
timeline: Arc::clone(&self.tli),
};
let walreceiver_guard = self.tli.get_walreceivers().register();
self.tli.update_status_notify().await?;
// After this timestamp we will stop processing AppendRequests and send a response
// to the walproposer. walproposer sends at least one AppendRequest per second,
@@ -370,11 +263,6 @@ impl WalAcceptor {
}
let mut next_msg = opt_msg.unwrap();
// Update walreceiver state in shmem for reporting.
if let ProposerAcceptorMessage::Elected(_) = &next_msg {
*walreceiver_guard.get() = WalReceiverState::Streaming;
}
let reply_msg = if matches!(next_msg, ProposerAcceptorMessage::AppendRequest(_)) {
// loop through AppendRequest's while it's readily available to
// write as many WAL as possible without fsyncing
@@ -423,7 +311,6 @@ impl WalAcceptor {
}
}
/// Calls update_status_notify in drop to update timeline status.
struct ComputeConnectionGuard {
timeline: Arc<Timeline>,
}
@@ -431,9 +318,11 @@ struct ComputeConnectionGuard {
impl Drop for ComputeConnectionGuard {
fn drop(&mut self) {
let tli = self.timeline.clone();
// tokio forbids to call blocking_send inside the runtime, and see
// comments in on_compute_disconnect why we call blocking_send.
tokio::spawn(async move {
if let Err(e) = tli.update_status_notify().await {
error!("failed to update timeline status: {}", e);
if let Err(e) = tli.on_compute_disconnect().await {
error!("failed to unregister compute connection: {}", e);
}
});
}

View File

@@ -1,40 +0,0 @@
//! This module implements pulling WAL from peer safekeepers if compute can't
//! provide it, i.e. safekeeper lags too much.
use std::sync::Arc;
use tokio::{select, time::sleep, time::Duration};
use tracing::{info, instrument};
use crate::{timeline::Timeline, SafeKeeperConf};
/// Entrypoint for per timeline task which always runs, checking whether
/// recovery for this safekeeper is needed and starting it if so.
#[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))]
pub async fn recovery_main(tli: Arc<Timeline>, _conf: SafeKeeperConf) {
info!("started");
let mut cancellation_rx = match tli.get_cancellation_rx() {
Ok(rx) => rx,
Err(_) => {
info!("timeline canceled during task start");
return;
}
};
select! {
_ = recovery_main_loop(tli) => { unreachable!() }
_ = cancellation_rx.changed() => {
info!("stopped");
}
}
}
const CHECK_INTERVAL_MS: u64 = 2000;
/// Check regularly whether we need to start recovery.
async fn recovery_main_loop(_tli: Arc<Timeline>) {
let check_duration = Duration::from_millis(CHECK_INTERVAL_MS);
loop {
sleep(check_duration).await;
}
}

View File

@@ -34,33 +34,22 @@ pub const UNKNOWN_SERVER_VERSION: u32 = 0;
/// Consensus logical timestamp.
pub type Term = u64;
pub const INVALID_TERM: Term = 0;
const INVALID_TERM: Term = 0;
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub struct TermLsn {
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub struct TermSwitchEntry {
pub term: Term,
pub lsn: Lsn,
}
// Creation from tuple provides less typing (e.g. for unit tests).
impl From<(Term, Lsn)> for TermLsn {
fn from(pair: (Term, Lsn)) -> TermLsn {
TermLsn {
term: pair.0,
lsn: pair.1,
}
}
}
#[derive(Clone, Serialize, Deserialize)]
pub struct TermHistory(pub Vec<TermLsn>);
pub struct TermHistory(pub Vec<TermSwitchEntry>);
impl TermHistory {
pub fn empty() -> TermHistory {
TermHistory(Vec::new())
}
// Parse TermHistory as n_entries followed by TermLsn pairs
// Parse TermHistory as n_entries followed by TermSwitchEntry pairs
pub fn from_bytes(bytes: &mut Bytes) -> Result<TermHistory> {
if bytes.remaining() < 4 {
bail!("TermHistory misses len");
@@ -71,7 +60,7 @@ impl TermHistory {
if bytes.remaining() < 16 {
bail!("TermHistory is incomplete");
}
res.push(TermLsn {
res.push(TermSwitchEntry {
term: bytes.get_u64_le(),
lsn: bytes.get_u64_le().into(),
})
@@ -568,17 +557,12 @@ where
.up_to(self.flush_lsn())
}
/// Get current term.
pub fn get_term(&self) -> Term {
self.state.acceptor_state.term
}
pub fn get_epoch(&self) -> Term {
self.state.acceptor_state.get_epoch(self.flush_lsn())
}
/// wal_store wrapper avoiding commit_lsn <= flush_lsn violation when we don't have WAL yet.
pub fn flush_lsn(&self) -> Lsn {
fn flush_lsn(&self) -> Lsn {
max(self.wal_store.flush_lsn(), self.state.timeline_start_lsn)
}
@@ -1154,7 +1138,7 @@ mod tests {
let pem = ProposerElected {
term: 1,
start_streaming_at: Lsn(1),
term_history: TermHistory(vec![TermLsn {
term_history: TermHistory(vec![TermSwitchEntry {
term: 1,
lsn: Lsn(3),
}]),

View File

@@ -2,12 +2,12 @@
//! with the "START_REPLICATION" message, and registry of walsenders.
use crate::handler::SafekeeperPostgresHandler;
use crate::safekeeper::{Term, TermLsn};
use crate::safekeeper::Term;
use crate::timeline::Timeline;
use crate::wal_service::ConnectionId;
use crate::wal_storage::WalReader;
use crate::GlobalTimelines;
use anyhow::{bail, Context as AnyhowContext};
use anyhow::Context as AnyhowContext;
use bytes::Bytes;
use parking_lot::Mutex;
use postgres_backend::PostgresBackend;
@@ -390,25 +390,26 @@ impl SafekeeperPostgresHandler {
self.appname.clone(),
));
// Walsender can operate in one of two modes which we select by
// application_name: give only committed WAL (used by pageserver) or all
// existing WAL (up to flush_lsn, used by walproposer or peer recovery).
// The second case is always driven by a consensus leader which term
// must generally be also supplied. However we're sloppy to do this in
// walproposer recovery which will be removed soon. So TODO is to make
// it not Option'al then.
let commit_lsn_watch_rx = tli.get_commit_lsn_watch_rx();
// Walproposer gets special handling: safekeeper must give proposer all
// local WAL till the end, whether committed or not (walproposer will
// hang otherwise). That's because walproposer runs the consensus and
// synchronizes safekeepers on the most advanced one.
//
// Fetching WAL without term in recovery creates a small risk of this
// WAL getting concurrently garbaged if another compute rises which
// collects majority and starts fixing log on this safekeeper itself.
// That's ok as (old) proposer will never be able to commit such WAL.
let end_watch = if self.is_walproposer_recovery() {
EndWatch::Flush(tli.get_term_flush_lsn_watch_rx())
// There is a small risk of this WAL getting concurrently garbaged if
// another compute rises which collects majority and starts fixing log
// on this safekeeper itself. That's ok as (old) proposer will never be
// able to commit such WAL.
let stop_pos: Option<Lsn> = if self.is_walproposer_recovery() {
let wal_end = tli.get_flush_lsn().await;
Some(wal_end)
} else {
EndWatch::Commit(tli.get_commit_lsn_watch_rx())
None
};
// we don't check term here; it will be checked on first waiting/WAL reading anyway.
let end_pos = end_watch.get();
// take the latest commit_lsn if don't have stop_pos
let end_pos = stop_pos.unwrap_or(*commit_lsn_watch_rx.borrow());
if end_pos < start_pos {
warn!(
@@ -418,10 +419,8 @@ impl SafekeeperPostgresHandler {
}
info!(
"starting streaming from {:?}, available WAL ends at {}, recovery={}",
start_pos,
end_pos,
matches!(end_watch, EndWatch::Flush(_))
"starting streaming from {:?} till {:?}, available WAL ends at {}",
start_pos, stop_pos, end_pos
);
// switch to copy
@@ -446,8 +445,9 @@ impl SafekeeperPostgresHandler {
appname,
start_pos,
end_pos,
stop_pos,
term,
end_watch,
commit_lsn_watch_rx,
ws_guard: ws_guard.clone(),
wal_reader,
send_buf: [0; MAX_SEND_SIZE],
@@ -466,32 +466,6 @@ impl SafekeeperPostgresHandler {
}
}
/// Walsender streams either up to commit_lsn (normally) or flush_lsn in the
/// given term (recovery by walproposer or peer safekeeper).
enum EndWatch {
Commit(Receiver<Lsn>),
Flush(Receiver<TermLsn>),
}
impl EndWatch {
/// Get current end of WAL.
fn get(&self) -> Lsn {
match self {
EndWatch::Commit(r) => *r.borrow(),
EndWatch::Flush(r) => r.borrow().lsn,
}
}
/// Wait for the update.
async fn changed(&mut self) -> anyhow::Result<()> {
match self {
EndWatch::Commit(r) => r.changed().await?,
EndWatch::Flush(r) => r.changed().await?,
}
Ok(())
}
}
/// A half driving sending WAL.
struct WalSender<'a, IO> {
pgb: &'a mut PostgresBackend<IO>,
@@ -506,12 +480,14 @@ struct WalSender<'a, IO> {
// We send this LSN to the receiver as wal_end, so that it knows how much
// WAL this safekeeper has. This LSN should be as fresh as possible.
end_pos: Lsn,
// If present, terminate after reaching this position; used by walproposer
// in recovery.
stop_pos: Option<Lsn>,
/// When streaming uncommitted part, the term the client acts as the leader
/// in. Streaming is stopped if local term changes to a different (higher)
/// value.
term: Option<Term>,
/// Watch channel receiver to learn end of available WAL (and wait for its advancement).
end_watch: EndWatch,
commit_lsn_watch_rx: Receiver<Lsn>,
ws_guard: Arc<WalSenderGuard>,
wal_reader: WalReader,
// buffer for readling WAL into to send it
@@ -521,20 +497,29 @@ struct WalSender<'a, IO> {
impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
/// Send WAL until
/// - an error occurs
/// - receiver is caughtup and there is no computes (if streaming up to commit_lsn)
/// - if we are streaming to walproposer, we've streamed until stop_pos
/// (recovery finished)
/// - receiver is caughtup and there is no computes
///
/// Err(CopyStreamHandlerEnd) is always returned; Result is used only for ?
/// convenience.
async fn run(&mut self) -> Result<(), CopyStreamHandlerEnd> {
loop {
// Wait for the next portion if it is not there yet, or just
// update our end of WAL available for sending value, we
// communicate it to the receiver.
self.wait_wal().await?;
assert!(
self.end_pos > self.start_pos,
"nothing to send after waiting for WAL"
);
// If we are streaming to walproposer, check it is time to stop.
if let Some(stop_pos) = self.stop_pos {
if self.start_pos >= stop_pos {
// recovery finished
return Err(CopyStreamHandlerEnd::ServerInitiated(format!(
"ending streaming to walproposer at {}, recovery finished",
self.start_pos
)));
}
} else {
// Wait for the next portion if it is not there yet, or just
// update our end of WAL available for sending value, we
// communicate it to the receiver.
self.wait_wal().await?;
}
// try to send as much as available, capped by MAX_SEND_SIZE
let mut send_size = self
@@ -582,7 +567,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
/// exit in the meanwhile
async fn wait_wal(&mut self) -> Result<(), CopyStreamHandlerEnd> {
loop {
self.end_pos = self.end_watch.get();
self.end_pos = *self.commit_lsn_watch_rx.borrow();
if self.end_pos > self.start_pos {
// We have something to send.
trace!("got end_pos {:?}, streaming", self.end_pos);
@@ -590,31 +575,27 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
}
// Wait for WAL to appear, now self.end_pos == self.start_pos.
if let Some(lsn) = wait_for_lsn(&mut self.end_watch, self.term, self.start_pos).await? {
if let Some(lsn) = wait_for_lsn(&mut self.commit_lsn_watch_rx, self.start_pos).await? {
self.end_pos = lsn;
trace!("got end_pos {:?}, streaming", self.end_pos);
return Ok(());
}
// Timed out waiting for WAL, check for termination and send KA.
// Check for termination only if we are streaming up to commit_lsn
// (to pageserver).
if let EndWatch::Commit(_) = self.end_watch {
if let Some(remote_consistent_lsn) = self
.ws_guard
.walsenders
.get_ws_remote_consistent_lsn(self.ws_guard.id)
{
if self.tli.should_walsender_stop(remote_consistent_lsn).await {
// Terminate if there is nothing more to send.
// Note that "ending streaming" part of the string is used by
// pageserver to identify WalReceiverError::SuccessfulCompletion,
// do not change this string without updating pageserver.
return Err(CopyStreamHandlerEnd::ServerInitiated(format!(
// Timed out waiting for WAL, check for termination and send KA
if let Some(remote_consistent_lsn) = self
.ws_guard
.walsenders
.get_ws_remote_consistent_lsn(self.ws_guard.id)
{
if self.tli.should_walsender_stop(remote_consistent_lsn).await {
// Terminate if there is nothing more to send.
// Note that "ending streaming" part of the string is used by
// pageserver to identify WalReceiverError::SuccessfulCompletion,
// do not change this string without updating pageserver.
return Err(CopyStreamHandlerEnd::ServerInitiated(format!(
"ending streaming to {:?} at {}, receiver is caughtup and there is no computes",
self.appname, self.start_pos,
)));
}
}
}
@@ -682,32 +663,22 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> ReplyReader<IO> {
const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1);
/// Wait until we have available WAL > start_pos or timeout expires. Returns
/// - Ok(Some(end_pos)) if needed lsn is successfully observed;
/// Wait until we have commit_lsn > lsn or timeout expires. Returns
/// - Ok(Some(commit_lsn)) if needed lsn is successfully observed;
/// - Ok(None) if timeout expired;
/// - Err in case of error -- only if 1) term changed while fetching in recovery
/// mode 2) watch channel closed, which must never happen.
async fn wait_for_lsn(
rx: &mut EndWatch,
client_term: Option<Term>,
start_pos: Lsn,
) -> anyhow::Result<Option<Lsn>> {
/// - Err in case of error (if watch channel is in trouble, shouldn't happen).
async fn wait_for_lsn(rx: &mut Receiver<Lsn>, lsn: Lsn) -> anyhow::Result<Option<Lsn>> {
let res = timeout(POLL_STATE_TIMEOUT, async move {
let mut commit_lsn;
loop {
let end_pos = rx.get();
if end_pos > start_pos {
return Ok(end_pos);
}
if let EndWatch::Flush(rx) = rx {
let curr_term = rx.borrow().term;
if let Some(client_term) = client_term {
if curr_term != client_term {
bail!("term changed: requested {}, now {}", client_term, curr_term);
}
}
}
rx.changed().await?;
commit_lsn = *rx.borrow();
if commit_lsn > lsn {
break;
}
}
Ok(commit_lsn)
})
.await;

View File

@@ -3,11 +3,8 @@
use anyhow::{anyhow, bail, Result};
use postgres_ffi::XLogSegNo;
use serde::{Deserialize, Serialize};
use serde_with::serde_as;
use tokio::fs;
use serde_with::DisplayFromStr;
use std::cmp::max;
use std::path::PathBuf;
use std::sync::Arc;
@@ -26,11 +23,9 @@ use utils::{
use storage_broker::proto::SafekeeperTimelineInfo;
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
use crate::receive_wal::WalReceivers;
use crate::recovery::recovery_main;
use crate::safekeeper::{
AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState,
SafekeeperMemState, ServerInfo, Term, TermLsn, INVALID_TERM,
SafekeeperMemState, ServerInfo, Term,
};
use crate::send_wal::WalSenders;
use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION};
@@ -41,25 +36,18 @@ use crate::SafeKeeperConf;
use crate::{debug_dump, wal_storage};
/// Things safekeeper should know about timeline state on peers.
#[serde_as]
#[derive(Debug, Clone, Serialize, Deserialize)]
#[derive(Debug, Clone)]
pub struct PeerInfo {
pub sk_id: NodeId,
/// Term of the last entry.
_last_log_term: Term,
/// LSN of the last record.
#[serde_as(as = "DisplayFromStr")]
_flush_lsn: Lsn,
#[serde_as(as = "DisplayFromStr")]
pub commit_lsn: Lsn,
/// Since which LSN safekeeper has WAL. TODO: remove this once we fill new
/// sk since backup_lsn.
#[serde_as(as = "DisplayFromStr")]
pub local_start_lsn: Lsn,
/// When info was received. Serde annotations are not very useful but make
/// the code compile -- we don't rely on this field externally.
#[serde(skip)]
#[serde(default = "Instant::now")]
/// When info was received.
ts: Instant,
}
@@ -176,8 +164,8 @@ impl SharedState {
})
}
fn is_active(&self, num_computes: usize, remote_consistent_lsn: Lsn) -> bool {
self.is_wal_backup_required(num_computes)
fn is_active(&self, remote_consistent_lsn: Lsn) -> bool {
self.is_wal_backup_required()
// FIXME: add tracking of relevant pageservers and check them here individually,
// otherwise migration won't work (we suspend too early).
|| remote_consistent_lsn < self.sk.inmem.commit_lsn
@@ -185,34 +173,29 @@ impl SharedState {
/// Mark timeline active/inactive and return whether s3 offloading requires
/// start/stop action.
fn update_status(
&mut self,
num_computes: usize,
remote_consistent_lsn: Lsn,
ttid: TenantTimelineId,
) -> bool {
let is_active = self.is_active(num_computes, remote_consistent_lsn);
fn update_status(&mut self, remote_consistent_lsn: Lsn, ttid: TenantTimelineId) -> bool {
let is_active = self.is_active(remote_consistent_lsn);
if self.active != is_active {
info!("timeline {} active={} now", ttid, is_active);
}
self.active = is_active;
self.is_wal_backup_action_pending(num_computes)
self.is_wal_backup_action_pending()
}
/// Should we run s3 offloading in current state?
fn is_wal_backup_required(&self, num_computes: usize) -> bool {
fn is_wal_backup_required(&self) -> bool {
let seg_size = self.get_wal_seg_size();
num_computes > 0 ||
self.num_computes > 0 ||
// Currently only the whole segment is offloaded, so compare segment numbers.
(self.sk.inmem.commit_lsn.segment_number(seg_size) >
self.sk.inmem.backup_lsn.segment_number(seg_size))
(self.sk.inmem.commit_lsn.segment_number(seg_size) >
self.sk.inmem.backup_lsn.segment_number(seg_size))
}
/// Is current state of s3 offloading is not what it ought to be?
fn is_wal_backup_action_pending(&self, num_computes: usize) -> bool {
let res = self.wal_backup_active != self.is_wal_backup_required(num_computes);
fn is_wal_backup_action_pending(&self) -> bool {
let res = self.wal_backup_active != self.is_wal_backup_required();
if res {
let action_pending = if self.is_wal_backup_required(num_computes) {
let action_pending = if self.is_wal_backup_required() {
"start"
} else {
"stop"
@@ -227,8 +210,8 @@ impl SharedState {
/// Returns whether s3 offloading is required and sets current status as
/// matching.
fn wal_backup_attend(&mut self, num_computes: usize) -> bool {
self.wal_backup_active = self.is_wal_backup_required(num_computes);
fn wal_backup_attend(&mut self) -> bool {
self.wal_backup_active = self.is_wal_backup_required();
self.wal_backup_active
}
@@ -248,9 +231,8 @@ impl SharedState {
tenant_id: ttid.tenant_id.as_ref().to_owned(),
timeline_id: ttid.timeline_id.as_ref().to_owned(),
}),
term: self.sk.state.acceptor_state.term,
last_log_term: self.sk.get_epoch(),
flush_lsn: self.sk.flush_lsn().0,
flush_lsn: self.sk.wal_store.flush_lsn().0,
// note: this value is not flushed to control file yet and can be lost
commit_lsn: self.sk.inmem.commit_lsn.0,
remote_consistent_lsn: remote_consistent_lsn.0,
@@ -259,7 +241,6 @@ impl SharedState {
.advertise_pg_addr
.to_owned()
.unwrap_or(conf.listen_pg_addr.clone()),
http_connstr: conf.listen_http_addr.to_owned(),
backup_lsn: self.sk.inmem.backup_lsn.0,
local_start_lsn: self.sk.state.local_start_lsn.0,
availability_zone: conf.availability_zone.clone(),
@@ -309,19 +290,11 @@ pub struct Timeline {
commit_lsn_watch_tx: watch::Sender<Lsn>,
commit_lsn_watch_rx: watch::Receiver<Lsn>,
/// Broadcasts (current term, flush_lsn) updates, walsender is interested in
/// them when sending in recovery mode (to walproposer or peers). Note: this
/// is just a notification, WAL reading should always done with lock held as
/// term can change otherwise.
term_flush_lsn_watch_tx: watch::Sender<TermLsn>,
term_flush_lsn_watch_rx: watch::Receiver<TermLsn>,
/// Safekeeper and other state, that should remain consistent and
/// synchronized with the disk. This is tokio mutex as we write WAL to disk
/// while holding it, ensuring that consensus checks are in order.
mutex: Mutex<SharedState>,
walsenders: Arc<WalSenders>,
walreceivers: Arc<WalReceivers>,
/// Cancellation channel. Delete/cancel will send `true` here as a cancellation signal.
cancellation_tx: watch::Sender<bool>,
@@ -337,20 +310,16 @@ pub struct Timeline {
impl Timeline {
/// Load existing timeline from disk.
pub fn load_timeline(
conf: &SafeKeeperConf,
conf: SafeKeeperConf,
ttid: TenantTimelineId,
wal_backup_launcher_tx: Sender<TenantTimelineId>,
) -> Result<Timeline> {
let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered();
let shared_state = SharedState::restore(conf, &ttid)?;
let shared_state = SharedState::restore(&conf, &ttid)?;
let rcl = shared_state.sk.state.remote_consistent_lsn;
let (commit_lsn_watch_tx, commit_lsn_watch_rx) =
watch::channel(shared_state.sk.state.commit_lsn);
let (term_flush_lsn_watch_tx, term_flush_lsn_watch_rx) = watch::channel(TermLsn::from((
shared_state.sk.get_term(),
shared_state.sk.flush_lsn(),
)));
let (cancellation_tx, cancellation_rx) = watch::channel(false);
Ok(Timeline {
@@ -358,11 +327,8 @@ impl Timeline {
wal_backup_launcher_tx,
commit_lsn_watch_tx,
commit_lsn_watch_rx,
term_flush_lsn_watch_tx,
term_flush_lsn_watch_rx,
mutex: Mutex::new(shared_state),
walsenders: WalSenders::new(rcl),
walreceivers: WalReceivers::new(),
cancellation_rx,
cancellation_tx,
timeline_dir: conf.timeline_dir(&ttid),
@@ -371,7 +337,7 @@ impl Timeline {
/// Create a new timeline, which is not yet persisted to disk.
pub fn create_empty(
conf: &SafeKeeperConf,
conf: SafeKeeperConf,
ttid: TenantTimelineId,
wal_backup_launcher_tx: Sender<TenantTimelineId>,
server_info: ServerInfo,
@@ -379,8 +345,6 @@ impl Timeline {
local_start_lsn: Lsn,
) -> Result<Timeline> {
let (commit_lsn_watch_tx, commit_lsn_watch_rx) = watch::channel(Lsn::INVALID);
let (term_flush_lsn_watch_tx, term_flush_lsn_watch_rx) =
watch::channel(TermLsn::from((INVALID_TERM, Lsn::INVALID)));
let (cancellation_tx, cancellation_rx) = watch::channel(false);
let state = SafeKeeperState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn);
@@ -389,27 +353,20 @@ impl Timeline {
wal_backup_launcher_tx,
commit_lsn_watch_tx,
commit_lsn_watch_rx,
term_flush_lsn_watch_tx,
term_flush_lsn_watch_rx,
mutex: Mutex::new(SharedState::create_new(conf, &ttid, state)?),
mutex: Mutex::new(SharedState::create_new(&conf, &ttid, state)?),
walsenders: WalSenders::new(Lsn(0)),
walreceivers: WalReceivers::new(),
cancellation_rx,
cancellation_tx,
timeline_dir: conf.timeline_dir(&ttid),
})
}
/// Initialize fresh timeline on disk and start background tasks. If init
/// Initialize fresh timeline on disk and start background tasks. If bootstrap
/// fails, timeline is cancelled and cannot be used anymore.
///
/// Init is transactional, so if it fails, created files will be deleted,
/// Bootstrap is transactional, so if it fails, created files will be deleted,
/// and state on disk should remain unchanged.
pub async fn init_new(
self: &Arc<Timeline>,
shared_state: &mut MutexGuard<'_, SharedState>,
conf: &SafeKeeperConf,
) -> Result<()> {
pub async fn bootstrap(&self, shared_state: &mut MutexGuard<'_, SharedState>) -> Result<()> {
match fs::metadata(&self.timeline_dir).await {
Ok(_) => {
// Timeline directory exists on disk, we should leave state unchanged
@@ -425,7 +382,7 @@ impl Timeline {
// Create timeline directory.
fs::create_dir_all(&self.timeline_dir).await?;
// Write timeline to disk and start background tasks.
// Write timeline to disk and TODO: start background tasks.
if let Err(e) = shared_state.sk.persist().await {
// Bootstrap failed, cancel timeline and remove timeline directory.
self.cancel(shared_state);
@@ -439,14 +396,10 @@ impl Timeline {
return Err(e);
}
self.bootstrap(conf);
Ok(())
}
/// Bootstrap new or existing timeline starting background stasks.
pub fn bootstrap(self: &Arc<Timeline>, conf: &SafeKeeperConf) {
// Start recovery task which always runs on the timeline.
tokio::spawn(recovery_main(self.clone(), conf.clone()));
// TODO: add more initialization steps here
self.update_status(shared_state);
Ok(())
}
/// Delete timeline from disk completely, by removing timeline directory. Background
@@ -482,38 +435,46 @@ impl Timeline {
*self.cancellation_rx.borrow()
}
/// Returns watch channel which gets value when timeline is cancelled. It is
/// guaranteed to have not cancelled value observed (errors otherwise).
pub fn get_cancellation_rx(&self) -> Result<watch::Receiver<bool>> {
let rx = self.cancellation_rx.clone();
if *rx.borrow() {
bail!(TimelineError::Cancelled(self.ttid));
}
Ok(rx)
}
/// Take a writing mutual exclusive lock on timeline shared_state.
pub async fn write_shared_state(&self) -> MutexGuard<SharedState> {
self.mutex.lock().await
}
fn update_status(&self, shared_state: &mut SharedState) -> bool {
shared_state.update_status(
self.walreceivers.get_num(),
self.get_walsenders().get_remote_consistent_lsn(),
self.ttid,
)
shared_state.update_status(self.get_walsenders().get_remote_consistent_lsn(), self.ttid)
}
/// Update timeline status and kick wal backup launcher to stop/start offloading if needed.
pub async fn update_status_notify(&self) -> Result<()> {
/// Register compute connection, starting timeline-related activity if it is
/// not running yet.
pub async fn on_compute_connect(&self) -> Result<()> {
if self.is_cancelled() {
bail!(TimelineError::Cancelled(self.ttid));
}
let is_wal_backup_action_pending: bool = {
let is_wal_backup_action_pending: bool;
{
let mut shared_state = self.write_shared_state().await;
self.update_status(&mut shared_state)
};
shared_state.num_computes += 1;
is_wal_backup_action_pending = self.update_status(&mut shared_state);
}
// Wake up wal backup launcher, if offloading not started yet.
if is_wal_backup_action_pending {
// Can fail only if channel to a static thread got closed, which is not normal at all.
self.wal_backup_launcher_tx.send(self.ttid).await?;
}
Ok(())
}
/// De-register compute connection, shutting down timeline activity if
/// pageserver doesn't need catchup.
pub async fn on_compute_disconnect(&self) -> Result<()> {
let is_wal_backup_action_pending: bool;
{
let mut shared_state = self.write_shared_state().await;
shared_state.num_computes -= 1;
is_wal_backup_action_pending = self.update_status(&mut shared_state);
}
// Wake up wal backup launcher, if it is time to stop the offloading.
if is_wal_backup_action_pending {
// Can fail only if channel to a static thread got closed, which is not normal at all.
self.wal_backup_launcher_tx.send(self.ttid).await?;
@@ -558,9 +519,7 @@ impl Timeline {
return false;
}
self.write_shared_state()
.await
.wal_backup_attend(self.walreceivers.get_num())
self.write_shared_state().await.wal_backup_attend()
}
/// Returns commit_lsn watch channel.
@@ -568,11 +527,6 @@ impl Timeline {
self.commit_lsn_watch_rx.clone()
}
/// Returns term_flush_lsn watch channel.
pub fn get_term_flush_lsn_watch_rx(&self) -> watch::Receiver<TermLsn> {
self.term_flush_lsn_watch_rx.clone()
}
/// Pass arrived message to the safekeeper.
pub async fn process_msg(
&self,
@@ -584,7 +538,6 @@ impl Timeline {
let mut rmsg: Option<AcceptorProposerMessage>;
let commit_lsn: Lsn;
let term_flush_lsn: TermLsn;
{
let mut shared_state = self.write_shared_state().await;
rmsg = shared_state.sk.process_msg(msg).await?;
@@ -598,11 +551,8 @@ impl Timeline {
}
commit_lsn = shared_state.sk.inmem.commit_lsn;
term_flush_lsn =
TermLsn::from((shared_state.sk.get_term(), shared_state.sk.flush_lsn()));
}
self.commit_lsn_watch_tx.send(commit_lsn)?;
self.term_flush_lsn_watch_tx.send(term_flush_lsn)?;
Ok(rmsg)
}
@@ -700,10 +650,6 @@ impl Timeline {
&self.walsenders
}
pub fn get_walreceivers(&self) -> &Arc<WalReceivers> {
&self.walreceivers
}
/// Returns flush_lsn.
pub async fn get_flush_lsn(&self) -> Lsn {
self.write_shared_state().await.sk.wal_store.flush_lsn()

View File

@@ -11,7 +11,7 @@ use serde::Serialize;
use std::collections::HashMap;
use std::path::PathBuf;
use std::str::FromStr;
use std::sync::{Arc, Mutex};
use std::sync::{Arc, Mutex, MutexGuard};
use tokio::sync::mpsc::Sender;
use tracing::*;
use utils::id::{TenantId, TenantTimelineId, TimelineId};
@@ -71,23 +71,19 @@ pub struct GlobalTimelines;
impl GlobalTimelines {
/// Inject dependencies needed for the timeline constructors and load all timelines to memory.
pub async fn init(
pub fn init(
conf: SafeKeeperConf,
wal_backup_launcher_tx: Sender<TenantTimelineId>,
) -> Result<()> {
// clippy isn't smart enough to understand that drop(state) releases the
// lock, so use explicit block
let tenants_dir = {
let mut state = TIMELINES_STATE.lock().unwrap();
assert!(state.wal_backup_launcher_tx.is_none());
state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx);
state.conf = Some(conf);
let mut state = TIMELINES_STATE.lock().unwrap();
assert!(state.wal_backup_launcher_tx.is_none());
state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx);
state.conf = Some(conf);
// Iterate through all directories and load tenants for all directories
// named as a valid tenant_id.
state.get_conf().workdir.clone()
};
// Iterate through all directories and load tenants for all directories
// named as a valid tenant_id.
let mut tenant_count = 0;
let tenants_dir = state.get_conf().workdir.clone();
for tenants_dir_entry in std::fs::read_dir(&tenants_dir)
.with_context(|| format!("failed to list tenants dir {}", tenants_dir.display()))?
{
@@ -97,7 +93,7 @@ impl GlobalTimelines {
TenantId::from_str(tenants_dir_entry.file_name().to_str().unwrap_or(""))
{
tenant_count += 1;
GlobalTimelines::load_tenant_timelines(tenant_id).await?;
GlobalTimelines::load_tenant_timelines(&mut state, tenant_id)?;
}
}
Err(e) => error!(
@@ -112,7 +108,7 @@ impl GlobalTimelines {
info!(
"found {} tenants directories, successfully loaded {} timelines",
tenant_count,
TIMELINES_STATE.lock().unwrap().timelines.len()
state.timelines.len()
);
Ok(())
}
@@ -120,21 +116,17 @@ impl GlobalTimelines {
/// Loads all timelines for the given tenant to memory. Returns fs::read_dir
/// errors if any.
///
/// It is async for update_status_notify sake. Since TIMELINES_STATE lock is
/// sync and there is no important reason to make it async (it is always
/// held for a short while) we just lock and unlock it for each timeline --
/// this function is called during init when nothing else is running, so
/// this is fine.
async fn load_tenant_timelines(tenant_id: TenantId) -> Result<()> {
let (conf, wal_backup_launcher_tx) = {
let state = TIMELINES_STATE.lock().unwrap();
(
state.get_conf().clone(),
state.wal_backup_launcher_tx.as_ref().unwrap().clone(),
)
};
let timelines_dir = conf.tenant_dir(&tenant_id);
/// Note: This function (and all reading/loading below) is sync because
/// timelines are loaded while holding GlobalTimelinesState lock. Which is
/// fine as this is called only from single threaded main runtime on boot,
/// but clippy complains anyway, and suppressing that isn't trivial as async
/// is the keyword, ha. That only other user is pull_timeline.rs for which
/// being blocked is not that bad, and we can do spawn_blocking.
fn load_tenant_timelines(
state: &mut MutexGuard<'_, GlobalTimelinesState>,
tenant_id: TenantId,
) -> Result<()> {
let timelines_dir = state.get_conf().tenant_dir(&tenant_id);
for timelines_dir_entry in std::fs::read_dir(&timelines_dir)
.with_context(|| format!("failed to list timelines dir {}", timelines_dir.display()))?
{
@@ -144,16 +136,13 @@ impl GlobalTimelines {
TimelineId::from_str(timeline_dir_entry.file_name().to_str().unwrap_or(""))
{
let ttid = TenantTimelineId::new(tenant_id, timeline_id);
match Timeline::load_timeline(&conf, ttid, wal_backup_launcher_tx.clone()) {
match Timeline::load_timeline(
state.get_conf().clone(),
ttid,
state.wal_backup_launcher_tx.as_ref().unwrap().clone(),
) {
Ok(timeline) => {
let tli = Arc::new(timeline);
TIMELINES_STATE
.lock()
.unwrap()
.timelines
.insert(ttid, tli.clone());
tli.bootstrap(&conf);
tli.update_status_notify().await.unwrap();
state.timelines.insert(ttid, Arc::new(timeline));
}
// If we can't load a timeline, it's most likely because of a corrupted
// directory. We will log an error and won't allow to delete/recreate
@@ -179,22 +168,18 @@ impl GlobalTimelines {
}
/// Load timeline from disk to the memory.
pub async fn load_timeline(ttid: TenantTimelineId) -> Result<Arc<Timeline>> {
pub fn load_timeline(ttid: TenantTimelineId) -> Result<Arc<Timeline>> {
let (conf, wal_backup_launcher_tx) = TIMELINES_STATE.lock().unwrap().get_dependencies();
match Timeline::load_timeline(&conf, ttid, wal_backup_launcher_tx) {
match Timeline::load_timeline(conf, ttid, wal_backup_launcher_tx) {
Ok(timeline) => {
let tli = Arc::new(timeline);
// TODO: prevent concurrent timeline creation/loading
TIMELINES_STATE
.lock()
.unwrap()
.timelines
.insert(ttid, tli.clone());
tli.bootstrap(&conf);
Ok(tli)
}
// If we can't load a timeline, it's bad. Caller will figure it out.
@@ -232,7 +217,7 @@ impl GlobalTimelines {
info!("creating new timeline {}", ttid);
let timeline = Arc::new(Timeline::create_empty(
&conf,
conf,
ttid,
wal_backup_launcher_tx,
server_info,
@@ -255,24 +240,23 @@ impl GlobalTimelines {
// Write the new timeline to the disk and start background workers.
// Bootstrap is transactional, so if it fails, the timeline will be deleted,
// and the state on disk should remain unchanged.
if let Err(e) = timeline.init_new(&mut shared_state, &conf).await {
// Note: the most likely reason for init failure is that the timeline
if let Err(e) = timeline.bootstrap(&mut shared_state).await {
// Note: the most likely reason for bootstrap failure is that the timeline
// directory already exists on disk. This happens when timeline is corrupted
// and wasn't loaded from disk on startup because of that. We want to preserve
// the timeline directory in this case, for further inspection.
// TODO: this is an unusual error, perhaps we should send it to sentry
// TODO: compute will try to create timeline every second, we should add backoff
error!("failed to init new timeline {}: {}", ttid, e);
error!("failed to bootstrap timeline {}: {}", ttid, e);
// Timeline failed to init, it cannot be used. Remove it from the map.
// Timeline failed to bootstrap, it cannot be used. Remove it from the map.
TIMELINES_STATE.lock().unwrap().timelines.remove(&ttid);
return Err(e);
}
// We are done with bootstrap, release the lock, return the timeline.
// {} block forces release before .await
}
timeline.update_status_notify().await?;
timeline.wal_backup_launcher_tx.send(timeline.ttid).await?;
Ok(timeline)
}

View File

@@ -0,0 +1,76 @@
#! /usr/bin/env python3
# Script to generate ext_index.json metadata file
# that stores content of the control files and location of extension archives
# for all extensions in extensions subdir.
import argparse
import json
import subprocess
from pathlib import Path
"""
# ext_index.json example:
{
"public_extensions": [
"anon"
],
"library_index": {
"anon": "anon",
// for more complex extensions like postgis
// we might have something like:
// address_standardizer: postgis
// postgis_tiger: postgis
},
"extension_data": {
"anon": {
"control_data": {
"anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
},
"archive_path": "5648391853/v15/extensions/anon.tar.zst"
}
}
}
"""
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="generate ext_index.json")
parser.add_argument("pg_version", type=str, choices=["v14", "v15"], help="pg_version")
parser.add_argument("BUILD_TAG", type=str, help="BUILD_TAG for this compute image")
parser.add_argument("--public_extensions", type=str, help="list of public extensions")
args = parser.parse_args()
pg_version = args.pg_version
BUILD_TAG = args.BUILD_TAG
public_ext_list = args.public_extensions.split(",")
ext_index = {}
library_index = {}
EXT_PATH = Path("extensions")
for extension in EXT_PATH.iterdir():
if extension.is_dir():
control_data = {}
for control_file in extension.glob("*.control"):
if control_file.suffix != ".control":
continue
with open(control_file, "r") as f:
control_data[control_file.name] = f.read()
ext_index[extension.name] = {
"control_data": control_data,
"archive_path": f"{BUILD_TAG}/{pg_version}/extensions/{extension.name}.tar.zst",
}
elif extension.suffix == ".zst":
file_list = (
str(subprocess.check_output(["tar", "tf", str(extension)]), "utf-8")
.strip()
.split("\n")
)
for file in file_list:
if file.endswith(".so") and file.startswith("lib/"):
lib_name = file[4:-3]
library_index[lib_name] = extension.name.replace(".tar.zst", "")
all_data = {
"public_extensions": public_ext_list,
"library_index": library_index,
"extension_data": ext_index,
}
with open("ext_index.json", "w") as f:
json.dump(all_data, f)

View File

@@ -12,26 +12,25 @@ import psycopg2.extras
# We call the test "flaky" if it failed at least once on the main branch in the last N=10 days.
FLAKY_TESTS_QUERY = """
SELECT
DISTINCT parent_suite, suite, REGEXP_REPLACE(test, '(release|debug)-pg(\\d+)-?', '') as deparametrized_test
DISTINCT parent_suite, suite, test
FROM
(
SELECT
reference,
jsonb_array_elements(data -> 'children') ->> 'name' as parent_suite,
jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') ->> 'name' as suite,
jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'name' as test,
jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'status' as status,
jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'retriesStatusChange' as retries_status_change,
to_timestamp((jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' ->> 'start')::bigint / 1000)::date as timestamp
revision,
jsonb_array_elements(data -> 'children') -> 'name' as parent_suite,
jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'name' as suite,
jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'name' as test,
jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'status' as status,
jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'retriesStatusChange' as retries_status_change,
to_timestamp((jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' -> 'start')::bigint / 1000)::date as timestamp
FROM
regress_test_results
WHERE
reference = 'refs/heads/main'
) data
WHERE
timestamp > CURRENT_DATE - INTERVAL '%s' day
AND (
(status IN ('failed', 'broken') AND reference = 'refs/heads/main')
OR retries_status_change::boolean
)
AND (status::text IN ('"failed"', '"broken"') OR retries_status_change::boolean)
;
"""
@@ -41,9 +40,6 @@ def main(args: argparse.Namespace):
interval_days = args.days
output = args.output
build_type = args.build_type
pg_version = args.pg_version
res: DefaultDict[str, DefaultDict[str, Dict[str, bool]]]
res = defaultdict(lambda: defaultdict(dict))
@@ -59,21 +55,8 @@ def main(args: argparse.Namespace):
rows = []
for row in rows:
# We don't want to automatically rerun tests in a performance suite
if row["parent_suite"] != "test_runner.regress":
continue
deparametrized_test = row["deparametrized_test"]
dash_if_needed = "" if deparametrized_test.endswith("[]") else "-"
parametrized_test = deparametrized_test.replace(
"[",
f"[{build_type}-pg{pg_version}{dash_if_needed}",
)
res[row["parent_suite"]][row["suite"]][parametrized_test] = True
logging.info(
f"\t{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{parametrized_test}"
)
logging.info(f"\t{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{row['test']}")
res[row["parent_suite"]][row["suite"]][row["test"]] = True
logging.info(f"saving results to {output.name}")
json.dump(res, output, indent=2)
@@ -94,18 +77,6 @@ if __name__ == "__main__":
type=int,
help="how many days to look back for flaky tests (default: 10)",
)
parser.add_argument(
"--build-type",
required=True,
type=str,
help="for which build type to create list of flaky tests (debug or release)",
)
parser.add_argument(
"--pg-version",
required=True,
type=int,
help="for which Postgres version to create list of flaky tests (14, 15, etc.)",
)
parser.add_argument(
"connstr",
help="connection string to the test results database",

581
scripts/plumber.py Normal file
View File

@@ -0,0 +1,581 @@
import argparse
import asyncio
import enum
import json
import os
import pprint
import tempfile
from asyncio import subprocess
from datetime import date, datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Set
"""
This is the automation tool that was mostly helpful during our big aws account migration,
but may be helpful in other day to day tasks and concentrate knowledge about operations
that can help during on-call.
This script executes commands on remote using ssh multiplexing. See referenes:
https://blog.scottlowe.org/2015/12/11/using-ssh-multiplexing/
https://github.com/openssh-rust/openssh/blob/master/src/builder.rs
https://github.com/openssh-rust/openssh/blob/master/src/process_impl/session.rs
https://en.wikibooks.org/wiki/OpenSSH/Cookbook/Multiplexing
https://docs.rs/openssh/0.9.8/openssh/
For use with teleport you'll need to setup nsh script mentioned here:
https://github.com/neondatabase/cloud/wiki/Cloud%3A-access#3-access-the-nodes-with-ssm
"""
def show_line(output_label: Optional[str], line: str):
if output_label is not None:
print(f"({output_label})", line, end="")
else:
print(" ", line, end="")
if not line:
print()
async def exec_checked(
program: str,
args: List[str],
err_msg: Optional[str] = None,
output_label: Optional[str] = None,
show_output: bool = True,
expected_exit_codes=frozenset((0,)),
) -> List[str]:
if show_output:
print("+", program, *args)
proc = await subprocess.create_subprocess_exec(
program,
*args,
stdout=asyncio.subprocess.PIPE,
limit=10 << 20,
)
assert proc.stdout is not None
out = []
line = (await proc.stdout.readline()).decode()
if show_output:
show_line(output_label, line)
out.append(line)
while line:
line = (await proc.stdout.readline()).decode()
# empty line means eof, actual empty line from the program is represented by "\n"
if not line:
continue
if show_output:
show_line(output_label, line)
out.append(line)
exit_code = await proc.wait()
assert exit_code in expected_exit_codes, err_msg or f"{program} failed with {exit_code}"
return out
class Connection:
def __init__(
self,
tempdir: tempfile.TemporaryDirectory, # type: ignore
target: str,
):
self.tempdir = tempdir
self.target = target
def get_args(self, extra_args: List[str]):
ctl_path = os.path.join(self.tempdir.name, "master")
return ["-S", ctl_path, "-o", "BatchMode=yes", *extra_args, "none"]
async def check(self):
args = self.get_args(["-O", "check"])
await exec_checked("ssh", args, err_msg="master check operation failed")
async def spawn(self, cmd: str):
# https://github.com/openssh-rust/openssh/blob/cd8f174fafc530d8e55c2aa63add14a24cb2b94c/src/process_impl/session.rs#L72
local_args = self.get_args(["-T", "-p", "9"])
local_args.extend(["--", f"bash -c '{cmd}'"])
return await exec_checked(
"ssh", local_args, err_msg="spawn failed", output_label=self.target
)
async def close(self):
args = self.get_args(["-O", "exit"])
await exec_checked("ssh", args, err_msg="master exit operation failed")
async def connect(target: str) -> Connection:
"""
target is directly passed to ssh command
"""
# NOTE: it is mentioned that this setup is not secure
# For better security it should be placed somewhere in ~/.ssh
# or in other directory with proper permissions
# openssh-rust does it the same way
# https://github.com/openssh-rust/openssh/blob/master/src/builder.rs
connection_dir = tempfile.TemporaryDirectory(suffix=".ssh-multiplexed")
# "-E logfile"
await exec_checked(
"ssh",
[
"-S",
os.path.join(connection_dir.name, "master"),
"-M", # Places the ssh client into “master” mode for connection sharing.
"-f", # Requests ssh to go to background just before command execution.
"-N", # Do not execute a remote command. This is useful for just forwarding ports.
"-o",
"BatchMode=yes",
target,
],
err_msg="starting master process failed",
)
return Connection(tempdir=connection_dir, target=target)
class Timer:
def __init__(self, msg: str) -> None:
self.t0 = datetime.now()
self.msg = msg
def __enter__(self):
return None
def __exit__(self, *_):
print(self.msg, datetime.now() - self.t0)
def parse_date(s: str) -> date:
return datetime.strptime(s, "%Y-%m-%d").date()
def write_line(f, line: str):
f.write(line)
f.write("\n")
async def pageserver_tenant_sizes(
pageserver_target: str, tenants_of_interest: Optional[List[str]] = None
) -> Dict[str, int]:
"""
With ondemand it should rather look at physical size api
For old projects since we dont have eviction yet,
we can look at local fs state.
"""
if tenants_of_interest is not None:
tenants_of_interest = set(tenants_of_interest) # type: ignore
ps_connection = await connect(pageserver_target)
out = await ps_connection.spawn("du -sb /storage/pageserver/data/tenants/* | sort -rh")
tenants = {}
for line in out:
if line.startswith("du: cannot read directory"):
continue
size, tenant_path = map(str.strip, line.split())
tenant = Path(tenant_path).stem
if tenants_of_interest is not None:
if tenant not in tenants_of_interest:
continue
tenants[tenant] = int(size)
return tenants
async def fetch_ps_size(args):
if args.input is not None:
tenants = Path(args.input).read_text().splitlines()
else:
tenants = None
sizes = await pageserver_tenant_sizes(args.target, tenants_of_interest=tenants)
total = 0
for tenant, size in sorted(sizes.items(), key=lambda x: x[1], reverse=True):
total += size
print(tenant, size)
print("total", total)
@enum.unique
class Env(enum.Enum):
STAGING = "staging"
PRODUCTION = "production"
class ConsoleAdminShortcuts:
def __init__(self, env: Env, verbose: bool = False):
if env is Env.STAGING:
self.admin_base_url = "https://console.neon.tech/api/v1"
self.management_base_url = "http://console-staging.local:3440/management/api/v2"
elif env is Env.PRODUCTION:
self.admin_base_url = "https://console.neon.tech"
self.management_base_url = "http://console-release.local:3441/management/api/v2"
self.api_token = os.getenv("CONSOLE_ADMIN_API_TOKEN")
assert self.api_token, '"CONSOLE_ADMIN_API_TOKEN" is missing in env'
self.verbose = verbose
async def check_availability(self, project_id: str):
url = f"{self.admin_base_url}/admin/projects/{project_id}/check_availability"
output = await exec_checked(
"curl",
[
"--silent",
"--fail",
"-XPOST",
url,
"-H",
f"Authorization: Bearer {self.api_token}",
"-H",
"Accept: application/json",
],
show_output=self.verbose,
)
assert len(output) == 1 # output should be one line of json
return json.loads(output.pop())
async def get_operation(self, operation_id: str):
url = f"{self.admin_base_url}/admin/operations/{operation_id}"
output = await exec_checked(
"curl",
[
"--silent",
"--fail",
url,
"-H",
f"Authorization: Bearer {self.api_token}",
"-H",
"Accept: application/json",
],
show_output=self.verbose,
)
assert len(output) == 1 # output should be one line of json
return json.loads(output.pop())
async def get_pageservers(self):
url = f"{self.admin_base_url}/admin/pageservers"
output = await exec_checked(
"curl",
[
"--silent",
"--fail",
url,
"-H",
f"Authorization: Bearer {self.api_token}",
"-H",
"Accept: application/json",
],
show_output=self.verbose,
)
assert len(output) == 1 # output should be one line of json
return json.loads(output.pop())
async def set_maintenance(self, project_id: str, maintenance: bool) -> Dict[str, Any]:
"""
Example response:
{
"project": {
"id": "tight-wood-864662",
"maintenance_set_at": "2023-01-31T13:36:45.90346Z"
},
"operations": [
{
"id": "216142e0-fbb7-4f41-a470-e63408d4d6b4"
}
]
}
"""
url = f"{self.management_base_url}/projects/{project_id}/maintenance"
data = json.dumps({"maintenance": maintenance})
if not self.verbose:
args = ["--silent"]
else:
args = []
args.extend(
[
"--fail",
"-XPUT",
url,
"-H",
f"Authorization: Bearer {self.api_token}",
"-H",
"Accept: application/json",
"-d",
data,
]
)
output = await exec_checked(
"curl",
[],
show_output=self.verbose,
)
assert len(output) == 1 # output should be one line of json
ret = json.loads(output.pop())
assert isinstance(ret, Dict)
return ret
async def fetch_branches(self, project_id: str):
url = f"{self.admin_base_url}/admin/branches?project_id={project_id}"
output = await exec_checked(
"curl",
[
"--silent",
"--fail",
url,
"-H",
f"Authorization: Bearer {self.api_token}",
"-H",
"Accept: application/json",
],
show_output=self.verbose,
)
assert len(output) == 1 # output should be one line of json
return json.loads(output.pop())
async def poll_pending_ops(console: ConsoleAdminShortcuts, pending_ops: Set[str]):
finished = set() # needed because sets cannot be changed during iteration
for pending_op in pending_ops:
data = await console.get_operation(pending_op)
operation = data["operation"]
status = operation["status"]
if status == "failed":
print(f"ERROR: operation {pending_op} failed")
continue
if operation["failures_count"] != 0:
print(f"WARN: operation {pending_op} has failures != 0")
continue
if status == "finished":
print(f"operation {pending_op} finished")
finished.add(pending_op)
else:
print(f"operation {pending_op} is still pending: {status}")
pending_ops.difference_update(finished)
async def check_availability(args):
console = ConsoleAdminShortcuts(env=Env(args.env))
max_concurrent_checks = args.max_concurrent_checks
# reverse to keep the order because we will be popping from the end
projects: List[str] = list(reversed(Path(args.input).read_text().splitlines()))
print("n_projects", len(projects))
pending_ops: Set[str] = set()
while projects:
# walk through pending ops
if pending_ops:
print("pending", len(pending_ops), pending_ops)
await poll_pending_ops(console, pending_ops)
# schedule new ops if limit allows
while len(pending_ops) < max_concurrent_checks and len(projects) > 0:
project = projects.pop()
print("starting:", project, len(projects))
# there can be many operations, one for each endpoint
data = await console.check_availability(project)
for operation in data["operations"]:
pending_ops.add(operation["ID"])
# wait a bit before starting next one
await asyncio.sleep(2)
if projects:
# sleep a little bit to give operations time to finish
await asyncio.sleep(5)
print("all scheduled, poll pending", len(pending_ops), pending_ops, projects)
while pending_ops:
await poll_pending_ops(console, pending_ops)
await asyncio.sleep(5)
async def maintain(args):
console = ConsoleAdminShortcuts(env=Env(args.env))
finish_flag = args.finish
projects: List[str] = Path(args.input).read_text().splitlines()
print("n_projects", len(projects))
pending_ops: Set[str] = set()
for project in projects:
data = await console.set_maintenance(project, maintenance=not finish_flag)
print(project, len(data["operations"]))
for operation in data["operations"]:
pending_ops.add(operation["id"])
if finish_flag:
assert len(pending_ops) == 0
return
print("all scheduled, poll pending", len(pending_ops), pending_ops)
while pending_ops:
await poll_pending_ops(console, pending_ops)
print("n pending ops:", len(pending_ops))
if pending_ops:
await asyncio.sleep(5)
SOURCE_BUCKET = "zenith-storage-oregon"
AWS_REGION = "us-west-2"
SAFEKEEPER_SOURCE_PREFIX_IN_BUCKET = "prod-1/wal"
async def fetch_sk_s3_size(args):
tenants: List[str] = Path(args.input).read_text().splitlines()
total_objects = 0
total_size = 0
for tenant in tenants:
wal_prefix = f"s3://{SOURCE_BUCKET}/{SAFEKEEPER_SOURCE_PREFIX_IN_BUCKET}/{tenant}"
result = await exec_checked(
"aws",
[
"--profile",
"neon_main",
"s3",
"ls",
"--recursive",
"--summarize",
wal_prefix,
],
expected_exit_codes={0, 1},
show_output=False,
)
objects = int(result[-2].rsplit(maxsplit=1).pop())
total_objects += objects
size = int(result[-1].rsplit(maxsplit=1).pop())
total_size += size
print(tenant, "objects", objects, "size", size)
print("total_objects", total_objects, "total_size", total_size)
async def fetch_branches(args):
console = ConsoleAdminShortcuts(env=Env(args.env))
project_id = args.project_id
pprint.pprint(await console.fetch_branches(project_id=project_id))
async def get_pageservers(args):
console = ConsoleAdminShortcuts(env=Env(args.env))
pprint.pprint(await console.get_pageservers())
async def main():
parser = argparse.ArgumentParser("migrator")
sub = parser.add_subparsers(title="commands", dest="subparser_name")
split_parser = sub.add_parser(
"split",
)
split_parser.add_argument(
"--input",
help="CSV file with results from snowflake query mentioned in README.",
required=True,
)
split_parser.add_argument(
"--out",
help="Directory to store groups of projects. Directory name is pageserver id.",
required=True,
)
split_parser.add_argument(
"--last-usage-cutoff",
dest="last_usage_cutoff",
help="Projects which do not have compute time starting from passed date (e g 2022-12-01) wil be considered not used recently",
required=True,
)
split_parser.add_argument(
"--select-pageserver-id",
help="Filter input for this pageserver id",
required=True,
)
fetch_ps_size_parser = sub.add_parser("fetch-ps-size")
fetch_ps_size_parser.add_argument(
"--target",
help="Target pageserver host as resolvable by ssh",
required=True,
)
fetch_ps_size_parser.add_argument(
"--input",
help="File containing list of tenants to include",
)
check_availability_parser = sub.add_parser("check-availability")
check_availability_parser.add_argument(
"--input",
help="File containing list of projects to run availability checks for",
)
check_availability_parser.add_argument(
"--env", choices=["staging", "production"], default="staging"
)
check_availability_parser.add_argument(
"--max-concurrent-checks",
help="Max number of simultaneously active availability checks",
type=int,
default=50,
)
maintain_parser = sub.add_parser("maintain")
maintain_parser.add_argument(
"--input",
help="File containing list of projects",
)
maintain_parser.add_argument("--env", choices=["staging", "production"], default="staging")
maintain_parser.add_argument(
"--finish",
action="store_true",
)
fetch_sk_s3_size_parser = sub.add_parser("fetch-sk-s3-size")
fetch_sk_s3_size_parser.add_argument(
"--input",
help="File containing list of tenants",
)
fetch_branches_parser = sub.add_parser("fetch-branches")
fetch_branches_parser.add_argument("--project-id")
fetch_branches_parser.add_argument(
"--env", choices=["staging", "production"], default="staging"
)
get_pageservers_parser = sub.add_parser("get-pageservers")
get_pageservers_parser.add_argument(
"--env", choices=["staging", "production"], default="staging"
)
args = parser.parse_args()
handlers = {
"fetch-ps-size": fetch_ps_size,
"check-availability": check_availability,
"maintain": maintain,
"fetch-sk-s3-size": fetch_sk_s3_size,
"fetch-branches": fetch_branches,
"get-pageservers": get_pageservers,
}
handler = handlers.get(args.subparser_name)
if handler:
await handler(args)
else:
parser.print_help()
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -125,7 +125,6 @@ async fn publish(client: Option<BrokerClientChannel>, n_keys: u64) {
tenant_id: vec![0xFF; 16],
timeline_id: tli_from_u64(counter % n_keys),
}),
term: 0,
last_log_term: 0,
flush_lsn: counter,
commit_lsn: 2,
@@ -133,7 +132,6 @@ async fn publish(client: Option<BrokerClientChannel>, n_keys: u64) {
remote_consistent_lsn: 4,
peer_horizon_lsn: 5,
safekeeper_connstr: "zenith-1-sk-1.local:7676".to_owned(),
http_connstr: "zenith-1-sk-1.local:7677".to_owned(),
local_start_lsn: 0,
availability_zone: None,
};

View File

@@ -22,8 +22,6 @@ message SubscribeSafekeeperInfoRequest {
message SafekeeperTimelineInfo {
uint64 safekeeper_id = 1;
TenantTimelineId tenant_timeline_id = 2;
// Safekeeper term
uint64 term = 12;
// Term of the last entry.
uint64 last_log_term = 3;
// LSN of the last record.
@@ -38,8 +36,6 @@ message SafekeeperTimelineInfo {
uint64 local_start_lsn = 9;
// A connection string to use for WAL receiving.
string safekeeper_connstr = 10;
// HTTP endpoint connection string
string http_connstr = 13;
// Availability zone of a safekeeper.
optional string availability_zone = 11;
}

View File

@@ -519,7 +519,6 @@ mod tests {
tenant_id: vec![0x00; 16],
timeline_id,
}),
term: 0,
last_log_term: 0,
flush_lsn: 1,
commit_lsn: 2,
@@ -527,7 +526,6 @@ mod tests {
remote_consistent_lsn: 4,
peer_horizon_lsn: 5,
safekeeper_connstr: "neon-1-sk-1.local:7676".to_owned(),
http_connstr: "neon-1-sk-1.local:7677".to_owned(),
local_start_lsn: 0,
availability_zone: None,
}

View File

@@ -70,7 +70,6 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
"pageserver_getpage_reconstruct_seconds_count",
"pageserver_getpage_reconstruct_seconds_sum",
*[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]],
*histogram("pageserver_smgr_query_seconds_global"),
*histogram("pageserver_read_num_fs_layers"),
*histogram("pageserver_getpage_get_reconstruct_data_seconds"),
*histogram("pageserver_wait_lsn_seconds"),

View File

@@ -427,7 +427,6 @@ class NeonEnvBuilder:
default_branch_name: str = DEFAULT_BRANCH_NAME,
preserve_database_files: bool = False,
initial_tenant: Optional[TenantId] = None,
initial_timeline: Optional[TimelineId] = None,
):
self.repo_dir = repo_dir
self.rust_log_override = rust_log_override
@@ -453,7 +452,6 @@ class NeonEnvBuilder:
self.pg_version = pg_version
self.preserve_database_files = preserve_database_files
self.initial_tenant = initial_tenant or TenantId.generate()
self.initial_timeline = initial_timeline or TimelineId.generate()
def init_configs(self) -> NeonEnv:
# Cannot create more than one environment from one builder
@@ -475,10 +473,9 @@ class NeonEnvBuilder:
f"Services started, creating initial tenant {env.initial_tenant} and its initial timeline"
)
initial_tenant, initial_timeline = env.neon_cli.create_tenant(
tenant_id=env.initial_tenant, conf=initial_tenant_conf, timeline_id=env.initial_timeline
tenant_id=env.initial_tenant, conf=initial_tenant_conf
)
assert env.initial_tenant == initial_tenant
assert env.initial_timeline == initial_timeline
env.initial_timeline = initial_timeline
log.info(f"Initial timeline {initial_tenant}/{initial_timeline} created successfully")
return env
@@ -787,7 +784,7 @@ class NeonEnv:
# generate initial tenant ID here instead of letting 'neon init' generate it,
# so that we don't need to dig it out of the config file afterwards.
self.initial_tenant = config.initial_tenant
self.initial_timeline = config.initial_timeline
self.initial_timeline: Optional[TimelineId] = None
# Create a config file corresponding to the options
toml = textwrap.dedent(

View File

@@ -233,19 +233,10 @@ if TYPE_CHECKING:
def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None):
response = list_prefix(neon_env_builder, prefix)
keys = response["KeyCount"]
objects = response.get("Contents", [])
if keys != 0 and len(objects) == 0:
# this has been seen in one case with mock_s3:
# https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4938/6000769714/index.html#suites/3556ed71f2d69272a7014df6dcb02317/ca01e4f4d8d9a11f
# looking at moto impl, it might be there's a race with common prefix (sub directory) not going away with deletes
common_prefixes = response.get("CommonPrefixes", [])
log.warn(
f"contradicting ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}"
)
assert keys == 0, f"remote dir with prefix {prefix} is not empty after deletion: {objects}"
objects = response.get("Contents")
assert (
response["KeyCount"] == 0
), f"remote dir with prefix {prefix} is not empty after deletion: {objects}"
def assert_prefix_not_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None):
@@ -324,4 +315,4 @@ MANY_SMALL_LAYERS_TENANT_CONFIG = {
def poll_for_remote_storage_iterations(remote_storage_kind: RemoteStorageKind) -> int:
return 40 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 15
return 40 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 10

View File

@@ -1,3 +1,4 @@
import shutil
import time
from dataclasses import dataclass
from typing import Dict, Tuple
@@ -13,7 +14,7 @@ from fixtures.neon_fixtures import (
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.utils import wait_for_upload_queue_empty
from fixtures.remote_storage import RemoteStorageKind
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import wait_until
@@ -137,14 +138,22 @@ def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> Ev
neon_env_builder.enable_remote_storage(RemoteStorageKind.LOCAL_FS, f"{request.node.name}")
# initial tenant will not be present on this pageserver
env = neon_env_builder.init_configs()
env.start()
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()
# allow because we are invoking this manually; we always warn on executing disk based eviction
env.pageserver.allowed_errors.append(r".* running disk usage based eviction due to pressure.*")
# remove the initial tenant
assert env.initial_timeline
pageserver_http.tenant_detach(env.initial_tenant)
assert isinstance(env.remote_storage, LocalFsStorage)
tenant_remote_storage = env.remote_storage.root / "tenants" / str(env.initial_tenant)
assert tenant_remote_storage.is_dir()
shutil.rmtree(tenant_remote_storage)
env.initial_tenant = TenantId("0" * 32)
env.initial_timeline = None
# Choose small layer_size so that we can use low pgbench_scales and still get a large count of layers.
# Large count of layers and small layer size is good for testing because it makes evictions predictable.
# Predictable in the sense that many layer evictions will be required to reach the eviction target, because

View File

@@ -11,7 +11,8 @@ from fixtures.neon_fixtures import (
wait_for_last_flush_lsn,
)
from fixtures.remote_storage import RemoteStorageKind
from fixtures.types import TimelineId
from fixtures.types import TenantId, TimelineId
from fixtures.utils import query_scalar
# Test configuration
#
@@ -70,11 +71,13 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder):
# Disable pitr, because here we want to test branch creation after GC
neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
env = neon_env_builder.init_start()
timeline = env.neon_cli.create_branch("test_gc_aggressive", "main")
env.neon_cli.create_branch("test_gc_aggressive", "main")
endpoint = env.endpoints.create_start("test_gc_aggressive")
log.info("postgres is running on test_gc_aggressive branch")
with endpoint.cursor() as cur:
timeline = TimelineId(query_scalar(cur, "SHOW neon.timeline_id"))
# Create table, and insert the first 100 rows
cur.execute("CREATE TABLE foo (id int, counter int, t text)")
cur.execute(
@@ -106,8 +109,7 @@ def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind:
)
env = neon_env_builder.init_start()
tenant_id = env.initial_tenant
timeline_id = env.neon_cli.create_branch("test_gc_index_upload", "main")
env.neon_cli.create_branch("test_gc_index_upload", "main")
endpoint = env.endpoints.create_start("test_gc_index_upload")
pageserver_http = env.pageserver.http_client()
@@ -115,6 +117,9 @@ def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind:
pg_conn = endpoint.connect()
cur = pg_conn.cursor()
tenant_id = TenantId(query_scalar(cur, "SHOW neon.tenant_id"))
timeline_id = TimelineId(query_scalar(cur, "SHOW neon.timeline_id"))
cur.execute("CREATE TABLE foo (id int, counter int, t text)")
cur.execute(
"""

View File

@@ -12,8 +12,13 @@ from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
# test anyway, so it doesn't need any special attention here.
@pytest.mark.timeout(600)
def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
env = neon_env_builder.init_start(
initial_tenant_conf={
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()
# Use aggressive GC and checkpoint settings, so that we also exercise GC during the test
tenant_id, _ = env.neon_cli.create_tenant(
conf={
"gc_period": "10 s",
"gc_horizon": f"{1024 ** 2}",
"checkpoint_distance": f"{1024 ** 2}",
@@ -24,11 +29,6 @@ def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
"image_creation_threshold": "2",
}
)
pageserver_http = env.pageserver.http_client()
# Use aggressive GC and checkpoint settings, so that we also exercise GC during the test
tenant_id = env.initial_tenant
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
connstr = endpoint.connstr(options="-csynchronous_commit=off")
pg_bin.run_capture(["pgbench", "-i", "-s10", connstr])
@@ -39,4 +39,5 @@ def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
with pytest.raises(subprocess.SubprocessError):
pg_bin.run_capture(["pgbench", "-P1", "-N", "-c5", "-T500", "-Mprepared", connstr])
env.pageserver.stop()
env.pageserver.start(extra_env_vars={"FAILPOINTS": "after-timeline-gc-removed-layers=exit"})
env.pageserver.start()
pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit"))

View File

@@ -74,9 +74,9 @@ def test_large_schema(neon_env_builder: NeonEnvBuilder):
cur.execute("select * from pg_depend order by refclassid, refobjid, refobjsubid")
# Check layer file sizes
timeline_path = "{}/tenants/{}/timelines/{}/".format(
env.repo_dir, env.initial_tenant, env.initial_timeline
)
tenant_id = endpoint.safe_psql("show neon.tenant_id")[0][0]
timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0]
timeline_path = "{}/tenants/{}/timelines/{}/".format(env.repo_dir, tenant_id, timeline_id)
for filename in os.listdir(timeline_path):
if filename.startswith("00000"):
log.info(f"layer {filename} size is {os.path.getsize(timeline_path + filename)}")

View File

@@ -8,7 +8,7 @@ from fixtures.neon_fixtures import (
)
from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
from fixtures.remote_storage import RemoteStorageKind
from fixtures.types import Lsn
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import query_scalar
@@ -34,8 +34,8 @@ def test_basic_eviction(
client = env.pageserver.http_client()
endpoint = env.endpoints.create_start("main")
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
# Create a number of layers in the tenant
with endpoint.cursor() as cur:

View File

@@ -18,7 +18,8 @@ from fixtures.neon_fixtures import (
)
from fixtures.port_distributor import PortDistributor
from fixtures.remote_storage import RemoteStorageKind
from fixtures.types import TenantId
from fixtures.types import TenantId, TimelineId
from fixtures.utils import query_scalar
from pytest_httpserver import HTTPServer
from werkzeug.wrappers.request import Request
from werkzeug.wrappers.response import Response
@@ -114,13 +115,15 @@ def test_metric_collection(
# Order of fixtures shutdown is not specified, and if http server gets down
# before pageserver, pageserver log might contain such errors in the end.
env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*")
tenant_id = env.initial_tenant
timeline_id = env.neon_cli.create_branch("test_metric_collection")
env.neon_cli.create_branch("test_metric_collection")
endpoint = env.endpoints.create_start("test_metric_collection")
pg_conn = endpoint.connect()
cur = pg_conn.cursor()
tenant_id = TenantId(query_scalar(cur, "SHOW neon.tenant_id"))
timeline_id = TimelineId(query_scalar(cur, "SHOW neon.timeline_id"))
cur.execute("CREATE TABLE foo (id int, counter int, t text)")
cur.execute(
"""

View File

@@ -78,8 +78,8 @@ def test_ondemand_download_large_rel(
client = env.pageserver.http_client()
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
tenant_id = endpoint.safe_psql("show neon.tenant_id")[0][0]
timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0]
# We want to make sure that the data is large enough that the keyspace is partitioned.
num_rows = 1000000
@@ -183,8 +183,8 @@ def test_ondemand_download_timetravel(
client = env.pageserver.http_client()
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
tenant_id = endpoint.safe_psql("show neon.tenant_id")[0][0]
timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0]
lsns = []
@@ -342,8 +342,8 @@ def test_download_remote_layers_api(
client = env.pageserver.http_client()
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
tenant_id = endpoint.safe_psql("show neon.tenant_id")[0][0]
timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0]
table_len = 10000
with endpoint.cursor() as cur:
@@ -369,7 +369,7 @@ def test_download_remote_layers_api(
filled_current_physical = get_api_current_physical_size()
log.info(filled_current_physical)
filled_size = get_resident_physical_size()
log.info(f"filled_size: {filled_size}")
log.info(filled_size)
assert filled_current_physical == filled_size, "we don't yet do layer eviction"
env.pageserver.stop()
@@ -377,7 +377,7 @@ def test_download_remote_layers_api(
# remove all the layer files
# XXX only delete some of the layer files, to show that it really just downloads all the layers
for layer in (Path(env.repo_dir) / "tenants").glob("*/timelines/*/*-*_*"):
log.info(f"unlinking layer {layer.name}")
log.info(f"unlinking layer {layer}")
layer.unlink()
# Shut down safekeepers before starting the pageserver.
@@ -403,7 +403,7 @@ def test_download_remote_layers_api(
filled_current_physical == get_api_current_physical_size()
), "current_physical_size is sum of loaded layer sizes, independent of whether local or remote"
post_unlink_size = get_resident_physical_size()
log.info(f"post_unlink_size: {post_unlink_size}")
log.info(post_unlink_size)
assert (
post_unlink_size < filled_size
), "we just deleted layers and didn't cause anything to re-download them yet"
@@ -516,6 +516,7 @@ def test_compaction_downloads_on_demand_without_image_creation(
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
assert timeline_id is not None
with env.endpoints.create_start("main") as endpoint:
# no particular reason to create the layers like this, but we are sure
@@ -589,6 +590,7 @@ def test_compaction_downloads_on_demand_with_image_creation(
env = neon_env_builder.init_start(initial_tenant_conf=stringify(conf))
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
assert timeline_id is not None
pageserver_http = env.pageserver.http_client()

View File

@@ -2,7 +2,7 @@ from contextlib import closing
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.pageserver.utils import wait_for_last_record_lsn
from fixtures.types import Lsn
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import query_scalar
@@ -12,21 +12,24 @@ from fixtures.utils import query_scalar
# Additionally, tests that pageserver is able to create tenants with custom configs.
def test_read_request_tracing(neon_env_builder: NeonEnvBuilder):
neon_env_builder.num_safekeepers = 1
env = neon_env_builder.init_start(
initial_tenant_conf={
env = neon_env_builder.init_start()
tenant, _ = env.neon_cli.create_tenant(
conf={
"trace_read_requests": "true",
}
)
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
endpoint = env.endpoints.create_start("main")
timeline = env.neon_cli.create_timeline("test_trace_replay", tenant_id=tenant)
endpoint = env.endpoints.create_start("test_trace_replay", "main", tenant)
with closing(endpoint.connect()) as conn:
with conn.cursor() as cur:
cur.execute("create table t (i integer);")
cur.execute(f"insert into t values (generate_series(1,{10000}));")
cur.execute("select count(*) from t;")
tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
# wait until pageserver receives that data
pageserver_http = env.pageserver.http_client()
@@ -35,5 +38,5 @@ def test_read_request_tracing(neon_env_builder: NeonEnvBuilder):
# Stop postgres so we drop the connection and flush the traces
endpoint.stop()
trace_path = env.repo_dir / "traces" / str(tenant_id) / str(timeline_id)
trace_path = env.repo_dir / "traces" / str(tenant) / str(timeline)
assert trace_path.exists()

View File

@@ -95,12 +95,12 @@ def test_remote_storage_backup_and_restore(
client = env.pageserver.http_client()
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
# Thats because of UnreliableWrapper's injected failures
env.pageserver.allowed_errors.append(
f".*failed to fetch tenant deletion mark at tenants/{tenant_id}/deleted attempt 1.*"
f".*failed to fetch tenant deletion mark at tenants/({tenant_id}|{env.initial_tenant})/deleted attempt 1.*"
)
checkpoint_numbers = range(1, 3)
@@ -403,7 +403,8 @@ def test_remote_timeline_client_calls_started_metric(
)
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
assert env.initial_timeline is not None
timeline_id: TimelineId = env.initial_timeline
client = env.pageserver.http_client()
@@ -541,7 +542,8 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
}
)
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
assert env.initial_timeline is not None
timeline_id: TimelineId = env.initial_timeline
timeline_path = env.timeline_dir(tenant_id, timeline_id)
@@ -806,7 +808,8 @@ def test_compaction_delete_before_upload(
)
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
assert env.initial_timeline is not None
timeline_id: TimelineId = env.initial_timeline
client = env.pageserver.http_client()

View File

@@ -48,11 +48,6 @@ def test_tenant_delete_smoke(
env = neon_env_builder.init_start()
# lucky race with stopping from flushing a layer we fail to schedule any uploads
env.pageserver.allowed_errors.append(
".*layer flush task.+: could not flush frozen layer: update_metadata_file"
)
ps_http = env.pageserver.http_client()
# first try to delete non existing tenant
@@ -292,8 +287,9 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
)
# TODO resume deletion (https://github.com/neondatabase/neon/issues/5006)
@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
def test_tenant_delete_is_resumed_on_attach(
def test_deleted_tenant_ignored_on_attach(
neon_env_builder: NeonEnvBuilder,
remote_storage_kind: RemoteStorageKind,
pg_bin: PgBin,
@@ -335,8 +331,6 @@ def test_tenant_delete_is_resumed_on_attach(
(
# allow errors caused by failpoints
f".*failpoint: {failpoint}",
# From deletion polling
f".*NotFound: tenant {env.initial_tenant}.*",
# It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
# error from http response is also logged
@@ -382,17 +376,20 @@ def test_tenant_delete_is_resumed_on_attach(
env.pageserver.start()
# now we call attach
ps_http.tenant_attach(tenant_id=tenant_id)
with pytest.raises(
PageserverApiException, match="Tenant is marked as deleted on remote storage"
):
ps_http.tenant_attach(tenant_id=tenant_id)
# delete should be resumed
wait_tenant_status_404(ps_http, tenant_id, iterations)
# delete should be resumed (not yet)
# wait_tenant_status_404(ps_http, tenant_id, iterations)
# we shouldn've created tenant dir on disk
tenant_path = env.tenant_dir(tenant_id=tenant_id)
assert not tenant_path.exists()
if remote_storage_kind in available_s3_storages():
assert_prefix_empty(
assert_prefix_not_empty(
neon_env_builder,
prefix="/".join(
(

View File

@@ -463,8 +463,8 @@ def test_detach_while_attaching(
client = env.pageserver.http_client()
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
# Attempts to connect from compute to pageserver while the tenant is
# temporarily detached produces these errors in the pageserver log.
@@ -615,8 +615,8 @@ def test_ignored_tenant_download_missing_layers(
pageserver_http = env.pageserver.http_client()
endpoint = env.endpoints.create_start("main")
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
# Attempts to connect from compute to pageserver while the tenant is
# temporarily detached produces these errors in the pageserver log.
@@ -679,10 +679,10 @@ def test_ignored_tenant_stays_broken_without_metadata(
)
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()
env.endpoints.create_start("main")
endpoint = env.endpoints.create_start("main")
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
# Attempts to connect from compute to pageserver while the tenant is
# temporarily detached produces these errors in the pageserver log.
@@ -723,9 +723,9 @@ def test_load_attach_negatives(
)
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()
env.endpoints.create_start("main")
endpoint = env.endpoints.create_start("main")
tenant_id = env.initial_tenant
tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
# Attempts to connect from compute to pageserver while the tenant is
# temporarily detached produces these errors in the pageserver log.
@@ -773,8 +773,8 @@ def test_ignore_while_attaching(
pageserver_http = env.pageserver.http_client()
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
# Attempts to connect from compute to pageserver while the tenant is
# temporarily detached produces these errors in the pageserver log.

View File

@@ -142,8 +142,8 @@ def test_tenants_attached_after_download(
client = env.pageserver.http_client()
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
# Thats because of UnreliableWrapper's injected failures
env.pageserver.allowed_errors.append(
@@ -252,8 +252,8 @@ def test_tenant_redownloads_truncated_file_on_startup(
pageserver_http = env.pageserver.http_client()
endpoint = env.endpoints.create_start("main")
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
with endpoint.cursor() as cur:
cur.execute("CREATE TABLE t1 AS VALUES (123, 'foobar');")

Some files were not shown because too many files have changed in this diff Show More